From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 02:47:55 +0200 Subject: Adding upstream version 124.0.1. Signed-off-by: Daniel Baumann --- third_party/aom/av1/common/alloccommon.c | 506 + third_party/aom/av1/common/alloccommon.h | 65 + third_party/aom/av1/common/arm/av1_inv_txfm_neon.c | 4217 ++++++ third_party/aom/av1/common/arm/av1_inv_txfm_neon.h | 154 + third_party/aom/av1/common/arm/av1_txfm_neon.c | 30 + .../aom/av1/common/arm/blend_a64_hmask_neon.c | 102 + .../aom/av1/common/arm/blend_a64_vmask_neon.c | 112 + third_party/aom/av1/common/arm/cdef_block_neon.c | 1355 ++ third_party/aom/av1/common/arm/cfl_neon.c | 589 + .../aom/av1/common/arm/compound_convolve_neon.c | 2719 ++++ .../aom/av1/common/arm/compound_convolve_neon.h | 1164 ++ .../common/arm/compound_convolve_neon_dotprod.c | 675 + .../av1/common/arm/compound_convolve_neon_i8mm.c | 614 + third_party/aom/av1/common/arm/convolve_neon.c | 1659 +++ third_party/aom/av1/common/arm/convolve_neon.h | 538 + .../aom/av1/common/arm/convolve_neon_dotprod.c | 793 ++ .../aom/av1/common/arm/convolve_neon_i8mm.c | 702 + .../av1/common/arm/highbd_compound_convolve_neon.c | 2031 +++ .../av1/common/arm/highbd_convolve_horiz_rs_neon.c | 273 + .../aom/av1/common/arm/highbd_convolve_neon.c | 2120 +++ .../aom/av1/common/arm/highbd_convolve_neon.h | 148 + .../av1/common/arm/highbd_convolve_scale_neon.c | 552 + .../aom/av1/common/arm/highbd_inv_txfm_neon.c | 5994 +++++++++ .../aom/av1/common/arm/highbd_reconinter_neon.c | 327 + .../aom/av1/common/arm/highbd_reconintra_neon.c | 241 + .../aom/av1/common/arm/highbd_warp_plane_neon.c | 317 + .../aom/av1/common/arm/highbd_warp_plane_neon.h | 424 + .../av1/common/arm/highbd_wiener_convolve_neon.c | 403 + third_party/aom/av1/common/arm/reconinter_neon.c | 217 + third_party/aom/av1/common/arm/reconintra_neon.c | 392 + third_party/aom/av1/common/arm/resize_neon.c | 1178 ++ third_party/aom/av1/common/arm/selfguided_neon.c 
| 1595 +++ third_party/aom/av1/common/arm/warp_plane_neon.c | 276 + third_party/aom/av1/common/arm/warp_plane_neon.h | 367 + .../aom/av1/common/arm/warp_plane_neon_i8mm.c | 291 + third_party/aom/av1/common/arm/warp_plane_sve.c | 284 + .../aom/av1/common/arm/wiener_convolve_neon.c | 348 + third_party/aom/av1/common/av1_common_int.h | 1882 +++ third_party/aom/av1/common/av1_inv_txfm1d.c | 1841 +++ third_party/aom/av1/common/av1_inv_txfm1d.h | 61 + third_party/aom/av1/common/av1_inv_txfm1d_cfg.h | 45 + third_party/aom/av1/common/av1_inv_txfm2d.c | 484 + third_party/aom/av1/common/av1_loopfilter.c | 2099 +++ third_party/aom/av1/common/av1_loopfilter.h | 150 + third_party/aom/av1/common/av1_rtcd.c | 18 + third_party/aom/av1/common/av1_rtcd_defs.pl | 655 + third_party/aom/av1/common/av1_txfm.c | 278 + third_party/aom/av1/common/av1_txfm.h | 256 + third_party/aom/av1/common/blockd.c | 100 + third_party/aom/av1/common/blockd.h | 1612 +++ third_party/aom/av1/common/cdef.c | 466 + third_party/aom/av1/common/cdef.h | 112 + third_party/aom/av1/common/cdef_block.c | 426 + third_party/aom/av1/common/cdef_block.h | 65 + third_party/aom/av1/common/cdef_block_simd.h | 844 ++ third_party/aom/av1/common/cfl.c | 434 + third_party/aom/av1/common/cfl.h | 294 + third_party/aom/av1/common/common.h | 61 + third_party/aom/av1/common/common_data.c | 43 + third_party/aom/av1/common/common_data.h | 432 + third_party/aom/av1/common/convolve.c | 1508 +++ third_party/aom/av1/common/convolve.h | 132 + third_party/aom/av1/common/debugmodes.c | 113 + third_party/aom/av1/common/entropy.c | 178 + third_party/aom/av1/common/entropy.h | 182 + third_party/aom/av1/common/entropymode.c | 1094 ++ third_party/aom/av1/common/entropymode.h | 218 + third_party/aom/av1/common/entropymv.c | 67 + third_party/aom/av1/common/entropymv.h | 104 + third_party/aom/av1/common/enums.h | 651 + third_party/aom/av1/common/filter.h | 320 + third_party/aom/av1/common/frame_buffers.c | 98 + 
third_party/aom/av1/common/frame_buffers.h | 60 + third_party/aom/av1/common/idct.c | 322 + third_party/aom/av1/common/idct.h | 51 + third_party/aom/av1/common/mv.h | 337 + third_party/aom/av1/common/mvref_common.c | 1501 +++ third_party/aom/av1/common/mvref_common.h | 342 + third_party/aom/av1/common/obmc.h | 89 + third_party/aom/av1/common/obu_util.c | 133 + third_party/aom/av1/common/obu_util.h | 47 + third_party/aom/av1/common/ppc/cfl_ppc.c | 152 + third_party/aom/av1/common/pred_common.c | 501 + third_party/aom/av1/common/pred_common.h | 377 + third_party/aom/av1/common/quant_common.c | 12876 +++++++++++++++++++ third_party/aom/av1/common/quant_common.h | 84 + third_party/aom/av1/common/reconinter.c | 1169 ++ third_party/aom/av1/common/reconinter.h | 489 + third_party/aom/av1/common/reconinter_template.inc | 267 + third_party/aom/av1/common/reconintra.c | 1798 +++ third_party/aom/av1/common/reconintra.h | 158 + third_party/aom/av1/common/resize.c | 1452 +++ third_party/aom/av1/common/resize.h | 146 + third_party/aom/av1/common/restoration.c | 1494 +++ third_party/aom/av1/common/restoration.h | 471 + third_party/aom/av1/common/scale.c | 57 + third_party/aom/av1/common/scale.h | 87 + third_party/aom/av1/common/scan.c | 2038 +++ third_party/aom/av1/common/scan.h | 54 + third_party/aom/av1/common/seg_common.c | 91 + third_party/aom/av1/common/seg_common.h | 113 + third_party/aom/av1/common/thread_common.c | 1250 ++ third_party/aom/av1/common/thread_common.h | 345 + third_party/aom/av1/common/tile_common.c | 249 + third_party/aom/av1/common/tile_common.h | 75 + third_party/aom/av1/common/timing.c | 92 + third_party/aom/av1/common/timing.h | 55 + third_party/aom/av1/common/token_cdfs.h | 3555 +++++ third_party/aom/av1/common/txb_common.c | 364 + third_party/aom/av1/common/txb_common.h | 463 + third_party/aom/av1/common/warped_motion.c | 918 ++ third_party/aom/av1/common/warped_motion.h | 97 + .../av1/common/x86/av1_convolve_horiz_rs_sse4.c | 228 + 
.../aom/av1/common/x86/av1_convolve_scale_sse4.c | 498 + third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | 2254 ++++ third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h | 71 + .../aom/av1/common/x86/av1_inv_txfm_ssse3.c | 2904 +++++ .../aom/av1/common/x86/av1_inv_txfm_ssse3.h | 247 + third_party/aom/av1/common/x86/av1_txfm_sse2.h | 321 + third_party/aom/av1/common/x86/av1_txfm_sse4.c | 22 + third_party/aom/av1/common/x86/av1_txfm_sse4.h | 72 + third_party/aom/av1/common/x86/cdef_block_avx2.c | 357 + third_party/aom/av1/common/x86/cdef_block_sse2.c | 40 + third_party/aom/av1/common/x86/cdef_block_sse4.c | 40 + third_party/aom/av1/common/x86/cdef_block_ssse3.c | 40 + third_party/aom/av1/common/x86/cfl_avx2.c | 495 + third_party/aom/av1/common/x86/cfl_simd.h | 246 + third_party/aom/av1/common/x86/cfl_sse2.c | 89 + third_party/aom/av1/common/x86/cfl_ssse3.c | 397 + third_party/aom/av1/common/x86/convolve_2d_avx2.c | 161 + third_party/aom/av1/common/x86/convolve_2d_sse2.c | 547 + third_party/aom/av1/common/x86/convolve_avx2.c | 916 ++ third_party/aom/av1/common/x86/convolve_sse2.c | 500 + third_party/aom/av1/common/x86/filterintra_sse4.c | 350 + .../aom/av1/common/x86/highbd_convolve_2d_avx2.c | 200 + .../aom/av1/common/x86/highbd_convolve_2d_sse4.c | 421 + .../aom/av1/common/x86/highbd_convolve_2d_ssse3.c | 414 + .../aom/av1/common/x86/highbd_inv_txfm_avx2.c | 4239 ++++++ .../aom/av1/common/x86/highbd_inv_txfm_sse4.c | 5830 +++++++++ .../aom/av1/common/x86/highbd_jnt_convolve_avx2.c | 849 ++ .../aom/av1/common/x86/highbd_jnt_convolve_sse4.c | 381 + .../aom/av1/common/x86/highbd_txfm_utility_sse4.h | 132 + .../aom/av1/common/x86/highbd_warp_affine_avx2.c | 656 + .../aom/av1/common/x86/highbd_warp_plane_sse4.c | 636 + .../av1/common/x86/highbd_wiener_convolve_avx2.c | 245 + .../av1/common/x86/highbd_wiener_convolve_ssse3.c | 202 + third_party/aom/av1/common/x86/intra_edge_sse4.c | 322 + third_party/aom/av1/common/x86/jnt_convolve_avx2.c | 1124 ++ 
third_party/aom/av1/common/x86/jnt_convolve_sse2.c | 606 + .../aom/av1/common/x86/jnt_convolve_ssse3.c | 230 + third_party/aom/av1/common/x86/reconinter_avx2.c | 624 + third_party/aom/av1/common/x86/reconinter_sse4.c | 154 + third_party/aom/av1/common/x86/reconinter_ssse3.c | 120 + third_party/aom/av1/common/x86/resize_ssse3.c | 974 ++ third_party/aom/av1/common/x86/selfguided_avx2.c | 724 ++ third_party/aom/av1/common/x86/selfguided_sse4.c | 662 + third_party/aom/av1/common/x86/warp_plane_avx2.c | 1210 ++ third_party/aom/av1/common/x86/warp_plane_sse4.c | 908 ++ .../aom/av1/common/x86/wiener_convolve_avx2.c | 242 + .../aom/av1/common/x86/wiener_convolve_sse2.c | 199 + 160 files changed, 118413 insertions(+) create mode 100644 third_party/aom/av1/common/alloccommon.c create mode 100644 third_party/aom/av1/common/alloccommon.h create mode 100644 third_party/aom/av1/common/arm/av1_inv_txfm_neon.c create mode 100644 third_party/aom/av1/common/arm/av1_inv_txfm_neon.h create mode 100644 third_party/aom/av1/common/arm/av1_txfm_neon.c create mode 100644 third_party/aom/av1/common/arm/blend_a64_hmask_neon.c create mode 100644 third_party/aom/av1/common/arm/blend_a64_vmask_neon.c create mode 100644 third_party/aom/av1/common/arm/cdef_block_neon.c create mode 100644 third_party/aom/av1/common/arm/cfl_neon.c create mode 100644 third_party/aom/av1/common/arm/compound_convolve_neon.c create mode 100644 third_party/aom/av1/common/arm/compound_convolve_neon.h create mode 100644 third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c create mode 100644 third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c create mode 100644 third_party/aom/av1/common/arm/convolve_neon.c create mode 100644 third_party/aom/av1/common/arm/convolve_neon.h create mode 100644 third_party/aom/av1/common/arm/convolve_neon_dotprod.c create mode 100644 third_party/aom/av1/common/arm/convolve_neon_i8mm.c create mode 100644 third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c create 
mode 100644 third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_convolve_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_convolve_neon.h create mode 100644 third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_reconinter_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_reconintra_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_warp_plane_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_warp_plane_neon.h create mode 100644 third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c create mode 100644 third_party/aom/av1/common/arm/reconinter_neon.c create mode 100644 third_party/aom/av1/common/arm/reconintra_neon.c create mode 100644 third_party/aom/av1/common/arm/resize_neon.c create mode 100644 third_party/aom/av1/common/arm/selfguided_neon.c create mode 100644 third_party/aom/av1/common/arm/warp_plane_neon.c create mode 100644 third_party/aom/av1/common/arm/warp_plane_neon.h create mode 100644 third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c create mode 100644 third_party/aom/av1/common/arm/warp_plane_sve.c create mode 100644 third_party/aom/av1/common/arm/wiener_convolve_neon.c create mode 100644 third_party/aom/av1/common/av1_common_int.h create mode 100644 third_party/aom/av1/common/av1_inv_txfm1d.c create mode 100644 third_party/aom/av1/common/av1_inv_txfm1d.h create mode 100644 third_party/aom/av1/common/av1_inv_txfm1d_cfg.h create mode 100644 third_party/aom/av1/common/av1_inv_txfm2d.c create mode 100644 third_party/aom/av1/common/av1_loopfilter.c create mode 100644 third_party/aom/av1/common/av1_loopfilter.h create mode 100644 third_party/aom/av1/common/av1_rtcd.c create mode 100644 third_party/aom/av1/common/av1_rtcd_defs.pl create mode 100644 third_party/aom/av1/common/av1_txfm.c create 
mode 100644 third_party/aom/av1/common/av1_txfm.h create mode 100644 third_party/aom/av1/common/blockd.c create mode 100644 third_party/aom/av1/common/blockd.h create mode 100644 third_party/aom/av1/common/cdef.c create mode 100644 third_party/aom/av1/common/cdef.h create mode 100644 third_party/aom/av1/common/cdef_block.c create mode 100644 third_party/aom/av1/common/cdef_block.h create mode 100644 third_party/aom/av1/common/cdef_block_simd.h create mode 100644 third_party/aom/av1/common/cfl.c create mode 100644 third_party/aom/av1/common/cfl.h create mode 100644 third_party/aom/av1/common/common.h create mode 100644 third_party/aom/av1/common/common_data.c create mode 100644 third_party/aom/av1/common/common_data.h create mode 100644 third_party/aom/av1/common/convolve.c create mode 100644 third_party/aom/av1/common/convolve.h create mode 100644 third_party/aom/av1/common/debugmodes.c create mode 100644 third_party/aom/av1/common/entropy.c create mode 100644 third_party/aom/av1/common/entropy.h create mode 100644 third_party/aom/av1/common/entropymode.c create mode 100644 third_party/aom/av1/common/entropymode.h create mode 100644 third_party/aom/av1/common/entropymv.c create mode 100644 third_party/aom/av1/common/entropymv.h create mode 100644 third_party/aom/av1/common/enums.h create mode 100644 third_party/aom/av1/common/filter.h create mode 100644 third_party/aom/av1/common/frame_buffers.c create mode 100644 third_party/aom/av1/common/frame_buffers.h create mode 100644 third_party/aom/av1/common/idct.c create mode 100644 third_party/aom/av1/common/idct.h create mode 100644 third_party/aom/av1/common/mv.h create mode 100644 third_party/aom/av1/common/mvref_common.c create mode 100644 third_party/aom/av1/common/mvref_common.h create mode 100644 third_party/aom/av1/common/obmc.h create mode 100644 third_party/aom/av1/common/obu_util.c create mode 100644 third_party/aom/av1/common/obu_util.h create mode 100644 third_party/aom/av1/common/ppc/cfl_ppc.c create mode 
100644 third_party/aom/av1/common/pred_common.c create mode 100644 third_party/aom/av1/common/pred_common.h create mode 100644 third_party/aom/av1/common/quant_common.c create mode 100644 third_party/aom/av1/common/quant_common.h create mode 100644 third_party/aom/av1/common/reconinter.c create mode 100644 third_party/aom/av1/common/reconinter.h create mode 100644 third_party/aom/av1/common/reconinter_template.inc create mode 100644 third_party/aom/av1/common/reconintra.c create mode 100644 third_party/aom/av1/common/reconintra.h create mode 100644 third_party/aom/av1/common/resize.c create mode 100644 third_party/aom/av1/common/resize.h create mode 100644 third_party/aom/av1/common/restoration.c create mode 100644 third_party/aom/av1/common/restoration.h create mode 100644 third_party/aom/av1/common/scale.c create mode 100644 third_party/aom/av1/common/scale.h create mode 100644 third_party/aom/av1/common/scan.c create mode 100644 third_party/aom/av1/common/scan.h create mode 100644 third_party/aom/av1/common/seg_common.c create mode 100644 third_party/aom/av1/common/seg_common.h create mode 100644 third_party/aom/av1/common/thread_common.c create mode 100644 third_party/aom/av1/common/thread_common.h create mode 100644 third_party/aom/av1/common/tile_common.c create mode 100644 third_party/aom/av1/common/tile_common.h create mode 100644 third_party/aom/av1/common/timing.c create mode 100644 third_party/aom/av1/common/timing.h create mode 100644 third_party/aom/av1/common/token_cdfs.h create mode 100644 third_party/aom/av1/common/txb_common.c create mode 100644 third_party/aom/av1/common/txb_common.h create mode 100644 third_party/aom/av1/common/warped_motion.c create mode 100644 third_party/aom/av1/common/warped_motion.h create mode 100644 third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c create mode 100644 third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c create mode 100644 third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c create mode 100644 
third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h create mode 100644 third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c create mode 100644 third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h create mode 100644 third_party/aom/av1/common/x86/av1_txfm_sse2.h create mode 100644 third_party/aom/av1/common/x86/av1_txfm_sse4.c create mode 100644 third_party/aom/av1/common/x86/av1_txfm_sse4.h create mode 100644 third_party/aom/av1/common/x86/cdef_block_avx2.c create mode 100644 third_party/aom/av1/common/x86/cdef_block_sse2.c create mode 100644 third_party/aom/av1/common/x86/cdef_block_sse4.c create mode 100644 third_party/aom/av1/common/x86/cdef_block_ssse3.c create mode 100644 third_party/aom/av1/common/x86/cfl_avx2.c create mode 100644 third_party/aom/av1/common/x86/cfl_simd.h create mode 100644 third_party/aom/av1/common/x86/cfl_sse2.c create mode 100644 third_party/aom/av1/common/x86/cfl_ssse3.c create mode 100644 third_party/aom/av1/common/x86/convolve_2d_avx2.c create mode 100644 third_party/aom/av1/common/x86/convolve_2d_sse2.c create mode 100644 third_party/aom/av1/common/x86/convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/convolve_sse2.c create mode 100644 third_party/aom/av1/common/x86/filterintra_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c create mode 100644 third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h create mode 100644 third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c create mode 100644 
third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c create mode 100644 third_party/aom/av1/common/x86/intra_edge_sse4.c create mode 100644 third_party/aom/av1/common/x86/jnt_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/jnt_convolve_sse2.c create mode 100644 third_party/aom/av1/common/x86/jnt_convolve_ssse3.c create mode 100644 third_party/aom/av1/common/x86/reconinter_avx2.c create mode 100644 third_party/aom/av1/common/x86/reconinter_sse4.c create mode 100644 third_party/aom/av1/common/x86/reconinter_ssse3.c create mode 100644 third_party/aom/av1/common/x86/resize_ssse3.c create mode 100644 third_party/aom/av1/common/x86/selfguided_avx2.c create mode 100644 third_party/aom/av1/common/x86/selfguided_sse4.c create mode 100644 third_party/aom/av1/common/x86/warp_plane_avx2.c create mode 100644 third_party/aom/av1/common/x86/warp_plane_sse4.c create mode 100644 third_party/aom/av1/common/x86/wiener_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/wiener_convolve_sse2.c (limited to 'third_party/aom/av1/common') diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c new file mode 100644 index 0000000000..2a9a8beb40 --- /dev/null +++ b/third_party/aom/av1/common/alloccommon.c @@ -0,0 +1,506 @@ +/* + * + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_config.h" + +#include "aom_mem/aom_mem.h" + +#include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/cdef_block.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/thread_common.h" + +int av1_get_MBs(int width, int height) { + const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); + const int aligned_height = ALIGN_POWER_OF_TWO(height, 3); + const int mi_cols = aligned_width >> MI_SIZE_LOG2; + const int mi_rows = aligned_height >> MI_SIZE_LOG2; + + const int mb_cols = ROUND_POWER_OF_TWO(mi_cols, 2); + const int mb_rows = ROUND_POWER_OF_TWO(mi_rows, 2); + return mb_rows * mb_cols; +} + +void av1_free_ref_frame_buffers(BufferPool *pool) { + int i; + + for (i = 0; i < pool->num_frame_bufs; ++i) { + if (pool->frame_bufs[i].ref_count > 0 && + pool->frame_bufs[i].raw_frame_buffer.data != NULL) { + pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); + pool->frame_bufs[i].raw_frame_buffer.data = NULL; + pool->frame_bufs[i].raw_frame_buffer.size = 0; + pool->frame_bufs[i].raw_frame_buffer.priv = NULL; + pool->frame_bufs[i].ref_count = 0; + } + aom_free(pool->frame_bufs[i].mvs); + pool->frame_bufs[i].mvs = NULL; + aom_free(pool->frame_bufs[i].seg_map); + pool->frame_bufs[i].seg_map = NULL; + aom_free_frame_buffer(&pool->frame_bufs[i].buf); + } + aom_free(pool->frame_bufs); + pool->frame_bufs = NULL; + pool->num_frame_bufs = 0; +} + +static INLINE void free_cdef_linebuf_conditional( + AV1_COMMON *const cm, const size_t *new_linebuf_size) { + CdefInfo *cdef_info = &cm->cdef_info; + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + if (new_linebuf_size[plane] != cdef_info->allocated_linebuf_size[plane]) { + aom_free(cdef_info->linebuf[plane]); + cdef_info->linebuf[plane] = NULL; + } + } +} + +static INLINE void free_cdef_bufs_conditional(AV1_COMMON *const cm, + uint16_t **colbuf, + uint16_t **srcbuf, + 
const size_t *new_colbuf_size, + const size_t new_srcbuf_size) { + CdefInfo *cdef_info = &cm->cdef_info; + if (new_srcbuf_size != cdef_info->allocated_srcbuf_size) { + aom_free(*srcbuf); + *srcbuf = NULL; + } + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + if (new_colbuf_size[plane] != cdef_info->allocated_colbuf_size[plane]) { + aom_free(colbuf[plane]); + colbuf[plane] = NULL; + } + } +} + +static INLINE void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) { + aom_free(*srcbuf); + *srcbuf = NULL; + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + aom_free(colbuf[plane]); + colbuf[plane] = NULL; + } +} + +static INLINE void free_cdef_row_sync(AV1CdefRowSync **cdef_row_mt, + const int num_mi_rows) { + if (*cdef_row_mt == NULL) return; +#if CONFIG_MULTITHREAD + for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { + if ((*cdef_row_mt)[row_idx].row_mutex_ != NULL) { + pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_); + aom_free((*cdef_row_mt)[row_idx].row_mutex_); + } + if ((*cdef_row_mt)[row_idx].row_cond_ != NULL) { + pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_); + aom_free((*cdef_row_mt)[row_idx].row_cond_); + } + } +#else + (void)num_mi_rows; +#endif // CONFIG_MULTITHREAD + aom_free(*cdef_row_mt); + *cdef_row_mt = NULL; +} + +void av1_free_cdef_buffers(AV1_COMMON *const cm, + AV1CdefWorkerData **cdef_worker, + AV1CdefSync *cdef_sync) { + CdefInfo *cdef_info = &cm->cdef_info; + const int num_mi_rows = cdef_info->allocated_mi_rows; + + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + aom_free(cdef_info->linebuf[plane]); + cdef_info->linebuf[plane] = NULL; + } + // De-allocation of column buffer & source buffer (worker_0). 
+ free_cdef_bufs(cdef_info->colbuf, &cdef_info->srcbuf); + + free_cdef_row_sync(&cdef_sync->cdef_row_mt, num_mi_rows); + + if (cdef_info->allocated_num_workers < 2) return; + if (*cdef_worker != NULL) { + for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--) { + // De-allocation of column buffer & source buffer for remaining workers. + free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf); + } + aom_free(*cdef_worker); + *cdef_worker = NULL; + } +} + +static INLINE void alloc_cdef_linebuf(AV1_COMMON *const cm, uint16_t **linebuf, + const int num_planes) { + CdefInfo *cdef_info = &cm->cdef_info; + for (int plane = 0; plane < num_planes; plane++) { + if (linebuf[plane] == NULL) + CHECK_MEM_ERROR(cm, linebuf[plane], + aom_malloc(cdef_info->allocated_linebuf_size[plane])); + } +} + +static INLINE void alloc_cdef_bufs(AV1_COMMON *const cm, uint16_t **colbuf, + uint16_t **srcbuf, const int num_planes) { + CdefInfo *cdef_info = &cm->cdef_info; + if (*srcbuf == NULL) + CHECK_MEM_ERROR(cm, *srcbuf, + aom_memalign(16, cdef_info->allocated_srcbuf_size)); + + for (int plane = 0; plane < num_planes; plane++) { + if (colbuf[plane] == NULL) + CHECK_MEM_ERROR(cm, colbuf[plane], + aom_malloc(cdef_info->allocated_colbuf_size[plane])); + } +} + +static INLINE void alloc_cdef_row_sync(AV1_COMMON *const cm, + AV1CdefRowSync **cdef_row_mt, + const int num_mi_rows) { + if (*cdef_row_mt != NULL) return; + + CHECK_MEM_ERROR(cm, *cdef_row_mt, + aom_calloc(num_mi_rows, sizeof(**cdef_row_mt))); +#if CONFIG_MULTITHREAD + for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { + CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_, + aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_mutex_))); + pthread_mutex_init((*cdef_row_mt)[row_idx].row_mutex_, NULL); + + CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_, + aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_))); + pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL); + } +#endif // 
CONFIG_MULTITHREAD +} + +void av1_alloc_cdef_buffers(AV1_COMMON *const cm, + AV1CdefWorkerData **cdef_worker, + AV1CdefSync *cdef_sync, int num_workers, + int init_worker) { + const int num_planes = av1_num_planes(cm); + size_t new_linebuf_size[MAX_MB_PLANE] = { 0 }; + size_t new_colbuf_size[MAX_MB_PLANE] = { 0 }; + size_t new_srcbuf_size = 0; + CdefInfo *const cdef_info = &cm->cdef_info; + // Check for configuration change + const int num_mi_rows = + (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int is_num_workers_changed = + cdef_info->allocated_num_workers != num_workers; + const int is_cdef_enabled = + cm->seq_params->enable_cdef && !cm->tiles.large_scale; + + // num-bufs=3 represents ping-pong buffers for top linebuf, + // followed by bottom linebuf. + // ping-pong is to avoid top linebuf over-write by consecutive row. + int num_bufs = 3; + if (num_workers > 1) + num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + + if (is_cdef_enabled) { + // Calculate src buffer size + new_srcbuf_size = sizeof(*cdef_info->srcbuf) * CDEF_INBUF_SIZE; + for (int plane = 0; plane < num_planes; plane++) { + const int shift = + plane == AOM_PLANE_Y ? 
0 : cm->seq_params->subsampling_x; + // Calculate top and bottom line buffer size + const int luma_stride = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); + new_linebuf_size[plane] = sizeof(*cdef_info->linebuf) * num_bufs * + (CDEF_VBORDER << 1) * (luma_stride >> shift); + // Calculate column buffer size + const int block_height = + (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER; + new_colbuf_size[plane] = + sizeof(*cdef_info->colbuf[plane]) * block_height * CDEF_HBORDER; + } + } + + // Free src, line and column buffers for worker 0 in case of reallocation + free_cdef_linebuf_conditional(cm, new_linebuf_size); + free_cdef_bufs_conditional(cm, cdef_info->colbuf, &cdef_info->srcbuf, + new_colbuf_size, new_srcbuf_size); + + // The flag init_worker indicates if cdef_worker has to be allocated for the + // frame. This is passed as 1 always from decoder. At encoder side, it is 0 + // when called for parallel frames during FPMT (where cdef_worker is shared + // across parallel frames) and 1 otherwise. 
+ if (*cdef_worker != NULL && init_worker) { + if (is_num_workers_changed) { + // Free src and column buffers for remaining workers in case of change in + // num_workers + for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--) + free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf); + + aom_free(*cdef_worker); + *cdef_worker = NULL; + } else if (num_workers > 1) { + // Free src and column buffers for remaining workers in case of + // reallocation + for (int idx = num_workers - 1; idx >= 1; idx--) + free_cdef_bufs_conditional(cm, (*cdef_worker)[idx].colbuf, + &(*cdef_worker)[idx].srcbuf, new_colbuf_size, + new_srcbuf_size); + } + } + + if (cdef_info->allocated_mi_rows != num_mi_rows) + free_cdef_row_sync(&cdef_sync->cdef_row_mt, cdef_info->allocated_mi_rows); + + // Store allocated sizes for reallocation + cdef_info->allocated_srcbuf_size = new_srcbuf_size; + av1_copy(cdef_info->allocated_colbuf_size, new_colbuf_size); + av1_copy(cdef_info->allocated_linebuf_size, new_linebuf_size); + // Store configuration to check change in configuration + cdef_info->allocated_mi_rows = num_mi_rows; + cdef_info->allocated_num_workers = num_workers; + + if (!is_cdef_enabled) return; + + // Memory allocation of column buffer & source buffer (worker_0). + alloc_cdef_bufs(cm, cdef_info->colbuf, &cdef_info->srcbuf, num_planes); + alloc_cdef_linebuf(cm, cdef_info->linebuf, num_planes); + + if (num_workers < 2) return; + + if (init_worker) { + if (*cdef_worker == NULL) + CHECK_MEM_ERROR(cm, *cdef_worker, + aom_calloc(num_workers, sizeof(**cdef_worker))); + + // Memory allocation of column buffer & source buffer for remaining workers. 
+ for (int idx = num_workers - 1; idx >= 1; idx--) + alloc_cdef_bufs(cm, (*cdef_worker)[idx].colbuf, + &(*cdef_worker)[idx].srcbuf, num_planes); + } + + alloc_cdef_row_sync(cm, &cdef_sync->cdef_row_mt, + cdef_info->allocated_mi_rows); +} + +// Allocate buffers which are independent of restoration_unit_size +void av1_alloc_restoration_buffers(AV1_COMMON *cm, bool is_sgr_enabled) { + const int num_planes = av1_num_planes(cm); + + if (cm->rst_tmpbuf == NULL && is_sgr_enabled) { + CHECK_MEM_ERROR(cm, cm->rst_tmpbuf, + (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE)); + } + + if (cm->rlbs == NULL) { + CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers))); + } + + // For striped loop restoration, we divide each plane into "stripes", + // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET + // luma pixels to match the output from CDEF. We will need to store 2 * + // RESTORATION_CTX_VERT lines of data for each stripe. + int mi_h = cm->mi_params.mi_rows; + const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2); + const int num_stripes = (ext_h + 63) / 64; + + // Now we need to allocate enough space to store the line buffers for the + // stripes + const int frame_w = cm->superres_upscaled_width; + const int use_highbd = cm->seq_params->use_highbitdepth; + + for (int p = 0; p < num_planes; ++p) { + const int is_uv = p > 0; + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ; + const int stride = ALIGN_POWER_OF_TWO(plane_w, 5); + const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT + << use_highbd; + RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; + + if (buf_size != boundaries->stripe_boundary_size || + boundaries->stripe_boundary_above == NULL || + boundaries->stripe_boundary_below == NULL) { + aom_free(boundaries->stripe_boundary_above); + aom_free(boundaries->stripe_boundary_below); + + 
CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above, + (uint8_t *)aom_memalign(32, buf_size)); + CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below, + (uint8_t *)aom_memalign(32, buf_size)); + + boundaries->stripe_boundary_size = buf_size; + } + boundaries->stripe_boundary_stride = stride; + } +} + +void av1_free_restoration_buffers(AV1_COMMON *cm) { + int p; + for (p = 0; p < MAX_MB_PLANE; ++p) + av1_free_restoration_struct(&cm->rst_info[p]); + aom_free(cm->rst_tmpbuf); + cm->rst_tmpbuf = NULL; + aom_free(cm->rlbs); + cm->rlbs = NULL; + for (p = 0; p < MAX_MB_PLANE; ++p) { + RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; + aom_free(boundaries->stripe_boundary_above); + aom_free(boundaries->stripe_boundary_below); + boundaries->stripe_boundary_above = NULL; + boundaries->stripe_boundary_below = NULL; + } + + aom_free_frame_buffer(&cm->rst_frame); +} + +void av1_free_above_context_buffers(CommonContexts *above_contexts) { + int i; + const int num_planes = above_contexts->num_planes; + + for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) { + for (i = 0; i < num_planes; i++) { + if (above_contexts->entropy[i] == NULL) break; + aom_free(above_contexts->entropy[i][tile_row]); + above_contexts->entropy[i][tile_row] = NULL; + } + if (above_contexts->partition != NULL) { + aom_free(above_contexts->partition[tile_row]); + above_contexts->partition[tile_row] = NULL; + } + + if (above_contexts->txfm != NULL) { + aom_free(above_contexts->txfm[tile_row]); + above_contexts->txfm[tile_row] = NULL; + } + } + for (i = 0; i < num_planes; i++) { + aom_free(above_contexts->entropy[i]); + above_contexts->entropy[i] = NULL; + } + aom_free(above_contexts->partition); + above_contexts->partition = NULL; + + aom_free(above_contexts->txfm); + above_contexts->txfm = NULL; + + above_contexts->num_tile_rows = 0; + above_contexts->num_mi_cols = 0; + above_contexts->num_planes = 0; +} + +void av1_free_context_buffers(AV1_COMMON *cm) { + if 
(cm->mi_params.free_mi != NULL) cm->mi_params.free_mi(&cm->mi_params); + + av1_free_above_context_buffers(&cm->above_contexts); +} + +int av1_alloc_above_context_buffers(CommonContexts *above_contexts, + int num_tile_rows, int num_mi_cols, + int num_planes) { + const int aligned_mi_cols = + ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2); + + // Allocate above context buffers + above_contexts->num_tile_rows = num_tile_rows; + above_contexts->num_mi_cols = aligned_mi_cols; + above_contexts->num_planes = num_planes; + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->entropy[0])); + if (!above_contexts->entropy[plane_idx]) return 1; + } + + above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->partition)); + if (!above_contexts->partition) return 1; + + above_contexts->txfm = + (TXFM_CONTEXT **)aom_calloc(num_tile_rows, sizeof(above_contexts->txfm)); + if (!above_contexts->txfm) return 1; + + for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) { + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx][tile_row] = + (ENTROPY_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->entropy[0][tile_row])); + if (!above_contexts->entropy[plane_idx][tile_row]) return 1; + } + + above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->partition[tile_row])); + if (!above_contexts->partition[tile_row]) return 1; + + above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row])); + if (!above_contexts->txfm[tile_row]) return 1; + } + + return 0; +} + +// Allocate the dynamically allocated arrays in 'mi_params' assuming +// 'mi_params->set_mb_mi()' was already called earlier to initialize the rest of +// the struct members. 
+static int alloc_mi(CommonModeInfoParams *mi_params) { + const int aligned_mi_rows = calc_mi_size(mi_params->mi_rows); + const int mi_grid_size = mi_params->mi_stride * aligned_mi_rows; + const int alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int alloc_mi_size = + mi_params->mi_alloc_stride * (aligned_mi_rows / alloc_size_1d); + + if (mi_params->mi_alloc_size < alloc_mi_size || + mi_params->mi_grid_size < mi_grid_size) { + mi_params->free_mi(mi_params); + + mi_params->mi_alloc = + aom_calloc(alloc_mi_size, sizeof(*mi_params->mi_alloc)); + if (!mi_params->mi_alloc) return 1; + mi_params->mi_alloc_size = alloc_mi_size; + + mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc( + mi_grid_size, sizeof(*mi_params->mi_grid_base)); + if (!mi_params->mi_grid_base) return 1; + + mi_params->tx_type_map = + aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map)); + if (!mi_params->tx_type_map) return 1; + mi_params->mi_grid_size = mi_grid_size; + } + + return 0; +} + +int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height, + BLOCK_SIZE min_partition_size) { + CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->set_mb_mi(mi_params, width, height, min_partition_size); + if (alloc_mi(mi_params)) goto fail; + return 0; + +fail: + // clear the mi_* values to force a realloc on resync + mi_params->set_mb_mi(mi_params, 0, 0, BLOCK_4X4); + av1_free_context_buffers(cm); + return 1; +} + +void av1_remove_common(AV1_COMMON *cm) { + av1_free_context_buffers(cm); + + aom_free(cm->fc); + cm->fc = NULL; + aom_free(cm->default_frame_context); + cm->default_frame_context = NULL; +} + +void av1_init_mi_buffers(CommonModeInfoParams *mi_params) { + mi_params->setup_mi(mi_params); +} diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h new file mode 100644 index 0000000000..d31b4c56b6 --- /dev/null +++ b/third_party/aom/av1/common/alloccommon.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, Alliance for 
Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_ +#define AOM_AV1_COMMON_ALLOCCOMMON_H_ + +#define INVALID_IDX -1 // Invalid buffer index. + +#include + +#include "config/aom_config.h" + +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; +struct BufferPool; +struct CommonContexts; +struct CommonModeInfoParams; +struct AV1CdefWorker; +struct AV1CdefSyncData; + +void av1_remove_common(struct AV1Common *cm); + +int av1_alloc_above_context_buffers(struct CommonContexts *above_contexts, + int num_tile_rows, int num_mi_cols, + int num_planes); +void av1_free_above_context_buffers(struct CommonContexts *above_contexts); +int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height, + BLOCK_SIZE min_partition_size); +void av1_init_mi_buffers(struct CommonModeInfoParams *mi_params); +void av1_free_context_buffers(struct AV1Common *cm); + +void av1_free_ref_frame_buffers(struct BufferPool *pool); +void av1_alloc_cdef_buffers(struct AV1Common *const cm, + struct AV1CdefWorker **cdef_worker, + struct AV1CdefSyncData *cdef_sync, int num_workers, + int init_worker); +void av1_free_cdef_buffers(struct AV1Common *const cm, + struct AV1CdefWorker **cdef_worker, + struct AV1CdefSyncData *cdef_sync); +void av1_alloc_restoration_buffers(struct AV1Common *cm, bool is_sgr_enabled); +void av1_free_restoration_buffers(struct AV1Common *cm); + +int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height); +void av1_free_state_buffers(struct 
AV1Common *cm); + +int av1_get_MBs(int width, int height); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ALLOCCOMMON_H_ diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c new file mode 100644 index 0000000000..09e5166b14 --- /dev/null +++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c @@ -0,0 +1,4217 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/enums.h" +#include "av1/common/idct.h" +#include "av1/common/arm/av1_inv_txfm_neon.h" + +// 1D itx types +typedef enum ATTRIBUTE_PACKED { + IDCT_1D, + IADST_1D, + IFLIPADST_1D = IADST_1D, + IIDENTITY_1D, + ITX_TYPES_1D, +} ITX_TYPE_1D; + +static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { + IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, + IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, + IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, + IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, +}; + +static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { + IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, + IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, + IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, + IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, +}; + +// 1D 
functions +static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = { + { av1_idct4, av1_iadst4, av1_iidentity4_c }, + { av1_idct8, av1_iadst8, av1_iidentity8_c }, + { av1_idct16, av1_iadst16, av1_iidentity16_c }, + { av1_idct32, NULL, NULL }, + { av1_idct64, NULL, NULL }, +}; + +static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in, + uint8_t *output, int stride, + int flipud, + const int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + int16x8_t temp_output; + for (int i = 0; i < height; ++i, j += step) { + temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output))); + temp_output = vaddq_s16(temp_output, in[j]); + vst1_u8(output, vqmovun_s16(temp_output)); + output += stride; + } +} + +static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred, + int16x8_t res0, + int16x8_t res1) { + int16x8_t temp_output[2]; + uint8x16_t temp_output_8q; + temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred))); + temp_output[0] = vaddq_s16(temp_output[0], res0); + temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred))); + temp_output[1] = vaddq_s16(temp_output[1], res1); + temp_output_8q = + vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1])); + return temp_output_8q; +} + +static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in, + uint8_t *output, int stride, + int flipud, int height) { + uint8x16_t temp_output_8q; + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp_output_8q = vld1q_u8(output + i * stride); + temp_output_8q = + lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]); + vst1q_u8((output + i * stride), temp_output_8q); + } +} + +static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size, + int value) { + for (int i = 0; i < size; i++) { + a[i] = vdupq_n_s16((int16_t)value); + } +} + +static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + 
v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1, + int16_t coef2, int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0_l, s0_h, s1_l, s1_h; + int16x4_t v0[2], v1[2]; + + s0_l = vmull_n_s16(vget_low_s16(in0), coef1); + s0_h = vmull_n_s16(vget_high_s16(in0), coef1); + s1_l = vmull_n_s16(vget_low_s16(in0), coef2); + s1_h = vmull_n_s16(vget_high_s16(in0), coef2); + + v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0_h, INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = 
vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { + int32x4_t t0[2], t1[2]; + int16x4_t v0[2], v1[2]; + + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + + v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT); + + x[0] = vcombine_s16(v0[0], v0[1]); + x[1] = vcombine_s16(v1[0], v1[1]); +} + +static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { + int16x4_t val = vdup_n_s16(c0); + val = vset_lane_s16(c1, val, 1); + val = vset_lane_s16(c2, val, 2); + val = vset_lane_s16(c3, val, 3); + return val; +} + +static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], 
+ (int16_t)cospi[20], (int16_t)cospi[44]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[8]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + + // Stage 1 + x[0] = in[7]; + x[1] = in[0]; + x[2] = in[5]; + x[3] = in[2]; + x[4] = in[3]; + x[5] = in[4]; + x[6] = in[1]; + x[7] = in[6]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + + // Stage 3 + x[0] = vqaddq_s16(s0, s4); + x[1] = vqaddq_s16(s1, s5); + x[2] = vqaddq_s16(s2, s6); + x[3] = vqaddq_s16(s3, s7); + x[4] = vqsubq_s16(s0, s4); + x[5] = vqsubq_s16(s1, s5); + x[6] = vqsubq_s16(s2, s6); + x[7] = vqsubq_s16(s3, s7); + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + s2 = x[2]; + s3 = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6); + + // Stage 5 + x[0] = vqaddq_s16(s0, s2); + x[1] = vqaddq_s16(s1, s3); + x[2] = vqsubq_s16(s0, s2); + x[3] = vqsubq_s16(s1, s3); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vqnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vqnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vqnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vqnegq_s16(x[1]); +} + +static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[8]; + int16x8_t s0, s1, s4, s5; + + // Stage 1 + x[1] = in[0]; + + // 
Stage 2 + + btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[4] = s0; + x[5] = s1; + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); + + // Stage 5 + x[0] = s0; + x[1] = s1; + x[2] = s0; + x[3] = s1; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vqnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vqnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vqnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vqnegq_s16(x[1]); +} + +static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[8], step2[8]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + // stage 2 + btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]); + + // stage 3 + btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]); + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + + // stage 4 + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]); + + // stage 5 + out[0] = vqaddq_s16(step1[0], step2[7]); + out[1] = vqaddq_s16(step1[1], step1[6]); + out[2] = vqaddq_s16(step1[2], step1[5]); + out[3] = vqaddq_s16(step1[3], step2[4]); + out[4] = vqsubq_s16(step1[3], step2[4]); + out[5] = 
vqsubq_s16(step1[2], step1[5]); + out[6] = vqsubq_s16(step1[1], step1[6]); + out[7] = vqsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]); + + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 4 + // stage 5 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; +} + +void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { + assert(!(size % 4)); + if (!bit) return; + const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit)); + for (int i = 0; i < size; i++) { + arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8); + } +} + +static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) { + int16x8_t temp[8]; + for (int i = 0; i < size; ++i) { + temp[i] = input[size - 1 - i]; + } + for (int i = 0; i < size; ++i) { + input[i] = temp[i]; + } +} + +static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input, + int stride, + int16x8_t *const a, + int out_size) { + for (int i = 0; i < out_size; ++i) { + a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)), + vmovn_s32(vld1q_s32(input + 4))); + input += stride; + } +} + +static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output, + int txw_idx, int8_t size, int bit) { + const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); + int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]); + int16x4_t low_i16, high_i16; + int32x4_t low_i32, high_i32; + for (int i = 0; i < size; i++) { + int32x4_t temp_out_low = 
vmull_s16(vget_low_s16(input[i]), scale); + int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale); + low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4); + high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4); + low_i16 = vqmovn_s32(low_i32); + high_i16 = vqmovn_s32(high_i32); + output[i] = vcombine_s16(low_i16, high_i16); + } +} + +static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, + int size) { + int32x4_t out_low, out_high; + int16x4_t low, high; + + for (int z = 0; z < size; ++z) { + out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2); + out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2); + + low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); + high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); + + output[z] = vcombine_s16(low, high); + } +} + +static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 4 + + t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; +} + +static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], 
(int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]); + btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]); + btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]); + + step2[0] = in[0]; + step2[1] = in[8]; + step2[2] = in[4]; + step2[3] = in[12]; + step2[4] = in[2]; + step2[5] = in[10]; + step2[6] = in[6]; + step2[7] = in[14]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = 
step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], 
step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c1 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[4]; + step2[4] = in[2]; + step2[6] = in[6]; + + btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]); + + // stage 3 + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; 
+ step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]); + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = 
vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[10], (int16_t)cospi[54]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[26], (int16_t)cospi[38]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; + + // Stage 1 + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11); + btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13); + btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + 
x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] = vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = 
vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c5); + btf_16_half_neon(x + 6, c5); + btf_16_half_neon(x + 10, c5); + btf_16_half_neon(x + 14, c5); + + // Stage 9 + out[0] = x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} +/* iadst16_low1_neon: reduced path that reads only in[0]. Stage 2 is a single-input btf_16_neon rotation, stages 3, 5 and 7 are plain copies rather than add/sub pairs, and out[0..15] is written with the same stage-9 permutation and negations as iadst16_neon. Presumably the caller guarantees in[1..15] are zero -- TODO confirm at the call site. */ +static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[10]; + int16x8_t s0, s1, s4, s5; + int16x8_t s8, s9, s12, s13; + + // Stage 1 + x[1] = in[0]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[8] = s0; + x[9] = s1; + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); + + // Stage 5 + x[0] = t[0]; + x[1] = t[1]; + x[4] = t[0]; + x[5] = t[1]; + x[8] = s8; + x[9] = s9; + x[12] = s8; + x[13] = s9; + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); + t[8] = x[8]; + t[9] = x[9]; + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); + + // Stage 7 + x[0] = t[0]; + x[1] = t[1]; + x[2] = t[0]; + x[3] = t[1]; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + x[8] = t[8]; + x[9] = t[9]; + x[10] = t[8]; + x[11] = t[9]; + x[12] = s12; + x[13] = s13; + x[14] = s12; + x[15] = s13; + + // Stage 8 + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, 
c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); + + // Stage 9 + out[0] = x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} +/* iadst16_low8_neon: reduced path that reads only in[0..7]. Stage 2 feeds each input through a single-input btf_16_neon rotation to produce all sixteen s0..s15 terms; stages 3-9 then use the same saturating add/sub, lane butterflies and output permutation as iadst16_neon. Presumably the caller guarantees in[8..15] are zero -- TODO confirm at the call site. */ +static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; + + // Stage 1 + x[1] = in[0]; + x[3] = in[2]; + x[5] = in[4]; + x[7] = in[6]; + x[8] = in[7]; + x[10] = in[5]; + x[12] = in[3]; + x[14] = in[1]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3); + btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5); + btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7); + + btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9); + btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11); + btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13); + btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = 
vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] = vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); + + // Stage 9 + out[0] 
= x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} +/* idct32_neon: full 32-input path. Reads in[0..31], alternates between step2[] and step1[] as the destination through stages 2-8 (lane-constant butterflies from the cospi table selected by cos_bit, plus saturating 16-bit add/sub), and writes out[0..31] in stage 9. c8 and c9 hold the negated values of c6 and c7. */ +static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[34], (int16_t)cospi[30]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[50], (int16_t)cospi[14]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c8 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c9 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]); + btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]); + btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]); + 
btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]); + btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]); + btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]); + btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]); + + step2[0] = in[0]; + step2[1] = in[16]; + step2[2] = in[8]; + step2[3] = in[24]; + step2[4] = in[4]; + step2[5] = in[20]; + step2[6] = in[12]; + step2[7] = in[28]; + step2[8] = in[2]; + step2[9] = in[18]; + step2[10] = in[10]; + step2[11] = in[26]; + step2[12] = in[6]; + step2[13] = in[22]; + step2[14] = in[14]; + step2[15] = in[30]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]); + btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]); + btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]); + btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + step1[4] = step2[4]; + step1[5] = step2[5]; + step1[6] = step2[6]; + step1[7] = step2[7]; + + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = vqaddq_s16(step2[19], step2[18]); + step1[20] = vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[4], step1[7], 
c6, &step2[7], &step2[4]); + btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]); + btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]); + btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]); + btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = 
vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[1], step1[2]); + step2[2] = vqsubq_s16(step1[1], step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); 
+ step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], 
step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} +/* idct32_low1_neon: reduced path that reads only in[0]. The vector is widened and multiplied by cospi[32], then rounded, shifted right by INV_COS_BIT and narrowed back to 16 bits; that one result is stored to every out[0..31]. cos_bit only selects the cospi table here, the shift amount is the INV_COS_BIT constant. Presumably the caller guarantees in[1..31] are zero -- TODO confirm at the call site. */ +static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + + t32[0] = 
vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; + out[16] = step1; + out[17] = step1; + out[18] = step1; + out[19] = step1; + out[20] = step1; + out[21] = step1; + out[22] = step1; + out[23] = step1; + out[24] = step1; + out[25] = step1; + out[26] = step1; + out[27] = step1; + out[28] = step1; + out[29] = step1; + out[30] = step1; + out[31] = step1; +} +/* idct32_low8_neon: reduced path that reads only in[0..7]. Terms with a single live input go through single-input btf_16_neon rotations, the remaining terms through lane-constant butterflies and saturating 16-bit add/sub, and out[0..31] is written in stage 9. c2 and c3 hold the negated values of c0 and c1; every lane is narrowed to int16_t explicitly before being packed. Presumably the caller guarantees in[8..31] are zero -- TODO confirm at the call site. */ +static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[4]; + step2[8] = in[2]; + step2[12] = in[6]; + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); + + // stage 3 + step1[0] = step2[0]; + step1[4] = step2[4]; + + 
btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[16] = step2[16]; + step1[17] = step2[16]; + step1[18] = step2[19]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[21] = step2[20]; + step1[22] = step2[23]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[24]; + step1[26] = step2[27]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[29] = step2[28]; + step1[30] = step2[31]; + step1[31] = step2[31]; + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[8] = step1[8]; + step2[9] = step1[8]; + step2[10] = step1[11]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[13] = step1[12]; + step2[14] = step1[15]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); + + step1[4] = step2[4]; + step1[5] = step2[4]; + step1[6] = step2[7]; + step1[7] = step2[7]; + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = 
vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); + + step2[0] = step1[0]; + step2[1] = step1[0]; + step2[2] = step1[0]; + step2[3] = step1[0]; + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = 
vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = 
vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + +static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t 
*out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]); + btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]); + btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]); + btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); + + step2[0] = in[0]; + step2[2] = in[8]; + step2[4] = in[4]; + step2[6] = in[12]; + step2[8] = in[2]; + step2[10] = in[10]; + step2[12] = in[6]; + step2[14] = in[14]; + + // stage 3 + + btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]); + btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = vqaddq_s16(step2[19], step2[18]); + step1[20] = 
vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]); + btf_16_lane_2_3_neon(step2[14], step2[9], c1, 
&step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[0], step1[2]); + step2[2] = vqsubq_s16(step1[0], step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + 
step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + 
btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = 
vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +}
+// Stage 9 of the 64-point inverse DCT (idct64_low32_neon calls it under its
+// "stage 9" marker). Consumes all 64 vectors of |step2| and produces all 64
+// vectors of |step1|.
+//   step2   - input vectors (results of the previous stage)
+//   step1   - output vectors; expected to be a different array from |step2|,
+//             since outputs are stored while inputs are still being read
+//   cos_bit - passed to cospi_arr() to pick the cosine table
+static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1,
+                                      int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  // Lanes 0 and 1 both hold cospi[32]; lanes 2 and 3 hold cospi[16] and
+  // cospi[48].
+  const int16x4_t rot_consts =
+      set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                     (int16_t)cospi[16], (int16_t)cospi[48]);
+
+  // btf_16_lane_0_1_neon on the mirrored pairs (27,20), (26,21), (25,22),
+  // (24,23).
+  for (int k = 0; k < 4; ++k) {
+    btf_16_lane_0_1_neon(step2[27 - k], step2[20 + k], rot_consts,
+                         &step1[27 - k], &step1[20 + k]);
+  }
+
+  // 0..15: saturating sum goes to the low index, saturating (low - high)
+  // difference goes to the mirrored high index.
+  for (int i = 0; i < 8; ++i) {
+    const int j = 15 - i;
+    step1[i] = vqaddq_s16(step2[i], step2[j]);
+    step1[j] = vqsubq_s16(step2[i], step2[j]);
+  }
+
+  // 16..19 and 28..31 are copied through unchanged.
+  for (int i = 0; i < 4; ++i) {
+    step1[16 + i] = step2[16 + i];
+    step1[28 + i] = step2[28 + i];
+  }
+
+  // 32..47: same butterfly shape as 0..15, mirrored about 39|40.
+  for (int i = 0; i < 8; ++i) {
+    const int lo = 32 + i;
+    const int hi = 47 - i;
+    step1[lo] = vqaddq_s16(step2[lo], step2[hi]);
+    step1[hi] = vqsubq_s16(step2[lo], step2[hi]);
+  }
+
+  // 48..63: roles are swapped - the (high - low) difference goes to the low
+  // index and the sum goes to the mirrored high index.
+  for (int i = 0; i < 8; ++i) {
+    const int lo = 48 + i;
+    const int hi = 63 - i;
+    step1[lo] = vqsubq_s16(step2[hi], step2[lo]);
+    step1[hi] = vqaddq_s16(step2[hi], step2[lo]);
+  }
+}
+
+// Stage 10 of the 64-point inverse DCT (idct64_low32_neon calls it under its
+// "stage 10" marker). Consumes all 64 vectors of |step1| and produces all 64
+// vectors of |step2|.
+//   step1   - input vectors (results of stage 9)
+//   step2   - output vectors; expected to be a different array from |step1|,
+//             since outputs are stored while inputs are still being read
+//   cos_bit - passed to cospi_arr() to pick the cosine table
+static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2,
+                                       int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  // Lanes 0 and 1 both hold cospi[32]; lanes 2 and 3 hold cospi[16] and
+  // cospi[48].
+  const int16x4_t rot_consts =
+      set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
+                     (int16_t)cospi[16], (int16_t)cospi[48]);
+
+  // btf_16_lane_0_1_neon on the mirrored pairs (55,40), (54,41), ... (48,47).
+  for (int k = 0; k < 8; ++k) {
+    btf_16_lane_0_1_neon(step1[55 - k], step1[40 + k], rot_consts,
+                         &step2[55 - k], &step2[40 + k]);
+  }
+
+  // 0..31: saturating sum goes to the low index, saturating (low - high)
+  // difference goes to the mirrored high index.
+  for (int i = 0; i < 16; ++i) {
+    const int j = 31 - i;
+    step2[i] = vqaddq_s16(step1[i], step1[j]);
+    step2[j] = vqsubq_s16(step1[i], step1[j]);
+  }
+
+  // 32..39 and 56..63 are copied through unchanged.
+  for (int i = 0; i < 8; ++i) {
+    step2[32 + i] = step1[32 + i];
+    step2[56 + i] = step1[56 + i];
+  }
+}
+
+static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[16]; + step2[4] = in[8]; + step2[6] = in[24]; + step2[8] = in[4]; + step2[10] = in[20]; + step2[12] = in[12]; + step2[14] = in[28]; + step2[16] = in[2]; + step2[18] = in[18]; + step2[20] = in[10]; + step2[22] = in[26]; + step2[24] = in[6]; + step2[26] = in[22]; + step2[28] = in[14]; + step2[30] = in[30]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33],
&step2[62]); + btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]); + btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]); + btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]); + btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[8] = step2[8]; + step1[10] = step2[10]; + step1[12] = step2[12]; + step1[14] = step2[14]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]); + btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]); + btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]); + btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = vqaddq_s16(step2[32], step2[33]); + step1[33] = vqsubq_s16(step2[32], step2[33]); + step1[34] = vqsubq_s16(step2[35], step2[34]); + step1[35] = vqaddq_s16(step2[35], step2[34]); + step1[36] = vqaddq_s16(step2[36], 
step2[37]); + step1[37] = vqsubq_s16(step2[36], step2[37]); + step1[38] = vqsubq_s16(step2[39], step2[38]); + step1[39] = vqaddq_s16(step2[39], step2[38]); + step1[40] = vqaddq_s16(step2[40], step2[41]); + step1[41] = vqsubq_s16(step2[40], step2[41]); + step1[42] = vqsubq_s16(step2[43], step2[42]); + step1[43] = vqaddq_s16(step2[43], step2[42]); + step1[44] = vqaddq_s16(step2[44], step2[45]); + step1[45] = vqsubq_s16(step2[44], step2[45]); + step1[46] = vqsubq_s16(step2[47], step2[46]); + step1[47] = vqaddq_s16(step2[47], step2[46]); + step1[48] = vqaddq_s16(step2[48], step2[49]); + step1[49] = vqsubq_s16(step2[48], step2[49]); + step1[50] = vqsubq_s16(step2[51], step2[50]); + step1[51] = vqaddq_s16(step2[51], step2[50]); + step1[52] = vqaddq_s16(step2[52], step2[53]); + step1[53] = vqsubq_s16(step2[52], step2[53]); + step1[54] = vqsubq_s16(step2[55], step2[54]); + step1[55] = vqaddq_s16(step2[55], step2[54]); + step1[56] = vqaddq_s16(step2[56], step2[57]); + step1[57] = vqsubq_s16(step2[56], step2[57]); + step1[58] = vqsubq_s16(step2[59], step2[58]); + step1[59] = vqaddq_s16(step2[59], step2[58]); + step1[60] = vqaddq_s16(step2[60], step2[61]); + step1[61] = vqsubq_s16(step2[60], step2[61]); + step1[62] = vqsubq_s16(step2[63], step2[62]); + step1[63] = vqaddq_s16(step2[63], step2[62]); + + // stage 4 + + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], 
&step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = vqaddq_s16(step1[16], step1[17]); + step2[17] = vqsubq_s16(step1[16], step1[17]); + step2[18] = vqsubq_s16(step1[19], step1[18]); + step2[19] = vqaddq_s16(step1[19], step1[18]); + step2[20] = vqaddq_s16(step1[20], step1[21]); + step2[21] = vqsubq_s16(step1[20], step1[21]); + step2[22] = vqsubq_s16(step1[23], step1[22]); + step2[23] = vqaddq_s16(step1[23], step1[22]); + step2[24] = vqaddq_s16(step1[24], step1[25]); + step2[25] = vqsubq_s16(step1[24], step1[25]); + step2[26] = vqsubq_s16(step1[27], step1[26]); + step2[27] = vqaddq_s16(step1[27], step1[26]); + step2[28] = vqaddq_s16(step1[28], step1[29]); + step2[29] = vqsubq_s16(step1[28], step1[29]); + step2[30] = vqsubq_s16(step1[31], step1[30]); + step2[31] = vqaddq_s16(step1[31], step1[30]); + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + step1[2] = step2[2]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = 
vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + step1[16] = step2[16]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], 
step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = vqsubq_s16(step1[17], step1[18]); + step2[19] = vqsubq_s16(step1[16], step1[19]); + step2[20] = vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], step1[29]); + step2[30] = vqaddq_s16(step1[30], 
step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = 
vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + 
step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = 
vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], 
step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +}
+
+// 64-point inverse DCT for the case where only input[0] is used. No other
+// element of |input| is read. Every one of the 64 output vectors receives the
+// same value: input[0] scaled by cospi[32] and narrowed back to 16 bits with
+// a rounding right shift by INV_COS_BIT.
+//   input   - source vectors; only input[0] is consulted
+//   out     - destination, 64 vectors written
+//   cos_bit - passed to cospi_arr() to pick the cosine table
+static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out,
+                                    int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+
+  // stages 1-6: the only arithmetic is the cospi[32] scaling, done as a
+  // widening multiply on each half of the vector.
+  const int32x4_t wide_lo = vmull_n_s16(vget_low_s16(input[0]), cospi[32]);
+  const int32x4_t wide_hi = vmull_n_s16(vget_high_s16(input[0]), cospi[32]);
+  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(wide_lo, INV_COS_BIT),
+                                    vrshrn_n_s32(wide_hi, INV_COS_BIT));
+
+  // stages 7-11: nothing left to combine, so the value is simply replicated.
+  for (int i = 0; i < 64; ++i) {
+    out[i] = dc;
+  }
+}
+
+static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[8] = in[4]; + step2[16] = in[2]; + step2[24] = in[6]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[8] = step2[8]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); +
btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[30] = step1[31]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + step1[16] = step2[16]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[31] = step2[31]; + step1[32] = step2[32]; + step1[33] = step2[33]; + step1[34] = step2[33]; + step1[35] = step2[32]; + step1[36] = step2[39]; + step1[37] = step2[38]; + step1[38] = step2[38]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[41]; + step1[42] = step2[41]; + step1[43] = step2[40]; + step1[44] = step2[47]; + 
step1[45] = step2[46]; + step1[46] = step2[46]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[49]; + step1[50] = step2[49]; + step1[51] = step2[48]; + step1[52] = step2[55]; + step1[53] = step2[54]; + step1[54] = step2[54]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[57]; + step1[58] = step2[57]; + step1[59] = step2[56]; + step1[60] = step2[63]; + step1[61] = step2[62]; + step1[62] = step2[62]; + step1[63] = step2[63]; + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[17]; + step2[19] = step1[16]; + step2[20] = step1[23]; + step2[21] = step1[22]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[26] = step1[25]; + step2[27] = step1[24]; + step2[28] = step1[31]; + step2[29] = step1[30]; + step2[30] = step1[30]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + 
step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[10] = step2[9]; + step1[11] = step2[8]; + step1[12] = step2[15]; + step1[13] = step2[14]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = 
vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[3]; + step2[5] = step1[2]; + step2[6] = step1[1]; + step2[7] = step1[0]; + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = 
vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = 
vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = 
set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[8]; + step2[8] = in[4]; + step2[12] = in[12]; + step2[16] = in[2]; + step2[20] = in[10]; + step2[24] = in[6]; + step2[28] = in[14]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[4] = step2[4]; + step1[8] = step2[8]; + step1[12] = step2[12]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + btf_16_neon(step2[28], -cospi[50], cospi[14], 
&step1[19], &step1[28]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[34] = step2[35]; + step1[35] = step2[35]; + step1[36] = step2[36]; + step1[37] = step2[36]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[42] = step2[43]; + step1[43] = step2[43]; + step1[44] = step2[44]; + step1[45] = step2[44]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[50] = step2[51]; + step1[51] = step2[51]; + step1[52] = step2[52]; + step1[53] = step2[52]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[58] = step2[59]; + step1[59] = step2[59]; + step1[60] = step2[60]; + step1[61] = step2[60]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + step2[4] = step1[4]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = 
step1[31]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], 
step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); + + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = vqsubq_s16(step1[17], step1[18]); + step2[19] = vqsubq_s16(step1[16], step1[19]); + step2[20] 
= vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], step1[29]); + step2[30] = vqaddq_s16(step1[30], step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + 
step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + 
btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + 
// stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], 
step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +// Functions for blocks with eob at DC and within +// topleft 8x8, 16x16, 32x32 corner +static const transform_neon + lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct8_low1_neon, idct8_neon, NULL, NULL }, + { iadst8_low1_neon, iadst8_neon, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL }, + { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon, + idct64_low32_neon }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t 
*input, + uint8_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)tx_type; + int16x8_t a[32 * 4]; + int16x8_t b[32 * 4]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int input_stride = txfm_size_row; + int temp_b = 0; + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + int16x8_t *cur_a = &a[i * txfm_size_col]; + load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, + buf_size_nonzero_w); + input += 8; + if (abs(rect_type) == 1) { + round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); + } + identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w, + -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( + const int32_t 
*input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int input_stride = txfm_size_row; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + int temp_b = 0; + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + int16x8_t *cur_a = &a[i * txfm_size_col]; + load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, + buf_size_nonzero_w); + input += 8; + if (abs(rect_type) == 1) { + round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); + } + row_txfm(cur_a, cur_a, INV_COS_BIT); + av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + flip_buf_ud_neon(&cur_a[j * 8], 8); + transpose_arrays_s16_8x8( + &cur_a[j * 8], + &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * 
txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int input_stride = txfm_size_row; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + int temp_b = 0; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + int16x8_t *cur_a = &a[i * txfm_size_col]; + load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, + buf_size_nonzero_w); + input += 8; + if (abs(rect_type) == 1) { + round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); + } + identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w, + -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_arrays_s16_8x8(&cur_a[j * 8], 
&b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X4; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 }; + int r; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int c = 0; c < txfm_size_col; ++c) + temp_in[c] = input[c * txfm_size_row]; + row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); + + input++; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + 
// flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, 16); + col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + clip_pixel(output[r * stride + c] + temp_out[r]); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = clip_pixel(output[r * stride + c] + + temp_out[txfm_size_row - r - 1]); + } + } + } +} + +void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X8; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 16 }; + int r; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int c = 0; c < txfm_size_col; c++) + temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2, + NewSqrt2Bits); + + row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); + input++; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 
0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, 16); + col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + clip_pixel(output[r * stride + c] + temp_out[r]); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = clip_pixel(output[r * stride + c] + + temp_out[txfm_size_row - r - 1]); + } + } + } +} + +void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_8X4; + DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 16 }; + int r; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int c = 0; c < txfm_size_col; c++) + temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2, + NewSqrt2Bits); + + row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); + input++; + buf_ptr += txfm_size_col; + 
} + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, 16); + col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + clip_pixel(output[r * stride + c] + temp_out[r]); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = clip_pixel(output[r * stride + c] + + temp_out[txfm_size_row - r - 1]); + } + } + } +} + +void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X16; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; + int r; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int c = 0; c < txfm_size_col; c++) + temp_in[c] = input[c * txfm_size_row]; + row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); 
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input++; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, 16); + col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + clip_pixel(output[r * stride + c] + temp_out[r]); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = clip_pixel(output[r * stride + c] + + temp_out[txfm_size_row - r - 1]); + } + } + } +} + +void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_16X4; + DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; + int r; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int c = 0; c < txfm_size_col; 
c++) + temp_in[c] = input[c * txfm_size_row]; + row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input++; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, 16); + col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + clip_pixel(output[r * stride + c] + temp_out[r]); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = clip_pixel(output[r * stride + c] + + temp_out[txfm_size_row - r - 1]); + } + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[64 * 8]; + int16x8_t b[64 * 8]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + int temp_b = 0; + + const transform_neon row_txfm = + 
lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + int16x8_t *cur_a = &a[i * txfm_size_col]; + load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, + buf_size_nonzero_w); + input += 8; + if (abs(rect_type) == 1) { + round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); + } + row_txfm(cur_a, cur_a, INV_COS_BIT); + av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + flip_buf_ud_neon(&cur_a[j * 8], 8); + transpose_arrays_s16_8x8( + &cur_a[j * 8], + &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_universe_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case IDTX: + lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size, + eob); + break; + + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type, 
+ tx_size, eob); + break; + + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + default: + lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + switch (tx_size) { + case TX_4X4: + lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob); + break; + + case TX_4X8: + lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob); + break; + + case TX_8X4: + lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob); + break; + + case TX_4X16: + lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob); + break; + + case TX_16X4: + lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob); + break; + + default: + lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} +void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h new file mode 100644 index 0000000000..97099c2042 --- /dev/null +++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ +#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "av1/common/enums.h" +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/av1_txfm.h" + +typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output, + const int8_t cos_bit, + const int8_t *stage_ptr); +typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output, + int8_t cos_bit); + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x16_default[16]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x32_default[32]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { + 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, +}; + 
// Maps an eobx/eoby value (0..31) to the last index of
// lowbd_txfm_all_1d_zeros_w_arr, i.e. to which variant of a 1-D kernel is
// used. For example the 32-point DCT row of that table is
// { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon }.
static const int lowbd_txfm_all_1d_zeros_idx[32] = {
  // position 0
  0,
  // positions 1..7
  1, 1, 1, 1, 1, 1, 1,
  // positions 8..15
  2, 2, 2, 2, 2, 2, 2, 2,
  // positions 16..31
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
};
4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, +}; + +static int eob_fill[32] = { + 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, +}; + +static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + if (eob == 1) { + *eobx = 0; + *eoby = 0; + return; + } + + const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; + const int eob_row = (eob - 1) >> tx_w_log2; + const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; + *eobx = eobxy & 0xFF; + *eoby = eobxy >> 8; +} + +static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_row = tx_size_high[tx_size]; + const int eoby_max = AOMMIN(32, txfm_size_row) - 1; + *eobx = eob / (eoby_max + 1); + *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob]; +} + +static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_col = tx_size_wide[tx_size]; + const int eobx_max = AOMMIN(32, txfm_size_col) - 1; + *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob]; + const int temp_eoby = eob / (eobx_max + 1); + assert(temp_eoby < 32); + *eoby = eob_fill[temp_eoby]; +} + +#endif // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ diff --git a/third_party/aom/av1/common/arm/av1_txfm_neon.c b/third_party/aom/av1/common/arm/av1_txfm_neon.c new file mode 100644 index 0000000000..f955a379f7 --- /dev/null +++ b/third_party/aom/av1/common/arm/av1_txfm_neon.c @@ -0,0 +1,30 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
// Shifts every element of `arr` in place, four lanes at a time, using
// vrshlq_s32 with a shift count of -bit: a rounding right shift by `bit` when
// bit > 0, a plain (non-saturating) left shift by -bit when bit < 0, and no
// work at all when bit == 0.
//
//   arr   array of `size` 32-bit values, updated in place
//   size  element count; must be a multiple of 4
//   bit   signed shift amount as described above
void av1_round_shift_array_neon(int32_t *arr, int size, int bit) {
  assert(size % 4 == 0);
  if (bit == 0) return;

  const int32x4_t shift = vdupq_n_s32(-bit);
  for (int i = 0; i < size; i += 4) {
    vst1q_s32(arr + i, vrshlq_s32(vld1q_s32(arr + i), shift));
  }
}
+ */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/blend_neon.h" +#include "aom_dsp/arm/mem_neon.h" + +void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 2); + assert(w >= 2); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (w > 8) { + do { + int i = 0; + do { + uint8x16_t m0 = vld1q_u8(mask + i); + uint8x16_t s0 = vld1q_u8(src0 + i); + uint8x16_t s1 = vld1q_u8(src1 + i); + + uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1); + + vst1q_u8(dst + i, blend); + + i += 16; + } while (i < w); + + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 8) { + const uint8x8_t m0 = vld1_u8(mask); + do { + uint8x8_t s0 = vld1_u8(src0); + uint8x8_t s1 = vld1_u8(src1); + + uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); + + vst1_u8(dst, blend); + + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 4) { + const uint8x8_t m0 = load_unaligned_dup_u8_4x2(mask); + do { + uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); + uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); + + uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); + + store_u8x4_strided_x2(dst, dst_stride, blend); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 2 && h >= 16) { + const uint8x8_t m0 = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask)); + do { + uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride); + uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride); + + uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, 
s1); + + store_u8x2_strided_x2(dst, dst_stride, blend); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + aom_blend_a64_hmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, w, h); + } +} diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c new file mode 100644 index 0000000000..9aea29992a --- /dev/null +++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c @@ -0,0 +1,112 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/arm/blend_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "config/aom_dsp_rtcd.h" + +void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 2); + assert(w >= 2); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (w > 8) { + do { + uint8x16_t m0 = vdupq_n_u8(mask[0]); + int i = 0; + do { + uint8x16_t s0 = vld1q_u8(src0 + i); + uint8x16_t s1 = vld1q_u8(src1 + i); + + uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1); + + vst1q_u8(dst + i, blend); + + i += 16; + } while (i < w); + + mask += 1; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 8) { + do { + uint8x8_t m0 = vdup_n_u8(mask[0]); + uint8x8_t s0 = vld1_u8(src0); + uint8x8_t s1 = vld1_u8(src1); + + uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); + + vst1_u8(dst, blend); + + mask += 1; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 4) { + do { + const uint16x4_t m0 = vdup_n_u16((uint16_t)mask[0]); + const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[1]); + const uint8x8_t m = vmovn_u16(vcombine_u16(m0, m1)); + uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); + uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); + + uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1); + + store_u8x4_strided_x2(dst, dst_stride, blend); + + mask += 2; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 2 && h >= 16) { + do { + uint16x4_t m0 = vdup_n_u16(0); + m0 = 
vld1_lane_u16((uint16_t *)mask, m0, 0); + uint8x8_t m = + vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0]; + uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride); + uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride); + + uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1); + + store_u8x2_strided_x2(dst, dst_stride, blend); + + mask += 2; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + aom_blend_a64_vmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, w, h); + } +} diff --git a/third_party/aom/av1/common/arm/cdef_block_neon.c b/third_party/aom/av1/common/arm/cdef_block_neon.c new file mode 100644 index 0000000000..53d3a9f1e0 --- /dev/null +++ b/third_party/aom/av1/common/arm/cdef_block_neon.c @@ -0,0 +1,1355 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/cdef_block.h" + +void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + do { + const uint8_t *src_ptr = src; + uint16_t *dst_ptr = dst; + + int w = 0; + while (width - w >= 16) { + uint8x16_t row = vld1q_u8(src_ptr + w); + uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } }; + vst2q_u8((uint8_t *)(dst_ptr + w), row_u16); + + w += 16; + } + if (width - w >= 8) { + uint8x8_t row = vld1_u8(src_ptr + w); + vst1q_u16(dst_ptr + w, vmovl_u8(row)); + w += 8; + } + if (width - w == 4) { + for (int i = w; i < w + 4; i++) { + dst_ptr[i] = src_ptr[i]; + } + } + + src += sstride; + dst += dstride; + } while (--height != 0); +} + +void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int width, int height) { + do { + const uint16_t *src_ptr = src; + uint16_t *dst_ptr = dst; + + int w = 0; + while (width - w >= 8) { + uint16x8_t row = vld1q_u16(src_ptr + w); + vst1q_u16(dst_ptr + w, row); + + w += 8; + } + if (width - w == 4) { + uint16x4_t row = vld1_u16(src_ptr + w); + vst1_u16(dst_ptr + w, row); + } + + src += sstride; + dst += dstride; + } while (--height != 0); +} + +// partial A is a 16-bit vector of the form: +// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: +// [0 y1 y2 y3 y4 y5 y6 y7]. +// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... +// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 +// and const2. +static INLINE uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala, + int16x8_t partialb, + uint32x4_t const1, + uint32x4_t const2) { + // Reverse partial B. + // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }. 
+ uint8x16_t pattern = vreinterpretq_u8_u64( + vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c), + vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504))); + +#if AOM_ARCH_AARCH64 + partialb = + vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern)); +#else + int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)), + vget_high_s8(vreinterpretq_s8_s16(partialb)) } }; + int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern))); + int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern))); + partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi)); +#endif + + // Square and add the corresponding x and y values. + int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala)); + cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb)); + int32x4_t cost_hi = + vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala)); + cost_hi = + vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb)); + + // Multiply by constant. + uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1); + cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2); + return cost; +} + +// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal +// down-right, 6 is vertical). +// +// For each direction the lines are shifted so that we can perform a +// basic sum on each vector element. 
For example, direction 5 is "south by +// southeast", so we need to add the pixels along each line i below: +// +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// For this to fit nicely in vectors, the lines need to be shifted like so: +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// In this configuration we can now perform SIMD additions to get the cost +// along direction 5. Since this won't fit into a single 128-bit vector, we use +// two of them to compute each half of the new configuration, and pad the empty +// spaces with zeros. Similar shifting is done for other directions, except +// direction 6 which is straightforward as it's the vertical direction. +static INLINE uint32x4_t compute_vert_directions_neon(int16x8_t lines[8], + uint32_t cost[4]) { + const int16x8_t zero = vdupq_n_s16(0); + + // Partial sums for lines 0 and 1. + int16x8_t partial4a = vextq_s16(zero, lines[0], 1); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2)); + int16x8_t partial4b = vextq_s16(lines[0], zero, 1); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2)); + int16x8_t tmp = vaddq_s16(lines[0], lines[1]); + int16x8_t partial5a = vextq_s16(zero, tmp, 3); + int16x8_t partial5b = vextq_s16(tmp, zero, 3); + int16x8_t partial7a = vextq_s16(zero, tmp, 6); + int16x8_t partial7b = vextq_s16(tmp, zero, 6); + int16x8_t partial6 = tmp; + + // Partial sums for lines 2 and 3. 
+ partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4)); + tmp = vaddq_s16(lines[2], lines[3]); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5)); + partial6 = vaddq_s16(partial6, tmp); + + // Partial sums for lines 4 and 5. + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6)); + tmp = vaddq_s16(lines[4], lines[5]); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4)); + partial6 = vaddq_s16(partial6, tmp); + + // Partial sums for lines 6 and 7. 
+ partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7)); + partial4a = vaddq_s16(partial4a, lines[7]); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7)); + tmp = vaddq_s16(lines[6], lines[7]); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3)); + partial6 = vaddq_s16(partial6, tmp); + + uint32x4_t const0 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840), + vcreate_u64((uint64_t)210 << 32 | 280))); + uint32x4_t const1 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168), + vcreate_u64((uint64_t)105 << 32 | 120))); + uint32x4_t const2 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420))); + uint32x4_t const3 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140), + vcreate_u64((uint64_t)105 << 32 | 105))); + + // Compute costs in terms of partial sums. + int32x4_t partial6_s32 = + vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6)); + partial6_s32 = + vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6)); + + uint32x4_t costs[4]; + costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1); + costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3); + costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105); + costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3); + + costs[0] = horizontal_add_4d_u32x4(costs); + vst1q_u32(cost, costs[0]); + return costs[0]; +} + +static INLINE uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala, + int16x8_t partialb, + int16x8_t partialc, + uint32x4_t const0) { + // Reverse partial c. + // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }. 
+ uint8x16_t pattern = vreinterpretq_u8_u64( + vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a), + vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302))); + +#if AOM_ARCH_AARCH64 + partialc = + vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern)); +#else + int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)), + vget_high_s8(vreinterpretq_s8_s16(partialc)) } }; + int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern))); + int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern))); + partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi)); +#endif + + int32x4_t partiala_s32 = vpaddlq_s16(partiala); + int32x4_t partialb_s32 = vpaddlq_s16(partialb); + int32x4_t partialc_s32 = vpaddlq_s16(partialc); + + partiala_s32 = vmulq_s32(partiala_s32, partiala_s32); + partialb_s32 = vmulq_s32(partialb_s32, partialb_s32); + partialc_s32 = vmulq_s32(partialc_s32, partialc_s32); + + partiala_s32 = vaddq_s32(partiala_s32, partialc_s32); + + uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105); + cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0); + return cost; +} + +// This function computes the cost along directions 0, 1, 2, 3. (0 means +// 45-degree up-right, 2 is horizontal). +// +// For direction 1 and 3 ("east northeast" and "east southeast") the shifted +// lines need three vectors instead of two. 
For direction 1 for example, we need +// to compute the sums along the line i below: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Which means we need the following configuration: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Three vectors are needed to compute this, as well as some extra pairwise +// additions. +static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8], + uint32_t cost[4]) { + const int16x8_t zero = vdupq_n_s16(0); + + // Compute diagonal directions (0, 1, 3). + // Partial sums for lines 0 and 1. + int16x8_t partial0a = lines[0]; + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7)); + int16x8_t partial0b = vextq_s16(lines[1], zero, 7); + int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6)); + int16x8_t partial1b = vextq_s16(lines[1], zero, 6); + int16x8_t partial3a = vextq_s16(lines[0], zero, 2); + partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4)); + int16x8_t partial3b = vextq_s16(zero, lines[0], 2); + partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4)); + + // Partial sums for lines 2 and 3. 
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5)); + partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4)); + partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2)); + partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4)); + partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2)); + partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6)); + partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6)); + partial3b = vaddq_s16(partial3b, lines[3]); + + // Partial sums for lines 4 and 5. + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3)); + partial1b = vaddq_s16(partial1b, lines[4]); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6)); + int16x8_t partial1c = vextq_s16(lines[5], zero, 6); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2)); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4)); + int16x8_t partial3c = vextq_s16(zero, lines[4], 2); + partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4)); + + // Partial sums for lines 6 and 7. 
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1)); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4)); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2)); + partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4)); + partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2)); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6)); + partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6)); + partial3c = vaddq_s16(partial3c, lines[7]); + + // Special case for direction 2 as it's just a sum along each line. + int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] }; + int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] }; + int32x4_t partial2a = horizontal_add_4d_s16x8(lines03); + int32x4_t partial2b = horizontal_add_4d_s16x8(lines47); + + uint32x4_t partial2a_u32 = + vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a)); + uint32x4_t partial2b_u32 = + vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b)); + + uint32x4_t const0 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840), + vcreate_u64((uint64_t)210 << 32 | 280))); + uint32x4_t const1 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168), + vcreate_u64((uint64_t)105 << 32 | 120))); + uint32x4_t const2 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420), + vcreate_u64((uint64_t)105 << 32 | 140))); + + uint32x4_t costs[4]; + costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1); + costs[1] = + fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2); + costs[2] = vaddq_u32(partial2a_u32, partial2b_u32); + costs[2] = vmulq_n_u32(costs[2], 105); + costs[3] = + fold_mul_and_sum_pairwise_neon(partial3c, partial3b, 
partial3a, const2); + + costs[0] = horizontal_add_4d_u32x4(costs); + vst1q_u32(cost, costs[0]); + return costs[0]; +} + +int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var, + int coeff_shift) { + uint32_t cost[8]; + uint32_t best_cost = 0; + int best_dir = 0; + int16x8_t lines[8]; + for (int i = 0; i < 8; i++) { + uint16x8_t s = vld1q_u16(&img[i * stride]); + lines[i] = vreinterpretq_s16_u16( + vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128))); + } + + // Compute "mostly vertical" directions. + uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4); + + // Compute "mostly horizontal" directions. + uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost); + + // Find max cost as well as its index to get best_dir. + // The max cost needs to be propagated in the whole vector to find its + // position in the original cost vectors cost03 and cost47. + uint32x4_t cost07 = vmaxq_u32(cost03, cost47); +#if AOM_ARCH_AARCH64 + best_cost = vmaxvq_u32(cost07); + uint32x4_t max_cost = vdupq_n_u32(best_cost); + uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)), + vreinterpretq_u8_u32( + vceqq_u32(max_cost, cost47)) } }; + // idx = { 28, 24, 20, 16, 12, 8, 4, 0 }; + uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL)); + // Get the lowest 8 bit of each 32-bit elements and reverse them. 
+ uint8x8_t tbl = vqtbl2_u8(costs, idx); + uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0); + best_dir = aom_clzll(a) >> 3; +#else + uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07)); + cost64 = vpmax_u32(cost64, cost64); + uint32x4_t max_cost = vcombine_u32(cost64, cost64); + best_cost = vget_lane_u32(cost64, 0); + uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)), + vmovn_u32(vceqq_u32(max_cost, cost47))); + uint8x8_t idx = + vand_u8(vmovn_u16(costs), + vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL))); + int sum = horizontal_add_u8x8(idx); + best_dir = get_msb(sum ^ (sum - 1)); +#endif + + // Difference between the optimal variance and the variance along the + // orthogonal direction. Again, the sum(x^2) terms cancel out. + *var = best_cost - cost[(best_dir + 4) & 7]; + // We'd normally divide by 840, but dividing by 1024 is close enough + // for what we're going to do with this. + *var >>= 10; + return best_dir; +} + +void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var_out_1st, + int32_t *var_out_2nd, int coeff_shift, + int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { + // Process first 8x8. + *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); + + // Process second 8x8. 
+ *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); +} + +// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) +static INLINE int16x8_t constrain16(uint16x8_t a, uint16x8_t b, + unsigned int threshold, int adjdamp) { + uint16x8_t diff = vabdq_u16(a, b); + const uint16x8_t a_gt_b = vcgtq_u16(a, b); + const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold), + vshlq_u16(diff, vdupq_n_s16(-adjdamp))); + const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s)); + return vbslq_s16(a_gt_b, clip, vnegq_s16(clip)); +} + +static INLINE void primary_filter(uint16x8_t s, uint16x8_t tap[4], + const int *pri_taps, int pri_strength, + int pri_damping, int16x8_t *sum) { + // Near taps + int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping); + int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping); + // sum += pri_taps[0] * (n0 + n1) + n0 = vaddq_s16(n0, n1); + *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]); + + // Far taps + int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping); + int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping); + // sum += pri_taps[1] * (f0 + f1) + f0 = vaddq_s16(f0, f1); + *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]); +} + +static INLINE void secondary_filter(uint16x8_t s, uint16x8_t tap[8], + const int *sec_taps, int sec_strength, + int sec_damping, int16x8_t *sum) { + // Near taps + int16x8_t s0 = constrain16(tap[0], s, sec_strength, sec_damping); + int16x8_t s1 = constrain16(tap[1], s, sec_strength, sec_damping); + int16x8_t s2 = constrain16(tap[2], s, sec_strength, sec_damping); + int16x8_t s3 = constrain16(tap[3], s, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + s0 = vaddq_s16(s0, s1); + s2 = vaddq_s16(s2, s3); + s0 = vaddq_s16(s0, s2); + *sum = vmlaq_n_s16(*sum, s0, sec_taps[0]); + + // Far taps + s0 = constrain16(tap[4], s, sec_strength, sec_damping); + s1 = constrain16(tap[5], s, sec_strength, sec_damping); + s2 = constrain16(tap[6], 
s, sec_strength, sec_damping); + s3 = constrain16(tap[7], s, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + s0 = vaddq_s16(s0, s1); + s2 = vaddq_s16(s2, s3); + s0 = vaddq_s16(s0, s2); + *sum = vmlaq_n_s16(*sum, s0, sec_taps[1]); +} + +void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + uint16x8_t max, min; + const uint16x8_t cdef_large_value_mask = + vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = vld1q_u16(in + po1); + pri_src[1] = vld1q_u16(in - po1); + + // Primary far taps + pri_src[2] = vld1q_u16(in + po2); + pri_src[3] = vld1q_u16(in - po2); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. 
+ uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]), + vreinterpretq_u8_u16(pri_src[1])); + uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]), + vreinterpretq_u8_u16(pri_src[3])); + pri_max0 = vmaxq_u8(pri_max0, pri_max1); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0), + cdef_large_value_mask)); + + uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]); + pri_min0 = vminq_u16(pri_min0, pri_min1); + min = vminq_u16(min, pri_min0); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // secondary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. 
+ uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]), + vreinterpretq_u8_u16(sec_src[1])); + uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]), + vreinterpretq_u8_u16(sec_src[3])); + uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]), + vreinterpretq_u8_u16(sec_src[5])); + uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]), + vreinterpretq_u8_u16(sec_src[7])); + sec_max0 = vmaxq_u8(sec_max0, sec_max1); + sec_max2 = vmaxq_u8(sec_max2, sec_max3); + sec_max0 = vmaxq_u8(sec_max0, sec_max2); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0), + cdef_large_value_mask)); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); 
+ + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. + uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]), + vreinterpretq_u8_u16(pri_src[1])); + uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]), + vreinterpretq_u8_u16(pri_src[3])); + pri_max0 = vmaxq_u8(pri_max0, pri_max1); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0), + cdef_large_value_mask)); + + uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]); + pri_min1 = vminq_u16(pri_min1, pri_min2); + min = vminq_u16(min, pri_min1); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // secondary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. 
+ uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]), + vreinterpretq_u8_u16(sec_src[1])); + uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]), + vreinterpretq_u8_u16(sec_src[3])); + uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]), + vreinterpretq_u8_u16(sec_src[5])); + uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]), + vreinterpretq_u8_u16(sec_src[7])); + sec_max0 = vmaxq_u8(sec_max0, sec_max1); + sec_max2 = vmaxq_u8(sec_max2, sec_max3); + sec_max0 = vmaxq_u8(sec_max0, sec_max2); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0), + cdef_large_value_mask)); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; + + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + + if (pri_strength) { + pri_damping = AOMMAX(0, 
pri_damping - get_msb(pri_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t tap[4]; + + // Primary near taps + tap[0] = vld1q_u16(in + po1); + tap[1] = vld1q_u16(in - po1); + + // Primary far taps + tap[2] = vld1q_u16(in + po2); + tap[3] = vld1q_u16(in - po2); + + primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { 
+ (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *sec_taps = cdef_sec_taps; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = 
load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +// Pass-through variant: no primary or secondary filter is applied. Each 16-bit +// input sample is narrowed to 8 bits with saturation (vqmovn_u16) and written +// to dest, which is treated as a uint8_t buffer with a row stride of dstride. +// All strength/damping/direction parameters are ignored. +void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + if (block_width == 8) { + // One row of 8 samples per iteration. + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = vld1q_u16(in); + const uint8x8_t res = vqmovn_u16(s); + vst1_u8(dst8, res); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + } else { + // Two rows per iteration (in and dst8 advance by two strides, h drops by + // 2), so block_height must be even for h to reach 0. + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + const uint8x8_t res = vqmovn_u16(s); + store_u8x4_strided_x2(dst8, dstride, res); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + uint16x8_t max, min; + const uint16x8_t cdef_large_value_mask = + vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = 
cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = vld1q_u16(in + po1); + pri_src[1] = vld1q_u16(in - po1); + + // Primary far taps + pri_src[2] = vld1q_u16(in + po2); + pri_src[3] = vld1q_u16(in - po2); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]); + pri_min0 = vminq_u16(pri_min0, pri_min1); + min = vminq_u16(min, pri_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ + pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask); + pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask); + pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask); + pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask); + + uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]); + pri_max0 = vmaxq_u16(pri_max0, pri_max1); + max = vmaxq_u16(max, pri_max0); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ + sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask); + sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask); + sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask); + sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask); + sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask); + sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask); + sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask); + sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask); + + uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]); + sec_max0 = vmaxq_u16(sec_max0, sec_max1); + sec_max2 = vmaxq_u16(sec_max2, sec_max3); + sec_max0 = vmaxq_u16(sec_max0, sec_max2); + max = vmaxq_u16(max, sec_max0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min2 = 
vminq_u16(pri_src[2], pri_src[3]); + pri_min1 = vminq_u16(pri_min1, pri_min2); + min = vminq_u16(min, pri_min1); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ + pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask); + pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask); + pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask); + pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask); + uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]); + pri_max0 = vmaxq_u16(pri_max0, pri_max1); + max = vmaxq_u16(max, pri_max0); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ + sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask); + sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask); + sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask); + sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask); + sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask); + sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask); + sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask); + sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask); + + uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]); + sec_max0 = vmaxq_u16(sec_max0, sec_max1); + sec_max2 = vmaxq_u16(sec_max2, sec_max3); + sec_max0 = vmaxq_u16(sec_max0, sec_max2); + max = vmaxq_u16(max, sec_max0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; + + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + 
uint16x8_t s = vld1q_u16(in); + + uint16x8_t tap[4]; + + // Primary near taps + tap[0] = vld1q_u16(in + po1); + tap[1] = vld1q_u16(in - po1); + + // Primary far taps + tap[2] = vld1q_u16(in + po2); + tap[3] = vld1q_u16(in - po2); + + primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = 
cdef_directions[dir - 2][1]; + const int *sec_taps = cdef_sec_taps; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + 
vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +// Pass-through variant: no primary or secondary filter is applied. The 16-bit +// input samples are copied unchanged to dest, which is treated as a uint16_t +// buffer whose row stride (dstride) is counted in uint16_t elements. +// All strength/damping/direction parameters are ignored. +void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + if (block_width == 8) { + // One row of 8 samples per iteration. + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = vld1q_u16(in); + vst1q_u16(dst16, s); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); + } else { + // Two rows per iteration (in and dst16 advance by two strides, h drops by + // 2), so block_height must be even for h to reach 0. + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + store_u16x4_strided_x2(dst16, dstride, s); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c new file mode 100644 index 0000000000..0871b4fe06 --- /dev/null +++ b/third_party/aom/av1/common/arm/cfl_neon.c @@ -0,0 +1,589 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +// Load 8 values from src + offset, subtract sub from each lane and store the +// result to dst + offset. +static INLINE void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset, + int16x8_t sub) { + vst1q_s16(dst + offset, + vsubq_s16(vreinterpretq_s16_u16(vld1q_u16(src + offset)), sub)); +} + +// Return the lane-wise sum of the 8 values at buf and the 8 values at +// buf + offset. +static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) { + return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset)); +} + +// Load half of a vector (4 bytes) and duplicate it in the other half. +// NOTE(review): ptr is read through a uint32_t pointer; presumably callers +// only pass addresses for which that access is acceptable - confirm. +static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) { + return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr)); +} + +// Store the low half of a vector (32 bits, i.e. the first two u16 lanes). +static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) { + vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u16(val), 0); +} + +// Store the low half of a vector (32 bits, i.e. the first four u8 lanes). +static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) { + vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u8(val), 0); +} + +static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride)); + vsth_u16(pred_buf_q3, vshl_n_u16(sum, 1)); + } else if (width == 8) { + const uint16x4_t top = vpaddl_u8(vld1_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride)); + vst1_u16(pred_buf_q3, vshl_n_u16(sum, 1)); + } else if (width == 16) { + const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); + const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride)); + vst1q_u16(pred_buf_q3, vshlq_n_u16(sum, 1)); + } else { + const uint8x8x4_t top = vld4_u8(input); + const uint8x8x4_t bot = vld4_u8(input + input_stride); + // equivalent to a vpaddlq_u8 (because vld4 interleaves) + const uint16x8_t 
top_0 = vaddl_u8(top.val[0], top.val[1]); + // equivalent to a vpaddlq_u8 (because vld4 interleaves) + const uint16x8_t bot_0 = vaddl_u8(bot.val[0], bot.val[1]); + // equivalent to a vpaddlq_u8 (because vld4 interleaves) + const uint16x8_t top_1 = vaddl_u8(top.val[2], top.val[3]); + // equivalent to a vpaddlq_u8 (because vld4 interleaves) + const uint16x8_t bot_1 = vaddl_u8(bot.val[2], bot.val[3]); + uint16x8x2_t sum; + sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); + sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); + vst2q_u16(pred_buf_q3, sum); + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +// Low-bitdepth luma subsampling for the 422 case: every output value is the +// sum of one horizontal pair of 8-bit input pixels shifted left by 2. One +// output row (CFL_BUF_LINE apart) is produced per input row, for height rows. +// Widths 4, 8 and 16 have dedicated paths; any other width takes the final +// branch, which consumes 32 input pixels per row. +static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); + vsth_u16(pred_buf_q3, vshl_n_u16(top, 2)); + } else if (width == 8) { + const uint16x4_t top = vpaddl_u8(vld1_u8(input)); + vst1_u16(pred_buf_q3, vshl_n_u16(top, 2)); + } else if (width == 16) { + const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); + vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 2)); + } else { + const uint8x8x4_t top = vld4_u8(input); + uint16x8x2_t sum; + // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4 interleaves) + sum.val[0] = vshlq_n_u16(vaddl_u8(top.val[0], top.val[1]), 2); + sum.val[1] = vshlq_n_u16(vaddl_u8(top.val[2], top.val[3]), 2); + vst2q_u16(pred_buf_q3, sum); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3); + vst1_u16(pred_buf_q3, vget_low_u16(top)); + } else if (width == 8) { + const 
uint16x8_t top = vshll_n_u8(vld1_u8(input), 3); + vst1q_u16(pred_buf_q3, top); + } else { + const uint8x16_t top = vld1q_u8(input); + vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(top), 3)); + vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(top), 3)); + if (width == 32) { + const uint8x16_t next_top = vld1q_u8(input + 16); + vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(next_top), 3)); + vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(next_top), 3)); + } + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +#if CONFIG_AV1_HIGHBITDEPTH +#if !AOM_ARCH_AARCH64 +// Local definition of vpaddq_u16 for builds where AOM_ARCH_AARCH64 is not set: +// the low half of the result holds the pairwise sums of adjacent lanes of a, +// the high half those of b. Kept file-local (static) so the symbol is not +// exported from this translation unit. +static INLINE uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { + return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), + vpadd_u16(vget_low_u16(b), vget_high_u16(b))); +} +#endif + +static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + const uint16x4_t bot = vld1_u16(input + input_stride); + const uint16x4_t sum = vadd_u16(top, bot); + const uint16x4_t hsum = vpadd_u16(sum, sum); + vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); + } else if (width < 32) { + const uint16x8_t top = vld1q_u16(input); + const uint16x8_t bot = vld1q_u16(input + input_stride); + const uint16x8_t sum = vaddq_u16(top, bot); + if (width == 8) { + const uint16x4_t hsum = vget_low_u16(vpaddq_u16(sum, sum)); + vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); + } else { + const uint16x8_t top_1 = vld1q_u16(input + 8); + const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride); + const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1); + const uint16x8_t hsum = vpaddq_u16(sum, sum_1); + vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 1)); + } + } else { + const uint16x8x4_t top = vld4q_u16(input); + const uint16x8x4_t bot = vld4q_u16(input + 
input_stride); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t top_0 = vaddq_u16(top.val[0], top.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t bot_0 = vaddq_u16(bot.val[0], bot.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]); + uint16x8x2_t sum; + sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); + sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); + vst2q_u16(pred_buf_q3, sum); + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_422_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + const uint16x4_t hsum = vpadd_u16(top, top); + vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); + } else if (width == 8) { + const uint16x4x2_t top = vld2_u16(input); + // equivalent to a vpadd_u16 (because vld2 interleaves) + const uint16x4_t hsum = vadd_u16(top.val[0], top.val[1]); + vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); + } else if (width == 16) { + const uint16x8x2_t top = vld2q_u16(input); + // equivalent to a vpaddq_u16 (because vld2q interleaves) + const uint16x8_t hsum = vaddq_u16(top.val[0], top.val[1]); + vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 2)); + } else { + const uint16x8x4_t top = vld4q_u16(input); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t hsum_0 = vaddq_u16(top.val[0], top.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t hsum_1 = vaddq_u16(top.val[2], top.val[3]); + uint16x8x2_t result = { { vshlq_n_u16(hsum_0, 2), + vshlq_n_u16(hsum_1, 2) } }; + vst2q_u16(pred_buf_q3, 
result); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + vst1_u16(pred_buf_q3, vshl_n_u16(top, 3)); + } else if (width == 8) { + const uint16x8_t top = vld1q_u16(input); + vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 3)); + } else if (width == 16) { + uint16x8x2_t top = vld2q_u16(input); + top.val[0] = vshlq_n_u16(top.val[0], 3); + top.val[1] = vshlq_n_u16(top.val[1], 3); + vst2q_u16(pred_buf_q3, top); + } else { + uint16x8x4_t top = vld4q_u16(input); + top.val[0] = vshlq_n_u16(top.val[0], 3); + top.val[1] = vshlq_n_u16(top.val[1], 3); + top.val[2] = vshlq_n_u16(top.val[2], 3); + top.val[3] = vshlq_n_u16(top.val[3], 3); + vst4q_u16(pred_buf_q3, top); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +CFL_GET_SUBSAMPLE_FUNCTION(neon) + +static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst, + int width, int height, + int round_offset, + const int num_pel_log2) { + const uint16_t *const end = src + height * CFL_BUF_LINE; + + // Round offset is not needed, because NEON will handle the rounding. + (void)round_offset; + + // To optimize the use of the CPU pipeline, we process 4 rows per iteration + const int step = 4 * CFL_BUF_LINE; + + // At this stage, the prediction buffer contains scaled reconstructed luma + // pixels, which are positive integer and only require 15 bits. By using + // unsigned integer for the sum, we can do one addition operation inside 16 + // bits (8 lanes) before having to convert to 32 bits (4 lanes). + const uint16_t *sum_buf = src; + uint32x4_t sum_32x4 = vdupq_n_u32(0); + do { + // For all widths, we load, add and combine the data so it fits in 4 lanes. 
+ if (width == 4) { + const uint16x4_t a0 = + vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE)); + const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE), + vld1_u16(sum_buf + 3 * CFL_BUF_LINE)); + sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1)); + } else if (width == 8) { + const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE); + const uint16x8_t a1 = + vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE); + sum_32x4 = vpadalq_u16(sum_32x4, a0); + sum_32x4 = vpadalq_u16(sum_32x4, a1); + } else { + const uint16x8_t row0 = vldaddq_u16(sum_buf, 8); + const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8); + const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8); + const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8); + sum_32x4 = vpadalq_u16(sum_32x4, row0); + sum_32x4 = vpadalq_u16(sum_32x4, row1); + sum_32x4 = vpadalq_u16(sum_32x4, row2); + sum_32x4 = vpadalq_u16(sum_32x4, row3); + + if (width == 32) { + const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8); + const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8); + const uint16x8_t row2_1 = + vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8); + const uint16x8_t row3_1 = + vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8); + + sum_32x4 = vpadalq_u16(sum_32x4, row0_1); + sum_32x4 = vpadalq_u16(sum_32x4, row1_1); + sum_32x4 = vpadalq_u16(sum_32x4, row2_1); + sum_32x4 = vpadalq_u16(sum_32x4, row3_1); + } + } + sum_buf += step; + } while (sum_buf < end); + + // Permute and add in such a way that each lane contains the block sum. 
+ // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A] +#if AOM_ARCH_AARCH64 + sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); + sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); +#else + uint32x4_t flip = + vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4)); + sum_32x4 = vaddq_u32(sum_32x4, flip); + sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4)); +#endif + + // Computing the average could be done using scalars, but getting off the NEON + // engine introduces latency, so we use vqrshrn. + int16x4_t avg_16x4; + // Constant propagation makes for some ugly code. + switch (num_pel_log2) { + case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break; + case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break; + case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break; + case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break; + case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break; + case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break; + case 10: + avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10)); + break; + default: assert(0); + } + + if (width == 4) { + do { + vst1_s16(dst, vsub_s16(vreinterpret_s16_u16(vld1_u16(src)), avg_16x4)); + src += CFL_BUF_LINE; + dst += CFL_BUF_LINE; + } while (src < end); + } else { + const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4); + do { + vldsubstq_s16(dst, src, 0, avg_16x8); + vldsubstq_s16(dst, src, CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 3 * CFL_BUF_LINE, avg_16x8); + + if (width > 8) { + vldsubstq_s16(dst, src, 8, avg_16x8); + vldsubstq_s16(dst, src, 8 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 8 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 8 + 3 * CFL_BUF_LINE, avg_16x8); + } + if (width == 32) { + vldsubstq_s16(dst, src, 16, avg_16x8); + vldsubstq_s16(dst, src, 16 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 
16 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 16 + 3 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24, avg_16x8); + vldsubstq_s16(dst, src, 24 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24 + 3 * CFL_BUF_LINE, avg_16x8); + } + src += step; + dst += step; + } while (src < end); + } +} + +CFL_SUB_AVG_FN(neon) + +// Saturating negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +// Notes: +// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in +// practice, as scaled_luma is the multiplication of two absolute values. +// * In the Intel equivalent, elements in a are zeroed out when the +// corresponding elements in b are zero. Because vsign is used twice in a +// row, with b in the first call becoming a in the second call, there's no +// impact from not zeroing out. +static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) { + const int16x4_t mask = vshr_n_s16(b, 15); + return veor_s16(vadd_s16(a, mask), mask); +} + +// Saturating negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +// Notes: +// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in +// practice, as scaled_luma is the multiplication of two absolute values. +// * In the Intel equivalent, elements in a are zeroed out when the +// corresponding elements in b are zero. Because vsignq is used twice in a +// row, with b in the first call becoming a in the second call, there's no +// impact from not zeroing out. 
+static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vshrq_n_s16(b, 15); + return veorq_s16(vaddq_s16(a, mask), mask); +} + +static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3, + int16x4_t alpha_sign, int abs_alpha_q12, + int16x4_t dc) { + const int16x4_t ac_q3 = vld1_s16(pred_buf_q3); + const int16x4_t ac_sign = veor_s16(alpha_sign, ac_q3); + int16x4_t scaled_luma = vqrdmulh_n_s16(vabs_s16(ac_q3), abs_alpha_q12); + return vadd_s16(vsign_s16(scaled_luma, ac_sign), dc); +} + +static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + const int16x8_t ac_q3 = vld1q_s16(pred_buf_q3); + const int16x8_t ac_sign = veorq_s16(alpha_sign, ac_q3); + int16x8_t scaled_luma = vqrdmulhq_n_s16(vabsq_s16(ac_q3), abs_alpha_q12); + return vaddq_s16(vsignq_s16(scaled_luma, ac_sign), dc); +} + +static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + // vld2q_s16 interleaves, which is not useful for prediction. vld1q_s16_x2 + // does not interleave, but is not currently available in the compiler used + // by the AOM build system. + const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3); + const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); + const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); + const int16x8_t scaled_luma_0 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12); + const int16x8_t scaled_luma_1 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12); + int16x8x2_t result; + result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc); + result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc); + return result; +} + +static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + // vld4q_s16 interleaves, which is not useful for prediction. 
vld1q_s16_x4 + // does not interleave, but is not currently available in the compiler used + // by the AOM build system. + const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3); + const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); + const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); + const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]); + const int16x8_t ac_sign_3 = veorq_s16(alpha_sign, ac_q3.val[3]); + const int16x8_t scaled_luma_0 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12); + const int16x8_t scaled_luma_1 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12); + const int16x8_t scaled_luma_2 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[2]), abs_alpha_q12); + const int16x8_t scaled_luma_3 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[3]), abs_alpha_q12); + int16x8x4_t result; + result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc); + result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc); + result.val[2] = vaddq_s16(vsignq_s16(scaled_luma_2, ac_sign_2), dc); + result.val[3] = vaddq_s16(vsignq_s16(scaled_luma_3, ac_sign_3), dc); + return result; +} + +static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + const int16_t abs_alpha_q12 = abs(alpha_q3) << 9; + const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE; + if (width == 4) { + const int16x4_t alpha_sign = vdup_n_s16(alpha_q3); + const int16x4_t dc = vdup_n_s16(*dst); + do { + const int16x4_t pred = + predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vsth_u8(dst, vqmovun_s16(vcombine_s16(pred, pred))); + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } else { + const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3); + const int16x8_t dc = vdupq_n_s16(*dst); + do { + if (width == 8) { + vst1_u8(dst, vqmovun_s16(predict_w8(pred_buf_q3, alpha_sign, + abs_alpha_q12, dc))); + } else if (width == 16) { + const int16x8x2_t 
pred = + predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]), + vqmovun_s16(pred.val[1]) } }; + vst2_u8(dst, predun); + } else { + const int16x8x4_t pred = + predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + const uint8x8x4_t predun = { + { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]), + vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) } + }; + vst4_u8(dst, predun); + } + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } +} + +CFL_PREDICT_FN(neon, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) { + return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0))); +} + +static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) { + return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0))); +} + +static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) { + uint16x8x2_t result; + result.val[0] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); + result.val[1] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0))); + return result; +} + +static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) { + uint16x8x4_t result; + result.val[0] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); + result.val[1] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0))); + result.val[2] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[2], max), vdupq_n_s16(0))); + result.val[3] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[3], max), vdupq_n_s16(0))); + return result; +} + +static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + const int max = (1 << bd) - 1; + const int16_t abs_alpha_q12 = abs(alpha_q3) << 9; + const int16_t *const end = pred_buf_q3 + height * 
CFL_BUF_LINE; + if (width == 4) { + const int16x4_t alpha_sign = vdup_n_s16(alpha_q3); + const int16x4_t dc = vdup_n_s16(*dst); + const int16x4_t max_16x4 = vdup_n_s16(max); + do { + const int16x4_t scaled_luma = + predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst1_u16(dst, clamp_s16(scaled_luma, max_16x4)); + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } else { + const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3); + const int16x8_t dc = vdupq_n_s16(*dst); + const int16x8_t max_16x8 = vdupq_n_s16(max); + do { + if (width == 8) { + const int16x8_t pred = + predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst1q_u16(dst, clampq_s16(pred, max_16x8)); + } else if (width == 16) { + const int16x8x2_t pred = + predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst2q_u16(dst, clamp2q_s16(pred, max_16x8)); + } else { + const int16x8x4_t pred = + predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst4q_u16(dst, clamp4q_s16(pred, max_16x8)); + } + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } +} + +CFL_PREDICT_FN(neon, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon.c b/third_party/aom/av1/common/arm/compound_convolve_neon.c new file mode 100644 index 0000000000..6a596234dc --- /dev/null +++ b/third_party/aom/av1/common/arm/compound_convolve_neon.c @@ -0,0 +1,2719 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/arm/compound_convolve_neon.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t x_filter, + const int16x4_t horiz_const) { + int16x4_t sum = horiz_const; + sum = vmla_lane_s16(sum, s0, x_filter, 0); + sum = vmla_lane_s16(sum, s1, x_filter, 1); + sum = vmla_lane_s16(sum, s2, x_filter, 2); + sum = vmla_lane_s16(sum, s3, x_filter, 3); + + // We halved the convolution filter values so -1 from the right shift. + return vshr_n_s16(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t x_filter, + const int16x8_t horiz_const) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int16x8_t sum = horiz_const; + sum = vmlaq_lane_s16(sum, s0, x_filter_0_3, 0); + sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1); + sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2); + sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3); + sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0); + sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1); + sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2); + sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3); + + // We halved the convolution filter values so -1 from the right shift. 
+ return vshrq_n_s16(sum, ROUND0_BITS - 1); +} + +static INLINE void dist_wtd_convolve_2d_horiz_neon( + const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, + const int16_t *x_filter_ptr, const int im_h, int w) { + const int bd = 8; + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w == 4) { + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src_ptr += 2; + + do { + uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + __builtin_prefetch(dst_ptr); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const); + + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + +#if AOM_ARCH_AARCH64 + do { + const uint8_t *s; + int16_t *d = dst_ptr; + int width = w; + + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src_ptr + 7; + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + __builtin_prefetch(dst_ptr + 4 * dst_stride); + __builtin_prefetch(dst_ptr + 5 * dst_stride); + __builtin_prefetch(dst_ptr + 6 * dst_stride); + __builtin_prefetch(dst_ptr + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = 
vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, + x_filter, horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, + x_filter, horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, + x_filter, horiz_const); + int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, horiz_const); + int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, horiz_const); + int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, horiz_const); + int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, horiz_const); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + height -= 8; + } while (height > 8); +#endif // AOM_ARCH_AARCH64 + + do { + const uint8_t *s; + int16_t *d = dst_ptr; + int width = w; + + uint8x8_t t0 = vld1_u8(src_ptr); + int16x8_t s0 = + vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + s = src_ptr + 8; + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 
a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const); + vst1q_s16(d, d0); + + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, + x_filter_ptr, im_h, w); + + if (clamped_y_taps == 6) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + 
y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } else { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } +} + +static INLINE void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset); + const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS)); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits)); + uint16x4_t d1 = + vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits)); + uint16x4_t d2 = + vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits)); + uint16x4_t d3 = + vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits)); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_dist_wtd_avg_4x4( + dd0, dd1, dd2, dd3, 
d0, d1, d2, d3, fwd_offset, bck_offset, + vreinterpretq_s16_u16(round_offset_vec), &d01, &d23); + + store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + do { + const uint8_t *s = src; + CONV_BUF_TYPE *d = dst; + uint8_t *d_u8 = dst8; + int width = w; + + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits); + uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits); + uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits); + uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, + vreinterpretq_s16_u16(round_offset_vec), + &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void dist_wtd_convolve_2d_copy_avg_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset); + const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS)); + + CONV_BUF_TYPE *dst = conv_params->dst; + const 
int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits)); + uint16x4_t d1 = + vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits)); + uint16x4_t d2 = + vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits)); + uint16x4_t d3 = + vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits)); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + vreinterpretq_s16_u16(round_offset_vec), &d01, + &d23); + + store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + do { + const uint8_t *s = src; + CONV_BUF_TYPE *d = dst; + uint8_t *d_u8 = dst8; + int width = w; + + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits); + uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits); + uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits); + uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + vreinterpretq_s16_u16(round_offset_vec), &d0_u8, + &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void 
dist_wtd_convolve_2d_copy_neon(const uint8_t *src, + int src_stride, int w, int h, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset); + const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS)); + + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits)); + uint16x4_t d1 = + vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits)); + uint16x4_t d2 = + vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits)); + uint16x4_t d3 = + vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits)); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + do { + const uint8_t *s = src; + CONV_BUF_TYPE *d = dst; + int width = w; + + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits); + uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits); + uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits); + uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, + int h, ConvolveParams 
*conv_params) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_copy_dist_wtd_avg_neon( + src, src_stride, dst8, dst8_stride, w, h, conv_params); + } else { + dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w, + h, conv_params); + } + } else { + dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params); + } +} + +static INLINE uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t x_filter, + const int16x4_t round_offset) { + int16x4_t sum = vmul_lane_s16(s0, x_filter, 0); + sum = vmla_lane_s16(sum, s1, x_filter, 1); + sum = vmla_lane_s16(sum, s2, x_filter, 2); + sum = vmla_lane_s16(sum, s3, x_filter, 3); + + // We halved the convolution filter values so -1 from the right shift. + int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpret_u16_s16(res); +} + +static INLINE uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t x_filter, + const int16x8_t round_offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int16x8_t sum = vmulq_lane_s16(s0, x_filter_0_3, 0); + sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1); + sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2); + sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3); + sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0); + sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1); + sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2); + sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3); + + // We halved the convolution filter values so -1 from the right shift. 
+ int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpretq_u16_s16(res); +} + +static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src_ptr += 2; + + do { + uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + __builtin_prefetch(dst_ptr); + __builtin_prefetch(dst8_ptr); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01; + compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, + vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(dst8_ptr, d01); + + src_ptr += src_stride; + dst_ptr += dst_stride; + dst8_ptr += dst8_stride; + } while (--height != 0); + } else { + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + +#if AOM_ARCH_AARCH64 + while (height >= 8) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(d + 4 * dst_stride); + __builtin_prefetch(d + 5 * dst_stride); + __builtin_prefetch(d + 6 * dst_stride); + __builtin_prefetch(d + 7 * dst_stride); + + s += 7; + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, 
x_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, round_offset_vec); + uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, round_offset_vec); + uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, round_offset_vec); + + transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset, + bck_offset, round_offset_vec, &d4_u8, &d5_u8, + &d6_u8, &d7_u8); + + store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8, + d7_u8); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + dst8_ptr += 8 * dst8_stride; + height -= 8; + } +#endif // AOM_ARCH_AARCH64 + + while (height > 0) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + __builtin_prefetch(d); + + s += 8; + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); 
+ + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, + round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + + s0 = s8; + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + dst8_ptr += dst8_stride; + height--; + } + } +} + +static INLINE void dist_wtd_convolve_x_avg_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + // 4-tap filters are used for blocks having width <= 4. 
+ // Filter values are even, so halve to reduce intermediate precision reqs. + const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src_ptr += 2; + + do { + uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + __builtin_prefetch(dst_ptr); + __builtin_prefetch(dst8_ptr); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01; + compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(dst8_ptr, d01); + + src_ptr += src_stride; + dst_ptr += dst_stride; + dst8_ptr += dst8_stride; + } while (--height != 0); + } else { + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + +#if AOM_ARCH_AARCH64 + while (height >= 8) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(d + 4 * dst_stride); + __builtin_prefetch(d + 5 * dst_stride); + __builtin_prefetch(d + 6 * dst_stride); + __builtin_prefetch(d + 7 * dst_stride); + + s += 7; + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, 
x_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, round_offset_vec); + uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, round_offset_vec); + uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, round_offset_vec); + + transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, + round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); + + store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8, + d7_u8); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + dst8_ptr += 8 * dst8_stride; + height -= 8; + } +#endif // AOM_ARCH_AARCH64 + + while (height > 0) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + __builtin_prefetch(d); + + s += 8; + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 
a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + + s0 = s8; + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + dst8_ptr += dst8_stride; + height--; + } + } +} + +static INLINE void dist_wtd_convolve_x_neon( + const uint8_t *src, int src_stride, int w, int h, + const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src_ptr += 2; + + do { + uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + __builtin_prefetch(dst_ptr); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter, + vget_low_s16(round_offset_vec)); + + vst1_u16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + // Filter values are even, so halve to reduce intermediate precision reqs. + const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + +#if AOM_ARCH_AARCH64 + while (height >= 8) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(d + 4 * dst_stride); + __builtin_prefetch(d + 5 * dst_stride); + __builtin_prefetch(d + 6 * dst_stride); + __builtin_prefetch(d + 7 * dst_stride); + + s += 7; + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + 
transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, round_offset_vec); + uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, round_offset_vec); + uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, round_offset_vec); + + transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + height -= 8; + } +#endif // AOM_ARCH_AARCH64 + + while (height > 0) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int width = w; + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + __builtin_prefetch(d); + + s = src_ptr + 8; + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 
a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + + vst1q_u16(d, d0); + + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + height--; + } + } +} + +void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride, + w, h, filter_params_x, subpel_x_qn, + conv_params); + } else { + dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h, + filter_params_x, subpel_x_qn, conv_params); + } + } else { + dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + } +} + +static INLINE uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter, + const int16x4_t round_offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + // Filter values at indices 0 and 7 are 0. 
+ int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1); + sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2); + + // We halved the convolution filter values so -1 from the right shift. + int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpret_u16_s16(res); +} + +static INLINE uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter, + const int16x8_t round_offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + // Filter values at indices 0 and 7 are 0. + int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 1); + sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmlaq_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 2); + + // We halved the convolution filter values so -1 from the right shift. 
+ int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpretq_u16_s16(res); +} + +static INLINE void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon( + const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, + const int dst8_stride, int w, int h, const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int width = w; + + if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s6 = 
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01, &d23); + + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + d_u8 += 4 * dst8_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0 = vld1_u16(d); + + uint8x8_t d01; + compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, + vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(d_u8, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + d_u8 += dst8_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + dst8_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr + (5 * src_stride); + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4; + load_u8_8x5(src_ptr, 
src_stride, &t0, &t1, &t2, &t3, &t4); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + uint16x8_t d1 = + convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec); + uint16x8_t d2 = + convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); + uint16x8_t d3 = + convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); + uint16x8_t d4 = + convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); + uint16x8_t d5 = + convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); + uint16x8_t d6 = + convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); + uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter, + round_offset_vec); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * 
dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset, + bck_offset, round_offset_vec, &d4_u8, &d5_u8, + &d6_u8, &d7_u8); + + store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); + d_u8 += 4 * dst8_stride; + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, + round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE void dist_wtd_convolve_y_6tap_avg_neon( + const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, + const int dst8_stride, int w, int h, const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int width = w; + + if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * 
src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01, &d23); + + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + d_u8 += 4 * dst8_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s5 = 
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0 = vld1_u16(d); + + uint8x8_t d01; + compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(d_u8, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + d_u8 += dst8_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + dst8_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr + (5 * src_stride); + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4; + load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + uint16x8_t d1 = + convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec); + uint16x8_t d2 = + convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); + uint16x8_t d3 = + convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); 
+ uint16x8_t d4 = + convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); + uint16x8_t d5 = + convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); + uint16x8_t d6 = + convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); + uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter, + round_offset_vec); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, + round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); + + store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); + d_u8 += 4 * dst8_stride; + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr, + int src_stride, int w, int h, + const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * 
FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int width = w; + + if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int height = h; + + uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter, + 
vget_low_s16(round_offset_vec)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + + vst1_u16(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr + (5 * src_stride); + CONV_BUF_TYPE *d = dst_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4; + load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + uint16x8_t d1 = + convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec); + uint16x8_t d2 = + convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); + 
uint16x8_t d3 = + convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); + uint16x8_t d4 = + convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); + uint16x8_t d5 = + convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); + uint16x8_t d6 = + convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); + uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter, + round_offset_vec); + + store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + + vst1q_u16(d, d0); + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t y_filter, + const int16x4_t round_offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 0); + sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmla_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmla_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3); + + // We halved the convolution filter values so -1 from the right shift. 
+ int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpret_u16_s16(res); +} + +static INLINE uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t y_filter, + const int16x8_t round_offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 0); + sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlaq_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3); + + // We halved the convolution filter values so -1 from the right shift. + int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpretq_u16_s16(res); +} + +static INLINE void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon( + const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, + const int dst8_stride, int w, int h, const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int width = w; + + if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + 
__builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride); + uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, 
y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + vget_low_s16(round_offset_vec)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + __builtin_prefetch(d_u8 + 0 * dst8_stride); + __builtin_prefetch(d_u8 + 1 * dst8_stride); + __builtin_prefetch(d_u8 + 2 * dst8_stride); + __builtin_prefetch(d_u8 + 3 * dst8_stride); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01, &d23); + + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + d_u8 += 4 * dst8_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + + __builtin_prefetch(d); + + uint16x4_t dd0 = vld1_u16(d); + + uint8x8_t d01; + compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, + vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(d_u8, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + d_u8 += dst8_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + dst8_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); 
+ __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + __builtin_prefetch(s + 4 * src_stride); + __builtin_prefetch(s + 5 * src_stride); + __builtin_prefetch(s + 6 * src_stride); + __builtin_prefetch(s + 7 * src_stride); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, + y_filter, round_offset_vec); + 
uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, + y_filter, round_offset_vec); + uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, + y_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, + y_filter, round_offset_vec); + + __builtin_prefetch(d + 0 * dst8_stride); + __builtin_prefetch(d + 1 * dst8_stride); + __builtin_prefetch(d + 2 * dst8_stride); + __builtin_prefetch(d + 3 * dst8_stride); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset, + bck_offset, round_offset_vec, &d4_u8, &d5_u8, + &d6_u8, &d7_u8); + + store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); + d_u8 += 4 * dst8_stride; + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + __builtin_prefetch(dst_ptr); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + __builtin_prefetch(d); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, + round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr 
+= 8; + dst_ptr += 8; + dst8_ptr += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE void dist_wtd_convolve_y_8tap_avg_neon( + const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, + const int dst8_stride, int w, int h, const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int width = w; + + if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride); + uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + 
__builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + vget_low_s16(round_offset_vec)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + __builtin_prefetch(d_u8 + 0 * dst8_stride); + __builtin_prefetch(d_u8 + 1 * dst8_stride); + __builtin_prefetch(d_u8 + 2 * dst8_stride); + __builtin_prefetch(d_u8 + 3 * dst8_stride); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01, &d23); + + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + d_u8 += 4 * dst8_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s7 = 
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + + __builtin_prefetch(d); + + uint16x4_t dd0 = vld1_u16(d); + + uint8x8_t d01; + compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(d_u8, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + d_u8 += dst8_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + dst8_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + __builtin_prefetch(s + 4 * src_stride); + __builtin_prefetch(s + 5 * src_stride); + __builtin_prefetch(s + 6 * src_stride); + __builtin_prefetch(s + 7 * src_stride); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); 
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, + y_filter, round_offset_vec); + uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, + y_filter, round_offset_vec); + uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, + y_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, + y_filter, round_offset_vec); + + __builtin_prefetch(d + 0 * dst8_stride); + __builtin_prefetch(d + 1 * dst8_stride); + __builtin_prefetch(d + 2 * dst8_stride); + __builtin_prefetch(d + 3 * dst8_stride); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, + round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); + + store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); + d_u8 += 4 * 
dst8_stride; + + /* Slide the seven-row window down past the eight rows just filtered. */ s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + __builtin_prefetch(dst_ptr); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + __builtin_prefetch(d); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + width -= 8; + } while (width != 0); + } +} + /* Vertical 8-tap convolution of 8-bit source rows without averaging. Each output row is produced by convolve8_4_y() or convolve8_8_y() from an eight-row window, y_filter and the rounding offset, and is stored as 16-bit values in conv_params->dst (stride conv_params->dst_stride). Columns are handled 4 at a time when w == 4 or h == 4, otherwise 8 at a time. On AArch64 the inner loop emits 4 rows (narrow path) or 8 rows (wide path) per iteration; on other targets it emits one row per iteration. NOTE(review): the AArch64 wide path subtracts 8 from height until it reaches 0, so it presumably relies on h being a multiple of 8 whenever h != 4 - confirm against callers. */ +static INLINE void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr, + int src_stride, int w, int h, + const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int width = w; + + /* Narrow path: one 4-column strip per pass of the outer loop. */ if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + /* Prime the window: the first seven rows are loaded before the loop. */ uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + 
uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride); + uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + vget_low_s16(round_offset_vec)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s7 = 
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + + vst1_u16(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + __builtin_prefetch(s + 4 * src_stride); + __builtin_prefetch(s + 5 * src_stride); + __builtin_prefetch(s + 6 * src_stride); + __builtin_prefetch(s + 7 * src_stride); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + 
__builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, + y_filter, round_offset_vec); + uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, + y_filter, round_offset_vec); + uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, + y_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, + y_filter, round_offset_vec); + + store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + /* Slide the seven-row window down past the eight rows just filtered. */ s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + __builtin_prefetch(dst_ptr); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + vst1q_u16(d, d0); + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + width -= 8; + } while (width != 0); + } +} + /* Entry point for the vertical-only compound convolution. It fetches the subpel kernel selected by subpel_y_qn, halves the coefficients, backs the source pointer up by (taps / 2 - 1) rows, and dispatches to one of six static kernels. The choice depends on whether get_filter_tap() reports at most 6 taps and on conv_params->do_average and conv_params->use_dist_wtd_comp_avg. The averaging kernels are given dst8 and dst8_stride; the non-averaging kernels are given only conv_params. w and h are asserted to be multiples of 4. */ +void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + // Vertical filter. 
+ const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + // Filter values are even, so downshift by 1 to reduce intermediate + // precision requirements. + const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1); + + /* Back up (taps / 2 - 1) rows so the kernel window starts above src. */ const int vert_offset = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - (vert_offset * src_stride); + + /* The 6-tap kernels receive src_ptr advanced by one row. NOTE(review): presumably because the first tap of such kernels is zero - confirm against the 6-tap kernels. */ if (get_filter_tap(filter_params_y, subpel_y_qn) <= 6) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_y_6tap_dist_wtd_avg_neon( + src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter, + conv_params); + } else { + dist_wtd_convolve_y_6tap_avg_neon(src_ptr + src_stride, src_stride, + dst8, dst8_stride, w, h, y_filter, + conv_params); + } + } else { + dist_wtd_convolve_y_6tap_neon(src_ptr + src_stride, src_stride, w, h, + y_filter, conv_params); + } + } else { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(src_ptr, src_stride, dst8, + dst8_stride, w, h, y_filter, + conv_params); + } else { + dist_wtd_convolve_y_8tap_avg_neon(src_ptr, src_stride, dst8, + dst8_stride, w, h, y_filter, + conv_params); + } + } else { + dist_wtd_convolve_y_8tap_neon(src_ptr, src_stride, w, h, y_filter, + conv_params); + } + } +} diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon.h b/third_party/aom/av1/common/arm/compound_convolve_neon.h new file mode 100644 index 0000000000..d719680a32 --- /dev/null +++ b/third_party/aom/av1/common/arm/compound_convolve_neon.h @@ -0,0 +1,1164 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_ +#define AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_ + +#include /* NOTE(review): the header name after this #include appears to have been lost in extraction; upstream presumably includes the NEON intrinsics header here - confirm. */ + +#include "av1/common/convolve.h" +#include "av1/common/enums.h" +#include "av1/common/filter.h" + /* Weighted blend of one row of four lanes. Computes (dd0 * fwd_offset + d0 * bck_offset) >> DIST_PRECISION_BITS in 32 bits, subtracts round_offset, then applies a saturating rounding shift right by (FILTER_BITS - ROUND0_BITS) while narrowing to unsigned bytes. The four results are in the low lanes of *d0_u8; the high lanes come from zero padding. */ +static INLINE void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0, + const uint16_t fwd_offset, + const uint16_t bck_offset, + const int16x4_t round_offset, + uint8x8_t *d0_u8) { + uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset); + blend0 = vmlal_n_u16(blend0, d0, bck_offset); + + uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS); + + int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset); + + /* Pad to eight lanes so the 128-bit narrowing shift can be used. */ int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0)); + + *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS); +} + /* Unweighted counterpart of compute_dist_wtd_avg_4x1: dd0 and d0 are combined with a halving add (vhadd_u16) instead of the weighted blend; the offset subtraction, zero padding and narrowing are the same. */ +static INLINE void compute_basic_avg_4x1(uint16x4_t dd0, uint16x4_t d0, + const int16x4_t round_offset, + uint8x8_t *d0_u8) { + uint16x4_t avg0 = vhadd_u16(dd0, d0); + + int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset); + + int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0)); + + *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS); +} + /* Eight-lane version of compute_dist_wtd_avg_4x1: the weighted blend is computed separately on the low and high halves in 32 bits and recombined before the offset subtraction and the narrowing to eight unsigned bytes. */ +static INLINE void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0, + const uint16_t fwd_offset, + const uint16_t bck_offset, + const int16x8_t round_offset, + uint8x8_t *d0_u8) { + uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset); + blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset); + uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset); + blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset); + + uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, 
DIST_PRECISION_BITS), + vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS)); + + int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset); + + *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); +} + /* One row of eight lanes: halving add of dd0 and d0 (vhaddq_u16), minus round_offset, then a saturating rounding shift right by (FILTER_BITS - ROUND0_BITS) narrowed to eight unsigned bytes in *d0_u8. */ +static INLINE void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0, + const int16x8_t round_offset, + uint8x8_t *d0_u8) { + uint16x8_t avg0 = vhaddq_u16(dd0, d0); + + int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset); + + *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); +} + /* Four rows of four lanes. Each row N is blended as (ddN * fwd_offset + dN * bck_offset) >> DIST_PRECISION_BITS in 32 bits. Rows 0-1 and rows 2-3 are then packed into one vector each, round_offset is subtracted, and each pair is narrowed to eight unsigned bytes (*d01_u8 holds rows 0 and 1, *d23_u8 holds rows 2 and 3). */ +static INLINE void compute_dist_wtd_avg_4x4( + uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3, + uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3, + const uint16_t fwd_offset, const uint16_t bck_offset, + const int16x8_t round_offset, uint8x8_t *d01_u8, uint8x8_t *d23_u8) { + uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset); + blend0 = vmlal_n_u16(blend0, d0, bck_offset); + uint32x4_t blend1 = vmull_n_u16(dd1, fwd_offset); + blend1 = vmlal_n_u16(blend1, d1, bck_offset); + uint32x4_t blend2 = vmull_n_u16(dd2, fwd_offset); + blend2 = vmlal_n_u16(blend2, d2, bck_offset); + uint32x4_t blend3 = vmull_n_u16(dd3, fwd_offset); + blend3 = vmlal_n_u16(blend3, d3, bck_offset); + + uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS); + uint16x4_t avg1 = vshrn_n_u32(blend1, DIST_PRECISION_BITS); + uint16x4_t avg2 = vshrn_n_u32(blend2, DIST_PRECISION_BITS); + uint16x4_t avg3 = vshrn_n_u32(blend3, DIST_PRECISION_BITS); + + /* Pack two 4-lane rows per 128-bit vector. */ int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1)); + int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3)); + + dst_01 = vsubq_s16(dst_01, round_offset); + dst_23 = vsubq_s16(dst_23, round_offset); + + *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS); + *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS); +} + /* Unweighted counterpart of compute_dist_wtd_avg_4x4: each row is combined with a halving add (vhadd_u16) instead of the weighted blend; packing, offset subtraction and narrowing are the same. */ +static INLINE void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1, + uint16x4_t dd2, uint16x4_t dd3, + uint16x4_t d0, uint16x4_t d1, + uint16x4_t d2, 
uint16x4_t d3, + const int16x8_t round_offset, + uint8x8_t *d01_u8, uint8x8_t *d23_u8) { + uint16x4_t avg0 = vhadd_u16(dd0, d0); + uint16x4_t avg1 = vhadd_u16(dd1, d1); + uint16x4_t avg2 = vhadd_u16(dd2, d2); + uint16x4_t avg3 = vhadd_u16(dd3, d3); + + /* Pack two 4-lane rows per 128-bit vector. */ int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1)); + int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3)); + + dst_01 = vsubq_s16(dst_01, round_offset); + dst_23 = vsubq_s16(dst_23, round_offset); + + *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS); + *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS); +} + /* Four rows of eight lanes. Each row N is blended as (ddN * fwd_offset + dN * bck_offset) >> DIST_PRECISION_BITS, computed on the low and high halves in 32 bits. round_offset is then subtracted and each row is narrowed to eight unsigned bytes with a saturating rounding shift right by (FILTER_BITS - ROUND0_BITS). */ +static INLINE void compute_dist_wtd_avg_8x4( + uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3, + uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, + const uint16_t fwd_offset, const uint16_t bck_offset, + const int16x8_t round_offset, uint8x8_t *d0_u8, uint8x8_t *d1_u8, + uint8x8_t *d2_u8, uint8x8_t *d3_u8) { + uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset); + blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset); + uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset); + blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset); + + uint32x4_t blend1_lo = vmull_n_u16(vget_low_u16(dd1), fwd_offset); + blend1_lo = vmlal_n_u16(blend1_lo, vget_low_u16(d1), bck_offset); + uint32x4_t blend1_hi = vmull_n_u16(vget_high_u16(dd1), fwd_offset); + blend1_hi = vmlal_n_u16(blend1_hi, vget_high_u16(d1), bck_offset); + + uint32x4_t blend2_lo = vmull_n_u16(vget_low_u16(dd2), fwd_offset); + blend2_lo = vmlal_n_u16(blend2_lo, vget_low_u16(d2), bck_offset); + uint32x4_t blend2_hi = vmull_n_u16(vget_high_u16(dd2), fwd_offset); + blend2_hi = vmlal_n_u16(blend2_hi, vget_high_u16(d2), bck_offset); + + uint32x4_t blend3_lo = vmull_n_u16(vget_low_u16(dd3), fwd_offset); + blend3_lo = vmlal_n_u16(blend3_lo, vget_low_u16(d3), bck_offset); + uint32x4_t blend3_hi = vmull_n_u16(vget_high_u16(dd3), fwd_offset); + 
blend3_hi = vmlal_n_u16(blend3_hi, vget_high_u16(d3), bck_offset); + + /* Drop DIST_PRECISION_BITS from each 32-bit half and rejoin the halves into one 16-bit row. */ uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS), + vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS)); + uint16x8_t avg1 = vcombine_u16(vshrn_n_u32(blend1_lo, DIST_PRECISION_BITS), + vshrn_n_u32(blend1_hi, DIST_PRECISION_BITS)); + uint16x8_t avg2 = vcombine_u16(vshrn_n_u32(blend2_lo, DIST_PRECISION_BITS), + vshrn_n_u32(blend2_hi, DIST_PRECISION_BITS)); + uint16x8_t avg3 = vcombine_u16(vshrn_n_u32(blend3_lo, DIST_PRECISION_BITS), + vshrn_n_u32(blend3_hi, DIST_PRECISION_BITS)); + + int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset); + int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset); + int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset); + int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset); + + *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); + *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS); + *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS); + *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS); +} + /* Four rows of eight lanes: each row is the halving add of ddN and dN (vhaddq_u16), minus round_offset, narrowed to eight unsigned bytes with a saturating rounding shift right by (FILTER_BITS - ROUND0_BITS). */ +static INLINE void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1, + uint16x8_t dd2, uint16x8_t dd3, + uint16x8_t d0, uint16x8_t d1, + uint16x8_t d2, uint16x8_t d3, + const int16x8_t round_offset, + uint8x8_t *d0_u8, uint8x8_t *d1_u8, + uint8x8_t *d2_u8, uint8x8_t *d3_u8) { + uint16x8_t avg0 = vhaddq_u16(dd0, d0); + uint16x8_t avg1 = vhaddq_u16(dd1, d1); + uint16x8_t avg2 = vhaddq_u16(dd2, d2); + uint16x8_t avg3 = vhaddq_u16(dd3, d3); + + int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset); + int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset); + int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset); + int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset); + + *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); + *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS); + *d2_u8 = 
vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS); + *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS); +} + /* Six-tap vertical filter on four lanes. Starting from offset_const, accumulates rows s0..s5 multiplied by y_filter taps 1..6 in 32 bits, then returns the sum after a saturating rounding shift right by COMPOUND_ROUND1_BITS narrowed to unsigned 16 bits. Taps 0 and 7 of y_filter are not read. */ +static INLINE uint16x4_t +convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter, const int32x4_t offset_const) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = offset_const; + // Filter values at indices 0 and 7 are 0. + sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + /* Eight-lane version of convolve6_4_2d_v: the low and high halves of each row are accumulated in separate 32-bit sums, each starting from offset_const, and the two narrowed halves are recombined into one 16-bit vector. */ +static INLINE uint16x8_t +convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter, const int32x4_t offset_const) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = offset_const; + // Filter values at indices 0 and 7 are 0. 
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + /* Six-tap vertical pass with distance-weighted averaging. Filters 16-bit rows read from src_ptr with convolve6_4_2d_v() or convolve6_8_2d_v(), blends each result with the 16-bit values already stored in conv_params->dst using conv_params->fwd_offset and conv_params->bck_offset, and writes the 8-bit result to dst8_ptr. w == 4 takes the four-lane path; any other width is processed in 8-column strips. On AArch64 four rows are produced per iteration, otherwise one. h and w are by-value parameters that double as loop counters. NOTE(review): src_ptr presumably holds the output of a horizontal pass - confirm against the caller. */ +static INLINE void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( + int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, + ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + /* Prime the window with the first five rows. */ int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); + src_ptr += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s5, s6, s7, s8; + load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + 
convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x4_t d1 = + convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); + uint16x4_t d2 = + convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); + uint16x4_t d3 = + convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + dst8_ptr += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s5 = vld1_s16(src_ptr); + + uint16x4_t d0 = + convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01_u8; + compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, + vget_low_s16(round_offset_vec), &d01_u8); + + store_u8_4x1(dst8_ptr, d01_u8); + dst8_ptr += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x8_t d1 = + convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); + uint16x8_t d2 = + convolve6_8_2d_v(s2, s3, s4, s5, s6, 
s7, y_filter, offset_const); + uint16x8_t d3 = + convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + /* Slide the five-row window down past the four rows just filtered. */ s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vld1q_s16(s); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, + round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + w -= 8; + } while (w != 0); + } +} + /* Six-tap vertical pass with plain averaging. Filters 16-bit rows read from src_ptr with convolve6_4_2d_v() or convolve6_8_2d_v(), combines each result with the 16-bit values already stored in conv_params->dst through the compute_basic_avg_* helpers, and writes the 8-bit result to dst8_ptr. w == 4 takes the four-lane path; any other width is processed in 8-column strips. On AArch64 four rows are produced per iteration, otherwise one. h and w are by-value parameters that double as loop counters. NOTE(review): src_ptr presumably holds the output of a horizontal pass - confirm against the caller. */ +static INLINE void dist_wtd_convolve_2d_vert_6tap_avg_neon( + int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, + ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + /* Prime the window with the first five rows. */ int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); + src_ptr += 5 * src_stride; + + do { +#if 
AOM_ARCH_AARCH64 + int16x4_t s5, s6, s7, s8; + load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x4_t d1 = + convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); + uint16x4_t d2 = + convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); + uint16x4_t d3 = + convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + dst8_ptr += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s5 = vld1_s16(src_ptr); + + uint16x4_t d0 = + convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01_u8; + compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8); + + store_u8_4x1(dst8_ptr, d01_u8); + dst8_ptr += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x8_t d1 = + convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, 
offset_const); + uint16x8_t d2 = + convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); + uint16x8_t d3 = + convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + /* Slide the five-row window down past the four rows just filtered. */ s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vld1q_s16(s); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + w -= 8; + } while (w != 0); + } +} + /* Six-tap vertical pass without averaging. Filters 16-bit rows read from src_ptr with convolve6_4_2d_v() or convolve6_8_2d_v() and stores the 16-bit results directly into conv_params->dst (stride conv_params->dst_stride); no 8-bit output is written. w == 4 takes the four-lane path; any other width is processed in 8-column strips. On AArch64 four rows are produced per iteration, otherwise one. h and w are by-value parameters that double as loop counters. NOTE(review): src_ptr presumably holds the output of a horizontal pass - confirm against the caller. */ +static INLINE void dist_wtd_convolve_2d_vert_6tap_neon( + int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params, + const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + /* Prime the window with the first five rows. */ int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); + src_ptr += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s5, s6, s7, s8; + load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x4_t d1 = + 
convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); + uint16x4_t d2 = + convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); + uint16x4_t d3 = + convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s5 = vld1_s16(src_ptr); + + uint16x4_t d0 = + convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + + vst1_u16(dst_ptr, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int height = h; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x8_t d1 = + convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); + uint16x8_t d2 = + convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); + uint16x8_t d3 = + convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vld1q_s16(s); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + + vst1q_u16(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static 
INLINE uint16x4_t +convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t y_filter, const int32x4_t offset_const) { /* Eight-tap vertical filter on four lanes. Starting from offset_const, accumulates rows s0..s7 multiplied by y_filter taps 0..7 in 32 bits, then returns the sum after a saturating rounding shift right by COMPOUND_ROUND1_BITS narrowed to unsigned 16 bits. */ + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = offset_const; + sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + /* Eight-lane version of convolve8_4_2d_v: the low and high halves of each row are accumulated in separate 32-bit sums, each starting from offset_const, and the two narrowed halves are recombined into one 16-bit vector. */ +static INLINE uint16x8_t +convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t y_filter, const int32x4_t offset_const) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = offset_const; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = 
vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( + int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, + ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s7, s8, s9, s10; + load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + offset_const); + uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + offset_const); + uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_const); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, 
dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + dst8_ptr += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s7 = vld1_s16(src_ptr); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01_u8; + compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, + vget_low_s16(round_offset_vec), &d01_u8); + + store_u8_4x1(dst8_ptr, d01_u8); + dst8_ptr += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_const); + uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_const); + uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_const); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, 
dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vld1q_s16(s); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, + round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void dist_wtd_convolve_2d_vert_8tap_avg_neon( + int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, + ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s7, s8, s9, s10; + load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + uint16x4_t d1 = 
convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + offset_const); + uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + offset_const); + uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_const); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + dst8_ptr += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s7 = vld1_s16(src_ptr); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01_u8; + compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8); + + store_u8_4x1(dst8_ptr, d01_u8); + dst8_ptr += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_const); + uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, 
offset_const); + uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_const); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vld1q_s16(s); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void dist_wtd_convolve_2d_vert_8tap_neon( + int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params, + const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s7, s8, s9, s10; + load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + uint16x4_t d1 = 
convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + offset_const); + uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + offset_const); + uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_const); + + store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s7 = vld1_s16(src_ptr); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + + vst1_u16(dst_ptr, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_const); + uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_const); + uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_const); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vld1q_s16(s); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + + vst1q_u16(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; 
+ s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +#endif // AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c new file mode 100644 index 0000000000..3aeffbb0e6 --- /dev/null +++ b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c @@ -0,0 +1,675 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom_dsp/arm/mem_neon.h" +#include "av1/common/arm/compound_convolve_neon.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, + const int8x8_t x_filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16_t permute_tbl) { + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t clamped_samples = + vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + + // Accumulate dot product into 'correction' to account for range clamp. + int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, + const int8x8_t x_filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0); + sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); + // Second 4 output values. + sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0); + sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift.
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); +} + +static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod( + const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, + const int16_t *x_filter_ptr, const int im_h, int w) { + const int bd = 8; + const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2)); + // Dot product constants and other shims. + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + // Fold horiz_const into the dot-product filter correction constant. The + // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- + // rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. (The extra -1 is needed because we halved the filter values.) + const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const + + (1 << ((ROUND0_BITS - 1) - 1))); + const uint8x16_t range_limit = vdupq_n_u8(128); + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + + int16x4_t d0 = + convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit, + permute_tbl); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_dist_wtd_convolve_2d_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 
6 : y_filter_taps; + + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + dist_wtd_convolve_2d_horiz_neon_dotprod(src_ptr, src_stride, im_block, + im_stride, x_filter_ptr, im_h, w); + + if (clamped_y_taps == 6) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } else { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } +} + +static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples, + const int8x8_t x_filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16_t permute_tbl) { + // Clamp sample range to [-128, 127] for 8-bit signed dot product. 
+ int8x16_t clamped_samples = + vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + + // Accumulate dot product into 'correction' to account for range clamp. + int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); +} + +static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples, + const int8x8_t x_filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0); + sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); + // Second 4 output values. + sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0); + sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift.
+ int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); + return vreinterpretq_u16_s16(res); +} + +static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + + // Dot-product constants and other shims. + const uint8x16_t range_limit = vdupq_n_u8(128); + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + // Fold round_offset into the dot-product filter correction constant. The + // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- + // rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. (The extra -1 is needed because we halved the filter values.) 
+ int32x4_t correction = + vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void dist_wtd_convolve_x_avg_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + + // Dot-product constants and other shims. 
+ const uint8x16_t range_limit = vdupq_n_u8(128); + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + // Fold round_offset into the dot-product filter correction constant. The + // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- + // rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. (The extra -1 is needed because we halved the filter values.) + int32x4_t correction = + vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void dist_wtd_convolve_x_neon_dotprod( + const uint8_t *src, int src_stride, int w, int h, + const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + + // Dot-product constants and other shims. 
+ const uint8x16_t range_limit = vdupq_n_u8(128); + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + // Fold round_offset into the dot-product filter correction constant. The + // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- + // rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. (The extra -1 is needed because we halved the filter values.) + int32x4_t correction = + vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); + + store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_dist_wtd_convolve_x_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( + src, src_stride, dst8, dst8_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + } else { + dist_wtd_convolve_x_avg_neon_dotprod(src, src_stride, dst8, dst8_stride, + w, h, filter_params_x, subpel_x_qn, + conv_params); + } + } else { + dist_wtd_convolve_x_neon_dotprod(src, src_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + } +} diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c b/third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c new file mode 100644 index 0000000000..a72af9e36a --- /dev/null +++ b/third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "av1/common/arm/compound_convolve_neon.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, + const int8x8_t x_filter, + const uint8x16_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + // First 4 output values. + int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, x_filter, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, + const int8x8_t x_filter, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0); + sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); + // Second 4 output values. + sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], x_filter, 0); + sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. + return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); +} + +static INLINE void dist_wtd_convolve_2d_horiz_neon_i8mm( + const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, + const int16_t *x_filter_ptr, const int im_h, int w) { + const int bd = 8; + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + + int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_dist_wtd_convolve_2d_neon_i8mm( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 
6 : y_filter_taps; + + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + dist_wtd_convolve_2d_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride, + x_filter_ptr, im_h, w); + + if (clamped_y_taps == 6) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } else { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } +} + +static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples, + const int8x8_t x_filter, + const uint8x16_t permute_tbl, + const int32x4_t round_offset) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + // First 4 output values. 
+ int32x4_t sum = vusdotq_lane_s32(round_offset, permuted_samples, x_filter, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); +} + +static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples, + const int8x8_t x_filter, + const uint8x16x3_t permute_tbl, + const int32x4_t round_offset) { + uint8x16_t permuted_samples[3]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_lane_s32(round_offset, permuted_samples[0], x_filter, 0); + sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); + // Second 4 output values. + sum[1] = vusdotq_lane_s32(round_offset, permuted_samples[1], x_filter, 0); + sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. 
+ int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); + return vreinterpretq_u16_s16(res); +} + +static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int32x4_t round_offset_shim = vdupq_n_s32( + (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void dist_wtd_convolve_x_avg_neon_i8mm( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) 
+ const int32x4_t round_offset_shim = vdupq_n_s32( + (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void dist_wtd_convolve_x_neon_i8mm( + const uint8_t *src, int src_stride, int w, int h, + const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int32x4_t round_offset_shim = vdupq_n_s32( + (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); + + // Horizontal filter. 
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim); + + store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_dist_wtd_convolve_x_neon_i8mm( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( + src, src_stride, dst8, dst8_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + } else { + dist_wtd_convolve_x_avg_neon_i8mm(src, src_stride, dst8, dst8_stride, w, + h, filter_params_x, subpel_x_qn, + conv_params); + } + } else { + dist_wtd_convolve_x_neon_i8mm(src, src_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + } +} diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c new file mode 100644 index 0000000000..10442f9bf9 --- /dev/null +++ b/third_party/aom/av1/common/arm/convolve_neon.c @@ -0,0 +1,1659 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/arm/convolve_neon.h" + +static INLINE int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x4_t s8, const int16x4_t s9, + const int16x4_t s10, const int16x4_t s11, + const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11, + const int32x4_t horiz_const) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); + + int32x4_t sum = horiz_const; + sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3); + + return vqrshrn_n_s32(sum, FILTER_BITS); +} + +static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr, + int src_stride, uint8_t *dst_ptr, + const int 
dst_stride, int w, int h, + const int16_t *x_filter_ptr) { + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + + // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right + // shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. + const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); + +#if AOM_ARCH_AARCH64 + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + s += 11; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s14 = 
vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = + convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + x_filter_0_7, x_filter_8_11, horiz_const); + int16x4_t d1 = + convolve12_4_x(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_0_7, x_filter_8_11, horiz_const); + int16x4_t d2 = + convolve12_4_x(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_0_7, x_filter_8_11, horiz_const); + int16x4_t d3 = + convolve12_4_x(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_0_7, x_filter_8_11, horiz_const); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + + store_u8x4_strided_x2(d, dst_stride, d01); + store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4; + d += 4; + width -= 4; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + +#else // !AOM_ARCH_AARCH64 + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t t0 = vld1q_u8(s); + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + int16x8_t tt8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + + int16x4_t s0 = vget_low_s16(tt0); + int16x4_t s4 = vget_high_s16(tt0); + int16x4_t s8 = vget_low_s16(tt8); + int16x4_t s12 = vget_high_s16(tt8); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8 + int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9 + int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10 + int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12 + int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13 + 
int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14 + + int16x4_t d0 = + convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + x_filter_0_7, x_filter_8_11, horiz_const); + + uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0))); + + store_u8_4x1(d, dd0); + + s += 4; + d += 4; + width -= 4; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); +#endif // AOM_ARCH_AARCH64 +} + +static INLINE uint8x8_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t filter, + const int16x4_t horiz_const) { + int16x4_t sum = horiz_const; + sum = vmla_lane_s16(sum, s0, filter, 0); + sum = vmla_lane_s16(sum, s1, filter, 1); + sum = vmla_lane_s16(sum, s2, filter, 2); + sum = vmla_lane_s16(sum, s3, filter, 3); + + // We halved the convolution filter values so - 1 from the right shift. + return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1); +} + +static INLINE uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, + const int16x8_t horiz_const) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = horiz_const; + sum = vmlaq_lane_s16(sum, s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + + // We halved the convolution filter values so - 1 from the right shift. 
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} +/* Horizontal-only single-reference convolution of a w x h block of 8-bit pixels from src to dst. Blocks with w == 2 or h == 2 are delegated to av1_convolve_x_sr_c, and kernels with more than 8 taps to convolve_x_sr_12tap_neon. Otherwise the taps are halved and applied as a 4-tap filter (w <= 4, kernel taps 2..5) or as an 8-tap filter. conv_params is only forwarded to the C fallback. */ +void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + return; + } + + const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; + src -= horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single + // rounding right shift by FILTER_BITS - instead of a first rounding right + // shift by ROUND0_BITS, followed by second rounding right shift by + // FILTER_BITS - ROUND0_BITS. + // The outermost -1 is needed because we will halve the filter values. + const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1)); + + if (w <= 4) { + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs.
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src += 2; + + do { + uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + uint8x8_t d0 = + convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(horiz_const)); + + store_u8_4x1(dst, d0); + + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + // Filter values are even so halve to reduce precision requirements. + const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); +/* AArch64 only: handle 8 rows per iteration. Each 8x8 tile is transposed after loading and the 8x8 result is transposed back before it is stored. */ +#if AOM_ARCH_AARCH64 + while (h >= 8) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int width = w; + const uint8_t *s = src + 7; + uint8_t *d = dst; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(d + 4 * dst_stride); + __builtin_prefetch(d + 5 * dst_stride); + __builtin_prefetch(d + 6 * dst_stride); + __builtin_prefetch(d + 7 * dst_stride); + + do { + uint8x8_t t8, t9, t10, t11, t12, t13, t14; + load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); + + transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11,
&t12, &t13, + &t14); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); + + uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const); + uint8x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + horiz_const); + uint8x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + horiz_const); + uint8x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + horiz_const); + uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, + horiz_const); + uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, horiz_const); + uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, horiz_const); + uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, horiz_const); + + transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } +#endif // AOM_ARCH_AARCH64 +/* Remaining rows, or every row on non-AArch64 builds: one row at a time, 8 output pixels per inner iteration. */ + while (h-- != 0) { + uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + int width = w; + const uint8_t *s = src + 8; + uint8_t *d = dst; + + __builtin_prefetch(d); + + do { + uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 +
int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const); + + vst1_u8(d, d0); + + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } + } +} +/* 6-tap vertical filter for 4 pixels using filter lanes 1..6. Returns the raw 16-bit sums without any shift or narrowing. */ +static INLINE int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter_0_7) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + // Filter values at indices 0 and 7 are 0. + int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1); + sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2); + + return sum; +} +/* 6-tap vertical filter for 8 pixels using filter lanes 1..6; returns the sums narrowed to u8 with a saturating, rounding right shift by FILTER_BITS - 1. */ +static INLINE uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filters) { + const int16x4_t y_filter_lo = vget_low_s16(y_filters); + const int16x4_t y_filter_hi = vget_high_s16(y_filters); + + // Filter values at indices 0 and 7 are 0.
+ int16x8_t sum = vmulq_lane_s16(s0, y_filter_lo, 1); + sum = vmlaq_lane_s16(sum, s1, y_filter_lo, 2); + sum = vmlaq_lane_s16(sum, s2, y_filter_lo, 3); + sum = vmlaq_lane_s16(sum, s3, y_filter_hi, 0); + sum = vmlaq_lane_s16(sum, s4, y_filter_hi, 1); + sum = vmlaq_lane_s16(sum, s5, y_filter_hi, 2); + // We halved the convolution filter values so -1 from the right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} +/* Vertical 6-tap convolution of a w x h block using halved taps in y_filter. Widths up to 4 are filtered as a single 4-pixel column; wider blocks are filtered in 8-pixel column strips. AArch64 builds produce 4 rows per iteration, other builds 1 row. */ +static INLINE void convolve_y_sr_6tap_neon(const uint8_t *src_ptr, + int src_stride, uint8_t *dst_ptr, + const int dst_stride, int w, int h, + const int16x8_t y_filter) { + if (w <= 4) { + uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + + src_ptr += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); + uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); + uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); + uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); + + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); + + int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter); + int16x4_t d1 = convolve6_4_y(s1, s2, s3, s4,
s5, s6, y_filter); + int16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter); + int16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter); + + // We halved the convolution filter values so -1 from the right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr); + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + + int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter); + // We halved the convolution filter values so -1 from the right shift. + uint8x8_t d01 = + vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8_4x1(dst_ptr, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4; + load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t5, t6, t7, t8; + load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8); + + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + +
uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter); + uint8x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter); + uint8x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter); + uint8x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter); + + vst1_u8(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} +/* 8-tap vertical filter for 4 pixels using filter lanes 0..7. Returns the raw 16-bit sums without any shift or narrowing. */ +static INLINE int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); + sum = vmla_lane_s16(sum, s1, filter_lo, 1); + sum = vmla_lane_s16(sum, s2, filter_lo, 2); + sum = vmla_lane_s16(sum, s3, filter_lo, 3); + sum = vmla_lane_s16(sum, s4, filter_hi, 0); + sum = vmla_lane_s16(sum, s5, filter_hi, 1); + sum = vmla_lane_s16(sum, s6, filter_hi, 2); + sum = vmla_lane_s16(sum, s7, filter_hi, 3); + + return sum; +} +/* 8-tap vertical filter for 8 pixels using filter lanes 0..7; returns the sums narrowed to u8 with a saturating, rounding right shift by FILTER_BITS - 1. */ +static INLINE uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1,
filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + + // We halved the convolution filter values so -1 from the right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} +/* Vertical 8-tap convolution of a w x h block using halved taps in y_filter. Widths up to 4 are filtered as a single 4-pixel column; wider blocks are filtered in 8-pixel column strips. AArch64 builds produce 4 rows per iteration, other builds 1 row. */ +static INLINE void convolve_y_sr_8tap_neon(const uint8_t *src_ptr, + int src_stride, uint8_t *dst_ptr, + const int dst_stride, int w, int h, + const int16x8_t y_filter) { + if (w <= 4) { + uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride); + uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 5 * src_stride); + uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 6 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src_ptr += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); + uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); + uint8x8_t t9 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); + uint8x8_t t10 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); + + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); + int16x4_t s8 =
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + int16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + int16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + int16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + // We halved the convolution filter values so -1 from the right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); + + int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + // We halved the convolution filter values so -1 from the right shift.
+ uint8x8_t d01 = + vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8_4x1(dst_ptr, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + + uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + uint8x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + uint8x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + uint8x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + + vst1_u8(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s +=
src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} +/* 12-tap vertical filter for 4 pixels. Taps 5 and 6 are accumulated last with saturating adds; returns the raw 16-bit sums without any shift or narrowing. */ +static INLINE int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x4_t s8, const int16x4_t s9, + const int16x4_t s10, const int16x4_t s11, + const int16x8_t y_filter_0_7, + const int16x4_t y_filter_8_11) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + int16x4_t sum; + + sum = vmul_lane_s16(s0, y_filter_0_3, 0); + sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0); + + sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3); + sum = vmla_lane_s16(sum, s8, y_filter_8_11, 0); + sum = vmla_lane_s16(sum, s9, y_filter_8_11, 1); + sum = vmla_lane_s16(sum, s10, y_filter_8_11, 2); + sum = vmla_lane_s16(sum, s11, y_filter_8_11, 3); + + // Saturating addition is required for the largest filter taps to avoid + // overflow (while staying in 16-bit elements.)
+ sum = vqadd_s16(sum, vmul_lane_s16(s5, y_filter_4_7, 1)); + sum = vqadd_s16(sum, vmul_lane_s16(s6, y_filter_4_7, 2)); + + return sum; +} +/* 12-tap vertical filter for 8 pixels. Taps 5 and 6 are accumulated last with saturating adds; the sums are narrowed to u8 with a saturating, rounding right shift by FILTER_BITS. */ +static INLINE uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t s8, const int16x8_t s9, + const int16x8_t s10, const int16x8_t s11, + const int16x8_t y_filter_0_7, + const int16x4_t y_filter_8_11) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + int16x8_t sum; + + sum = vmulq_lane_s16(s0, y_filter_0_3, 0); + sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0); + + sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3); + sum = vmlaq_lane_s16(sum, s8, y_filter_8_11, 0); + sum = vmlaq_lane_s16(sum, s9, y_filter_8_11, 1); + sum = vmlaq_lane_s16(sum, s10, y_filter_8_11, 2); + sum = vmlaq_lane_s16(sum, s11, y_filter_8_11, 3); + + // Saturating addition is required for the largest filter taps to avoid + // overflow (while staying in 16-bit elements.)
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s5, y_filter_4_7, 1)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s6, y_filter_4_7, 2)); + + return vqrshrun_n_s16(sum, FILTER_BITS); +} +/* Vertical 12-tap convolution of a w x h block; the taps are loaded from y_filter_ptr without halving. Four output rows are produced per iteration on every architecture, so h is consumed in steps of 4. NOTE(review): the w <= 4 path loads rows through load_u8_8x11 and load_u8_8x4 into 8-byte vectors and keeps only the low four lanes; presumably 8 bytes are read per row, so confirm the source buffer tolerates the wider read. */ +static INLINE void convolve_y_sr_12tap_neon(const uint8_t *src_ptr, + int src_stride, uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16_t *y_filter_ptr) { + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + + if (w <= 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10; + load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, + &t8, &t9, &t10); + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10))); + + src_ptr += 11 * src_stride; + + do { + uint8x8_t t11, t12, t13, t14; + load_u8_8x4(src_ptr, src_stride, &t11, &t12, &t13, &t14); + + int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11))); + int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12))); + int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13))); + int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14))); + + int16x4_t d0 = convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, y_filter_0_7, y_filter_8_11); + int16x4_t d1 = convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, s12,
y_filter_0_7, y_filter_8_11); + int16x4_t d2 = convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + s12, s13, y_filter_0_7, y_filter_8_11); + int16x4_t d3 = convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, s14, y_filter_0_7, y_filter_8_11); + + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10; + load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, + &t9, &t10); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + + s += 11 * src_stride; + + do { + uint8x8_t t11, t12, t13, t14; + load_u8_8x4(s, src_stride, &t11, &t12, &t13, &t14); + + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); + + uint8x8_t d0 = convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, +
s10, s11, y_filter_0_7, y_filter_8_11); + uint8x8_t d1 = convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, s12, y_filter_0_7, y_filter_8_11); + uint8x8_t d2 = convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + s12, s13, y_filter_0_7, y_filter_8_11); + uint8x8_t d3 = convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, s14, y_filter_0_7, y_filter_8_11); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} +/* Vertical-only single-reference convolution of a w x h block of 8-bit pixels from src to dst. Blocks with w == 2 or h == 2 are delegated to av1_convolve_y_sr_c. src is moved up by (taps / 2 - 1) rows, where the tap count from get_filter_tap() is clamped to at least 6. Kernels with more than 8 taps use the 12-tap helper; otherwise the taps are halved and the 6-tap or 8-tap helper is selected by tap count. */ +void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + if (w == 2 || h == 2) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, + subpel_y_qn); + return; + } + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int vert_offset = clamped_y_taps / 2 - 1; + + src -= vert_offset * src_stride; + + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (y_filter_taps > 8) { + convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); + return; + } + + // Filter values are even so halve to reduce precision requirements.
+ const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1); + + if (y_filter_taps < 8) { + convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter); + } else { + convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter); + } +} +/* Horizontal pass of the 12-tap 2D filter for 4 pixels: accumulates s0..s11 times the 12 taps onto horiz_const in 32-bit lanes and returns the sums narrowed to 16 bits after a truncating right shift by ROUND0_BITS. */ +static INLINE int16x4_t +convolve12_4_2d_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, + const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, + const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, + const int32x4_t horiz_const) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); + + int32x4_t sum = horiz_const; + sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3); + + return vshrn_n_s32(sum, ROUND0_BITS); +} +/* Horizontal 12-tap pass of the 2D convolution: filters h rows of w 8-bit pixels from src_ptr into the 16-bit buffer dst_ptr. horiz_const combines an offset of 1 << (bd + FILTER_BITS - 1) with the rounding term 1 << (ROUND0_BITS - 1). On AArch64 rows are first handled four at a time until at most four remain; the last rows are handled one at a time. */ +static INLINE void convolve_2d_sr_horiz_12tap_neon( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11) { + const int bd = 8; + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - + // which are generally faster than rounding shifts on modern CPUs.
+ const int32x4_t horiz_const = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); +/* AArch64 only: four rows per iteration using 8x4 transposes. The loop body always runs once and repeats while more than four rows remain. */ +#if AOM_ARCH_AARCH64 + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + s += 11; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = + convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + x_filter_0_7, x_filter_8_11, horiz_const); + int16x4_t d1 = + convolve12_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_0_7, x_filter_8_11, horiz_const); + int16x4_t d2 = + convolve12_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_0_7, x_filter_8_11,
horiz_const); + int16x4_t d3 = + convolve12_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_0_7, x_filter_8_11, horiz_const); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + store_s16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4; + d += 4; + width -= 4; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); +#endif // AOM_ARCH_AARCH64 +/* Remaining rows, or every row on non-AArch64 builds: one row at a time, four outputs per inner iteration. */ + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t t0 = vld1q_u8(s); + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + + int16x4_t s0 = vget_low_s16(tt0); + int16x4_t s4 = vget_high_s16(tt0); + int16x4_t s8 = vget_low_s16(tt1); + int16x4_t s12 = vget_high_s16(tt1); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8 + int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9 + int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10 + int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12 + int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13 + int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14 + + int16x4_t d0 = + convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + x_filter_0_7, x_filter_8_11, horiz_const); + vst1_s16(d, d0); + + s += 4; + d += 4; + width -= 4; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); +} +/* Horizontal pass of the 4-tap 2D filter for 4 pixels with halved taps: accumulates s0..s3 times filter lanes 0..3 onto horiz_const and returns the 16-bit sums after a truncating right shift by ROUND0_BITS - 1. */ +static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t filter, + const int16x4_t horiz_const) { + int16x4_t sum = horiz_const; + sum = vmla_lane_s16(sum, s0,
filter, 0); + sum = vmla_lane_s16(sum, s1, filter, 1); + sum = vmla_lane_s16(sum, s2, filter, 2); + sum = vmla_lane_s16(sum, s3, filter, 3); + + // We halved the convolution filter values so -1 from the right shift. + return vshr_n_s16(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, + const int16x8_t horiz_const) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = horiz_const; + sum = vmlaq_lane_s16(sum, s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + + // We halved the convolution filter values so -1 from the right shift. + return vshrq_n_s16(sum, ROUND0_BITS - 1); +} + +static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride, + int16_t *im_block, int im_stride, + int w, int im_h, + const int16_t *x_filter_ptr) { + const int bd = 8; + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w <= 4) { + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src_ptr += 2; + + do { + uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const); + + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + +#if AOM_ARCH_AARCH64 + while (height > 8) { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7; + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, + x_filter, horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, + x_filter, horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, + x_filter, horiz_const); + int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, horiz_const); + int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, horiz_const); + int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, + 
x_filter, horiz_const); + int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, horiz_const); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + height -= 8; + } +#endif // AOM_ARCH_AARCH64 + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + do { + uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const); + + vst1q_s16(d, d0); + + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, 
subpel_x_qn, + subpel_y_qn, conv_params); + return; + } + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + + convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, im_stride, w, + im_h, x_filter_0_7, x_filter_8_11); + + convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_0_7, y_filter_8_11); + } else { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h, + x_filter_ptr); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + if (clamped_y_taps <= 6) { + convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } else { + convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } + } +} + +void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams 
*conv_params) { + assert(subpel_x_qn == 8); + assert(filter_params_x->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + if (w <= 4) { + do { + uint8x8_t s0_0 = vld1_u8(src); + uint8x8_t s0_1 = vld1_u8(src + 1); + uint8x8_t s1_0 = vld1_u8(src + src_stride); + uint8x8_t s1_1 = vld1_u8(src + src_stride + 1); + + uint8x8_t d0 = vrhadd_u8(s0_0, s0_1); + uint8x8_t d1 = vrhadd_u8(s1_0, s1_1); + + if (w == 2) { + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); + } else { + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); + } + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 8) { + do { + uint8x8_t s0_0 = vld1_u8(src); + uint8x8_t s0_1 = vld1_u8(src + 1); + uint8x8_t s1_0 = vld1_u8(src + src_stride); + uint8x8_t s1_1 = vld1_u8(src + src_stride + 1); + + uint8x8_t d0 = vrhadd_u8(s0_0, s0_1); + uint8x8_t d1 = vrhadd_u8(s1_0, s1_1); + + vst1_u8(dst, d0); + vst1_u8(dst + dst_stride, d1); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + do { + const uint8_t *src_ptr = src; + uint8_t *dst_ptr = dst; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t s1 = vld1q_u8(src_ptr + 1); + + uint8x16_t d0 = vrhaddq_u8(s0, s1); + + vst1q_u8(dst_ptr, d0); + + src_ptr += 16; + dst_ptr += 16; + width -= 16; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } +} + +void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + assert(subpel_y_qn == 8); + assert(filter_params_y->taps == 2); + (void)filter_params_y; + (void)subpel_y_qn; + + if (w <= 4) { + do { + uint8x8_t s0 = load_unaligned_u8_4x1(src); + uint8x8_t s1 = 
load_unaligned_u8_4x1(src + src_stride); + uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride); + + uint8x8_t d0 = vrhadd_u8(s0, s1); + uint8x8_t d1 = vrhadd_u8(s1, s2); + + if (w == 2) { + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); + } else { + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); + } + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 8) { + do { + uint8x8_t s0 = vld1_u8(src); + uint8x8_t s1 = vld1_u8(src + src_stride); + uint8x8_t s2 = vld1_u8(src + 2 * src_stride); + + uint8x8_t d0 = vrhadd_u8(s0, s1); + uint8x8_t d1 = vrhadd_u8(s1, s2); + + vst1_u8(dst, d0); + vst1_u8(dst + dst_stride, d1); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + do { + const uint8_t *src_ptr = src; + uint8_t *dst_ptr = dst; + int height = h; + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t s1 = vld1q_u8(src_ptr + src_stride); + + uint8x16_t d0 = vrhaddq_u8(s0, s1); + + vst1q_u8(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + src += 16; + dst += 16; + w -= 16; + } while (w != 0); + } +} + +void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(subpel_x_qn == 8); + assert(subpel_y_qn == 8); + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + (void)filter_params_x; + (void)subpel_x_qn; + (void)filter_params_y; + (void)subpel_y_qn; + (void)conv_params; + + uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + + 
uint16_t *im = im_block; + + // Horizontal filter. + if (w <= 4) { + do { + uint8x8_t s0 = vld1_u8(src); + uint8x8_t s1 = vld1_u8(src + 1); + + uint16x4_t sum = vget_low_u16(vaddl_u8(s0, s1)); + + // Safe to store the whole vector, the im buffer is big enough. + vst1_u16(im, sum); + + src += src_stride; + im += im_stride; + } while (--im_h != 0); + } else { + do { + const uint8_t *src_ptr = src; + uint16_t *im_ptr = im; + int width = w; + + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + 1); + + uint16x8_t sum = vaddl_u8(s0, s1); + + vst1q_u16(im_ptr, sum); + + src_ptr += 8; + im_ptr += 8; + width -= 8; + } while (width != 0); + src += src_stride; + im += im_stride; + } while (--im_h != 0); + } + + im = im_block; + + // Vertical filter. + if (w <= 4) { + do { + uint16x4_t s0 = vld1_u16(im); + uint16x4_t s1 = vld1_u16(im + im_stride); + uint16x4_t s2 = vld1_u16(im + 2 * im_stride); + + uint16x4_t sum0 = vadd_u16(s0, s1); + uint16x4_t sum1 = vadd_u16(s1, s2); + + uint8x8_t d0 = vqrshrn_n_u16(vcombine_u16(sum0, vdup_n_u16(0)), 2); + uint8x8_t d1 = vqrshrn_n_u16(vcombine_u16(sum1, vdup_n_u16(0)), 2); + + if (w == 2) { + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); + } else { + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); + } + + im += 2 * im_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + do { + uint16_t *im_ptr = im; + uint8_t *dst_ptr = dst; + int height = h; + + do { + uint16x8_t s0 = vld1q_u16(im_ptr); + uint16x8_t s1 = vld1q_u16(im_ptr + im_stride); + + uint16x8_t sum = vaddq_u16(s0, s1); + uint8x8_t d0 = vqrshrn_n_u16(sum, 2); + + vst1_u8(dst_ptr, d0); + + im_ptr += im_stride; + dst_ptr += dst_stride; + } while (--height != 0); + im += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h new file mode 100644 index 
0000000000..9fbf8aa12f --- /dev/null +++ b/third_party/aom/av1/common/arm/convolve_neon.h @@ -0,0 +1,538 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ +#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ + +#include + +#include "config/aom_config.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" + +static INLINE int32x4_t +convolve12_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, + const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3); + + return sum; +} + +static INLINE uint8x8_t +convolve12_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t 
s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, + const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, + const int16x8_t sub_const) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), 
y_filter_8_11, 3); + + int16x8_t res = + vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS)); + res = vsubq_s16(res, sub_const); + + return vqmovun_s16(res); +} + +static INLINE void convolve_2d_sr_vert_12tap_neon( + int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, + int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { + const int bd = 8; + const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); + + if (w <= 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_4x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, + &s8, &s9, &s10); + src_ptr += 11 * src_stride; + + do { + int16x4_t s11, s12, s13, s14; + load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14); + + int32x4_t d0 = convolve12_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, + s10, s11, y_filter_0_7, y_filter_8_11); + int32x4_t d1 = convolve12_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, s12, y_filter_0_7, y_filter_8_11); + int32x4_t d2 = convolve12_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + s12, s13, y_filter_0_7, y_filter_8_11); + int32x4_t d3 = + convolve12_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + y_filter_0_7, y_filter_8_11); + + int16x8_t dd01 = + vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS)); + int16x8_t dd23 = + vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS)); + + dd01 = vsubq_s16(dd01, sub_const); + dd23 = vsubq_s16(dd23, sub_const); + + uint8x8_t d01 = vqmovun_s16(dd01); + uint8x8_t d23 = vqmovun_s16(dd23); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + src_ptr += 4 
* src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + + } else { + do { + int height = h; + int16_t *s = src_ptr; + uint8_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &s10); + s += 11 * src_stride; + + do { + int16x8_t s11, s12, s13, s14; + load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14); + + uint8x8_t d0 = + convolve12_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + y_filter_0_7, y_filter_8_11, sub_const); + uint8x8_t d1 = + convolve12_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + y_filter_0_7, y_filter_8_11, sub_const); + uint8x8_t d2 = + convolve12_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, y_filter_0_7, y_filter_8_11, sub_const); + uint8x8_t d3 = + convolve12_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, y_filter_0_7, y_filter_8_11, sub_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t y_filter) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); + sum = 
vmlal_lane_s16(sum, s7, y_filter_hi, 3); + + return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); +} + +static INLINE uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t y_filter, + const int16x8_t sub_const) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3); + + int16x8_t res = + vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS)); + res = vsubq_s16(res, sub_const); + + return vqmovun_s16(res); +} + +static INLINE void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr, + int src_stride, + uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16x8_t y_filter) { + const int bd = 8; + const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 
1)); + + if (w <= 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s7, s8, s9, s10; + load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + + int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + int16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + int16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + int16x4_t d3 = + convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); + uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s7 = vld1_s16(src_ptr); + int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + uint8x8_t d01 = + vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const)); + + store_u8_4x1(dst_ptr, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + // Width is a multiple of 8 and height is a multiple of 4. 
+ do { + int height = h; + int16_t *s = src_ptr; + uint8_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, sub_const); + uint8x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, sub_const); + uint8x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, sub_const); + uint8x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, sub_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vld1q_s16(s); + uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, sub_const); + vst1_u8(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 1); + sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2); + sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3); + sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0); + sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1); + sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2); + + return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); +} + +static INLINE uint8x8_t 
convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter, + const int16x8_t sub_const) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2); + + int16x8_t res = + vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS)); + res = vsubq_s16(res, sub_const); + + return vqmovun_s16(res); +} + +static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr, + int src_stride, + uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16x8_t y_filter) { + const int bd = 8; + const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); + + if (w <= 4) { + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); + src_ptr += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s5, s6, s7, s8; + load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + + int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter); + int16x4_t d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter); + int16x4_t d2 = 
convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter); + int16x4_t d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter); + + uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); + uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s5 = vld1_s16(src_ptr); + int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter); + uint8x8_t d01 = + vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const)); + + store_u8_4x1(dst_ptr, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + // Width is a multiple of 8 and height is a multiple of 4. 
+ do { + int height = h; + int16_t *s = src_ptr; + uint8_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint8x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const); + uint8x8_t d1 = + convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, sub_const); + uint8x8_t d2 = + convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, sub_const); + uint8x8_t d3 = + convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, sub_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vld1q_s16(s); + uint8x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const); + vst1_u8(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c new file mode 100644 index 0000000000..c29229eb09 --- /dev/null +++ b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c @@ -0,0 +1,793 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/convolve_neon.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, + const int8x16_t filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. 
+ sum = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0); + sum = vdotq_laneq_s32(sum, permuted_samples[1], filter, 1); + sum = vdotq_laneq_s32(sum, permuted_samples[2], filter, 2); + + return vqrshrn_n_s32(sum, FILTER_BITS); +} + +static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2], + const int8x16_t filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples[2], permuted_samples[4]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit)); + clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]); + // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0); + sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1); + sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2); + // Second 4 output values. + sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filter, 0); + sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1); + sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2); + + // Narrow and re-pack. 
+ int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS), + vqrshrn_n_s32(sum[1], FILTER_BITS)); + return vqmovun_s16(sum_s16); +} + +static INLINE void convolve_x_sr_12tap_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter_ptr) { + const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); + const int8x16_t filter = + vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); + + const int32_t correction_s32 = + vaddvq_s32(vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, FILTER_BITS)), + vpaddlq_s16(vshlq_n_s16(filter_8_15, FILTER_BITS)))); + // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right + // shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. + int32x4_t correction = vdupq_n_s32(correction_s32 + (1 << (ROUND0_BITS - 1))); + const uint8x16_t range_limit = vdupq_n_u8(128); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + // Special case the following no-op filter as 128 won't fit into the + // 8-bit signed dot-product instruction: + // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } + if (vgetq_lane_s16(filter_0_7, 5) == 128) { + // Undo the horizontal offset in the calling function. 
+ src += 5; + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x8_t d0 = vld1_u8(s); + if (w == 4) { + store_u8_4x1(d, d0); + } else { + vst1_u8(d, d0); + } + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve12_4_x(s0, filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve12_4_x(s1, filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve12_4_x(s2, filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve12_4_x(s3, filter, correction, range_limit, permute_tbl); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h != 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = + convolve12_8_x(s0, filter, correction, range_limit, permute_tbl); + uint8x8_t d1 = + convolve12_8_x(s1, filter, correction, range_limit, permute_tbl); + uint8x8_t d2 = + convolve12_8_x(s2, filter, correction, range_limit, permute_tbl); + uint8x8_t d3 = + convolve12_8_x(s3, filter, correction, range_limit, permute_tbl); + + store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } + } +} + +static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter, 
+ const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16_t permute_tbl) { + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t clamped_samples = + vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + + // Accumulate dot product into 'correction' to account for range clamp. + int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filter, 0); + + // Packing is performed by the caller. + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); + sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filter, 1); + // Second 4 output values. + sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filter, 0); + sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filter, 1); + + // Narrow and re-pack. 
+ int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1])); + // We halved the convolution filter values so - 1 from the right shift. + return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); +} + +void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + return; + } + + const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; + src -= horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + convolve_x_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + // Dot product constants. + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single + // rounding right shift by FILTER_BITS - instead of a first rounding right + // shift by ROUND0_BITS, followed by second rounding right shift by + // FILTER_BITS - ROUND0_BITS. + // The outermost -1 is needed because we will halve the filter values. + const int32x4_t correction = + vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1))); + const uint8x16_t range_limit = vdupq_n_u8(128); + + if (w <= 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); + + // We halved the convolution filter values so - 1 from the right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = + convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); + uint8x8_t d1 = + convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); + uint8x8_t d2 = + convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); + uint8x8_t d3 = + convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, + const int8x16_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0); + sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1); + sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2); + + // Narrow and re-pack. 
+ return vshrn_n_s32(sum, ROUND0_BITS); +} + +static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], + const int8x16_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples[2], permuted_samples[4]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit)); + clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]); + // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0); + sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1); + sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2); + // Second 4 output values. + sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0); + sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1); + sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2); + + // Narrow and re-pack. 
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS), + vshrn_n_s32(sum[1], ROUND0_BITS)); +} + +static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11) { + const int bd = 8; + + // Special case the following no-op filter as 128 won't fit into the 8-bit + // signed dot-product instruction: + // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } + if (vgetq_lane_s16(x_filter_0_7, 5) == 128) { + const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1))); + // Undo the horizontal offset in the calling function. + src_ptr += 5; + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x8_t s0 = vld1_u8(s); + uint16x8_t d0 = vaddw_u8(horiz_const, s0); + d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS); + // Store 8 elements to avoid additional branches. This is safe if the + // actual block width is < 8 because the intermediate buffer is large + // enough to accommodate 128x128 blocks. + vst1q_s16(d, vreinterpretq_s16_u16(d0)); + + d += 8; + s += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + // Narrow filter values to 8-bit. + const int16x8x2_t x_filter_s16 = { + { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + }; + const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), + vmovn_s16(x_filter_s16.val[1])); + + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Dot product constants. 
+ const int32x4_t correct_tmp = + vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)), + vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7))); + const int32x4_t correction = + vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const); + const uint8x16_t range_limit = vdupq_n_u8(128); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, range_limit, + permute_tbl); + int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, range_limit, + permute_tbl); + int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, range_limit, + permute_tbl); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, + range_limit, permute_tbl); + int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, + range_limit, permute_tbl); + int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, + range_limit, permute_tbl); + int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, + range_limit, permute_tbl); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * 
dst_stride; + h -= 4; + } while (h > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2]; + s0[0] = vld1q_u8(s); + s0[1] = vld1q_u8(s + 4); + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, + range_limit, permute_tbl); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } + } +} + +static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16_t permute_tbl) { + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t clamped_samples = + vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + + // Accumulate dot product into 'correction' to account for range clamp. + int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); + sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filters, 1); + // Second 4 output values. + sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); + sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. + return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); +} + +static INLINE void convolve_2d_sr_horiz_neon_dotprod( + const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, + int im_h, const int16_t *x_filter_ptr) { + const int bd = 8; + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // The outermost -1 is needed because we halved the filter values. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); + // Dot product constants. 
+ const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const); + const uint8x16_t range_limit = vdupq_n_u8(128); + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w <= 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = + convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit, + permute_tbl); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 
6 : y_filter_taps; + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + + convolve_2d_sr_horiz_12tap_neon_dotprod(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_0_7, + x_filter_8_11); + + convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_0_7, y_filter_8_11); + } else { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + convolve_2d_sr_horiz_neon_dotprod(src_ptr, src_stride, im_block, im_stride, + w, im_h, x_filter_ptr); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + if (clamped_y_taps <= 6) { + convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } else { + convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } + } +} diff --git a/third_party/aom/av1/common/arm/convolve_neon_i8mm.c b/third_party/aom/av1/common/arm/convolve_neon_i8mm.c new file mode 100644 index 0000000000..bbcd6f201a --- /dev/null +++ b/third_party/aom/av1/common/arm/convolve_neon_i8mm.c @@ -0,0 +1,702 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/convolve_neon.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, + const int8x16_t filter, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. 
+ sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0); + sum = vusdotq_laneq_s32(sum, permuted_samples[1], filter, 1); + sum = vusdotq_laneq_s32(sum, permuted_samples[2], filter, 2); + + return vqrshrn_n_s32(sum, FILTER_BITS); +} + +static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2], + const int8x16_t filter, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[4]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]); + // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0); + sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1); + sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2); + // Second 4 output values. + sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filter, 0); + sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1); + sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2); + + // Narrow and re-pack. 
+ int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS), + vqrshrn_n_s32(sum[1], FILTER_BITS)); + return vqmovun_s16(sum_s16); +} + +static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src, + int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter_ptr) { + const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); + const int8x16_t filter = + vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); + + // Special case the following no-op filter as 128 won't fit into the + // 8-bit signed dot-product instruction: + // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } + if (vgetq_lane_s16(filter_0_7, 5) == 128) { + // Undo the horizontal offset in the calling function. + src += 5; + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x8_t d0 = vld1_u8(s); + if (w == 4) { + store_u8_4x1(d, d0); + } else { + vst1_u8(d, d0); + } + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. 
+ const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); + + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h != 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const); + + store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } + } +} + +static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter, + const uint8x16_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + // First 4 output values. 
+ int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filter, 0); + + // Packing is performed by the caller. + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filter, 0); + sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filter, 1); + // Second 4 output values. + sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filter, 0); + sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filter, 1); + + int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1])); + // We halved the convolution filter values so - 1 from the right shift. 
+ return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); +} + +void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + return; + } + + const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; + src -= horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single + // rounding right shift by FILTER_BITS - instead of a first rounding right + // shift by ROUND0_BITS, followed by second rounding right shift by + // FILTER_BITS - ROUND0_BITS. + // The outermost -1 is needed because we will halve the filter values. + const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1)); + + if (w <= 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve4_4_x(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve4_4_x(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve4_4_x(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve4_4_x(s3, x_filter, permute_tbl, horiz_const); + + // We halved the convolution filter values so - 1 from the right shift. 
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, + const int8x16_t filters, + const uint8x16x3_t permute_tbl, + int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. 
+ sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); + sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1); + sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2); + + // Narrow and re-pack. + return vshrn_n_s32(sum, ROUND0_BITS); +} + +static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], + const int8x16_t filters, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[4]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]); + // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); + sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1); + sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2); + // Second 4 output values. + sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0); + sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1); + sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2); + + // Narrow and re-pack. 
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS), + vshrn_n_s32(sum[1], ROUND0_BITS)); +} + +static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11) { + const int bd = 8; + + // Special case the following no-op filter as 128 won't fit into the + // 8-bit signed dot-product instruction: + // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } + if (vgetq_lane_s16(x_filter_0_7, 5) == 128) { + const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1))); + // Undo the horizontal offset in the calling function. + src_ptr += 5; + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x8_t s0 = vld1_u8(s); + uint16x8_t d0 = vaddw_u8(horiz_const, s0); + d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS); + // Store 8 elements to avoid additional branches. This is safe if the + // actual block width is < 8 because the intermediate buffer is large + // enough to accommodate 128x128 blocks. + vst1q_s16(d, vreinterpretq_s16_u16(d0)); + + d += 8; + s += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + // Narrow filter values to 8-bit. + const int16x8x2_t x_filter_s16 = { + { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + }; + const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), + vmovn_s16(x_filter_s16.val[1])); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. 
+ const int32x4_t horiz_const = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = + convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = + convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = + convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = + convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = + convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = + convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = + convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2]; + s0[0] = vld1q_u8(s); + s0[1] = vld1q_u8(s + 4); + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + 
vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } + } +} + +static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + // First 4 output values. + int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filters, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0); + sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filters, 1); + // Second 4 output values. + sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0); + sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filters, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. 
+ return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); +} + +static INLINE void convolve_2d_sr_horiz_neon_i8mm( + const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, + int im_h, const int16_t *x_filter_ptr) { + const int bd = 8; + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // The outermost -1 is needed because we halved the filter values. + const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w <= 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce 
intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 
6 : y_filter_taps; + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + + convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_0_7, + x_filter_8_11); + + convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_0_7, y_filter_8_11); + } else { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + convolve_2d_sr_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride, w, + im_h, x_filter_ptr); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + if (clamped_y_taps <= 6) { + convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } else { + convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } + } +} diff --git a/third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c b/third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c new file mode 100644 index 0000000000..fc03a2ee04 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c @@ -0,0 +1,2031 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/arm/highbd_convolve_neon.h" + +#define ROUND_SHIFT 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + +static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr, + int src_stride, uint16_t *dst_ptr, + int dst_stride, int w, int h, + ConvolveParams *conv_params, + const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint16x4_t offset_vec = vdup_n_u16(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint16x4_t avg = vhadd_u16(src, ref); + int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vhaddq_u16(s, r); + int32x4_t 
d0_lo = + vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); + int32x4_t d0_hi = + vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); + + uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT - 2), + vqrshrun_n_s32(d0_hi, ROUND_SHIFT - 2)); + d0 = vminq_u16(d0, max); + vst1q_u16(dst, d0); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, + uint16_t *dst_ptr, int dst_stride, + int w, int h, + ConvolveParams *conv_params, + const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint16x4_t offset_vec = vdup_n_u16(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint16x4_t avg = vhadd_u16(src, ref); + int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vhaddq_u16(s, r); + int32x4_t d0_lo = + vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); + int32x4_t d0_hi = + vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); + + uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT), + vqrshrun_n_s32(d0_hi, ROUND_SHIFT)); + d0 = vminq_u16(d0, max); + vst1q_u16(dst, d0); + + src += 8; + ref += 8; + dst += 8; + 
width -= 8; + } while (width != 0); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_12_dist_wtd_comp_avg_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, ConvolveParams *conv_params, const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint32x4_t offset_vec = vdupq_n_u32(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); + uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); + + // Weighted averaging + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); + wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); + wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); + wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); + wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); + + uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); + wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); + wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); + int32x4_t d1 = 
vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); + + uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT - 2), + vqrshrun_n_s32(d1, ROUND_SHIFT - 2)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst, d01); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_dist_wtd_comp_avg_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, ConvolveParams *conv_params, const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint32x4_t offset_vec = vdupq_n_u32(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); + uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); + + // Weighted averaging + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); + wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); + wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); + wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); + wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, 
offset_vec)); + + uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); + wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); + wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); + int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); + + uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT), + vqrshrun_n_s32(d1, ROUND_SHIFT)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst, d01); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } +} + +static INLINE uint16x4_t highbd_12_convolve6_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t filter, const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); + + return vqshrun_n_s32(sum, ROUND0_BITS + 2); +} + +static INLINE uint16x4_t +highbd_convolve6_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t filter, const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. 
+ const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); + + return vqshrun_n_s32(sum, ROUND0_BITS); +} + +static INLINE uint16x8_t highbd_12_convolve6_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t filter, const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); + + return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), + vqshrun_n_s32(sum1, ROUND0_BITS + 2)); +} + +static INLINE uint16x8_t +highbd_convolve6_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t filter, const int32x4_t offset) { + // Values at indices 0 
and 7 of y_filter are zero. + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); + + return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS), + vqshrun_n_s32(sum1, ROUND0_BITS)); +} + +static INLINE void highbd_12_dist_wtd_convolve_x_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + const int32x4_t offset_vec = vdupq_n_s32(offset); + + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + uint16x8_t d1 = 
highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], x_filter, offset_vec); + uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], x_filter, offset_vec); + uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + +static INLINE void highbd_dist_wtd_convolve_x_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + const int32x4_t offset_vec = vdupq_n_s32(offset); + + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], x_filter, offset_vec); + uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], x_filter, offset_vec); + uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + +static INLINE 
uint16x4_t highbd_12_convolve8_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, + const int32x4_t offset) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); + + return vqshrun_n_s32(sum, ROUND0_BITS + 2); +} + +static INLINE uint16x4_t +highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter, const int32x4_t offset) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); + + return vqshrun_n_s32(sum, ROUND0_BITS); +} + +static INLINE uint16x8_t highbd_12_convolve8_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, + const int32x4_t offset) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = 
vget_high_s16(filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); + + return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), + vqshrun_n_s32(sum1, ROUND0_BITS + 2)); +} + +static INLINE uint16x8_t +highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, const int32x4_t offset) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, 
vget_low_s16(s6), filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); + + return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS), + vqshrun_n_s32(sum1, ROUND0_BITS)); +} + +static INLINE uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4], + const int16x4_t x_filter, + const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter, 1); + sum = vmlal_lane_s16(sum, s[2], x_filter, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter, 3); + + return vqshrun_n_s32(sum, 5); +} + +static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4], + const int16x4_t x_filter, + const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter, 1); + sum = vmlal_lane_s16(sum, s[2], x_filter, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter, 3); + + return vqshrun_n_s32(sum, ROUND0_BITS); +} + +static INLINE void highbd_12_dist_wtd_convolve_x_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + // 4-tap filters are used for blocks having width == 4. 
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 2); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec); + uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec); + uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec); + uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = + highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], + s0[6], s0[7], x_filter, offset_vec); + uint16x8_t d1 = + highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], + s1[6], s1[7], x_filter, offset_vec); + uint16x8_t d2 = + highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], + s2[6], s2[7], x_filter, offset_vec); + uint16x8_t d3 = + highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], + s3[6], s3[7], 
x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void highbd_dist_wtd_convolve_x_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + // 4-tap filters are used for blocks having width == 4. + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 2); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec); + uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec); + uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec); + uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * 
src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = + highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], + s0[7], x_filter, offset_vec); + uint16x8_t d1 = + highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], + s1[7], x_filter, offset_vec); + uint16x8_t d2 = + highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6], + s2[7], x_filter, offset_vec); + uint16x8_t d3 = + highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6], + s3[7], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_highbd_dist_wtd_convolve_x_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + int dst16_stride = conv_params->dst_stride; + const int im_stride = MAX_SB_SIZE; + const int horiz_offset = filter_params_x->taps / 2 - 1; + assert(FILTER_BITS == COMPOUND_ROUND1_BITS); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int offset_avg = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int offset_convolve = (1 << (conv_params->round_0 - 1)) + + (1 << (bd + FILTER_BITS)) + + (1 << (bd + FILTER_BITS - 1)); + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + src -= horiz_offset; + + // horizontal filter + if (bd == 12) { + if (conv_params->do_average) { + if (x_filter_taps <= 6 && w != 4) { + 
highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block, + im_stride, w, h, x_filter_ptr, + offset_convolve); + } else { + highbd_12_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, + w, h, x_filter_ptr, offset_convolve); + } + if (conv_params->use_dist_wtd_comp_avg) { + highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, + w, h, conv_params, offset_avg, bd); + } else { + highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, offset_avg, bd); + } + } else { + if (x_filter_taps <= 6 && w != 4) { + highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16, + dst16_stride, w, h, + x_filter_ptr, offset_convolve); + } else { + highbd_12_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, + w, h, x_filter_ptr, offset_convolve); + } + } + } else { + if (conv_params->do_average) { + if (x_filter_taps <= 6 && w != 4) { + highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block, + im_stride, w, h, x_filter_ptr, + offset_convolve); + } else { + highbd_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, w, + h, x_filter_ptr, offset_convolve); + } + if (conv_params->use_dist_wtd_comp_avg) { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, offset_avg, bd); + } else { + highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, offset_avg, bd); + } + } else { + if (x_filter_taps <= 6 && w != 4) { + highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16, + dst16_stride, w, h, x_filter_ptr, + offset_convolve); + } else { + highbd_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, w, + h, x_filter_ptr, offset_convolve); + } + } + } +} + +static INLINE void highbd_12_dist_wtd_convolve_y_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x8_t y_filter = 
vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x4_t s5, s6, s7, s8; + load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + highbd_12_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); + uint16x4_t d1 = + highbd_12_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); + uint16x4_t d2 = + highbd_12_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); + uint16x4_t d3 = + highbd_12_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + highbd_12_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); + uint16x8_t d1 = + highbd_12_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); + uint16x8_t d2 = + highbd_12_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); + uint16x8_t d3 = + highbd_12_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void highbd_dist_wtd_convolve_y_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const 
int16_t *y_filter_ptr, const int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x4_t s5, s6, s7, s8; + load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + highbd_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); + uint16x4_t d1 = + highbd_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); + uint16x4_t d2 = + highbd_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); + uint16x4_t d3 = + highbd_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + highbd_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); + uint16x8_t d1 = + highbd_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); + uint16x8_t d2 = + highbd_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); + uint16x8_t d3 = + highbd_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void highbd_12_dist_wtd_convolve_y_8tap_neon( + const uint16_t *src_ptr, int src_stride, 
uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = highbd_12_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x4_t d1 = highbd_12_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x4_t d2 = highbd_12_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x4_t d3 = highbd_12_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = highbd_12_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x8_t d1 = highbd_12_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x8_t d2 = highbd_12_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x8_t d3 = highbd_12_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * 
dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} +static INLINE void highbd_dist_wtd_convolve_y_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x4_t d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x4_t d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x4_t d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x8_t d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x8_t d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x8_t d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, 
s9, s10, + y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +void av1_highbd_dist_wtd_convolve_y_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + int dst16_stride = conv_params->dst_stride; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = filter_params_y->taps / 2 - 1; + assert(FILTER_BITS == COMPOUND_ROUND1_BITS); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset_avg = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_offset_conv = (1 << (conv_params->round_0 - 1)) + + (1 << (bd + FILTER_BITS)) + + (1 << (bd + FILTER_BITS - 1)); + + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + src -= vert_offset * src_stride; + + if (bd == 12) { + if (conv_params->do_average) { + if (y_filter_taps <= 6) { + highbd_12_dist_wtd_convolve_y_6tap_neon( + src + src_stride, src_stride, im_block, im_stride, w, h, + y_filter_ptr, round_offset_conv); + } else { + highbd_12_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block, + im_stride, w, h, y_filter_ptr, + round_offset_conv); + } + if (conv_params->use_dist_wtd_comp_avg) { + highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, + w, h, conv_params, round_offset_avg, + bd); + } else { + 
highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, round_offset_avg, bd); + } + } else { + if (y_filter_taps <= 6) { + highbd_12_dist_wtd_convolve_y_6tap_neon( + src + src_stride, src_stride, dst16, dst16_stride, w, h, + y_filter_ptr, round_offset_conv); + } else { + highbd_12_dist_wtd_convolve_y_8tap_neon( + src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, + round_offset_conv); + } + } + } else { + if (conv_params->do_average) { + if (y_filter_taps <= 6) { + highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride, + im_block, im_stride, w, h, + y_filter_ptr, round_offset_conv); + } else { + highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block, + im_stride, w, h, y_filter_ptr, + round_offset_conv); + } + if (conv_params->use_dist_wtd_comp_avg) { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, round_offset_avg, bd); + } else { + highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, round_offset_avg, bd); + } + } else { + if (y_filter_taps <= 6) { + highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride, + dst16, dst16_stride, w, h, + y_filter_ptr, round_offset_conv); + } else { + highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, dst16, + dst16_stride, w, h, y_filter_ptr, + round_offset_conv); + } + } + } +} + +static INLINE void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride, + uint16_t *dst_ptr, int dst_stride, int w, + int h, const int round_bits, + const int offset) { + if (w <= 4) { + const int16x4_t round_shift_s16 = vdup_n_s16(round_bits); + const uint16x4_t offset_u16 = vdup_n_u16(offset); + + for (int y = 0; y < h; ++y) { + const uint16x4_t s = vld1_u16(src_ptr + y * src_stride); + uint16x4_t d = vshl_u16(s, round_shift_s16); + d = vadd_u16(d, offset_u16); + if (w == 2) { + store_u16_2x1(dst_ptr + y * dst_stride, d); + } else { + vst1_u16(dst_ptr + y * dst_stride, d); + } + } + } else { + const 
int16x8_t round_shift_s16 = vdupq_n_s16(round_bits); + const uint16x8_t offset_u16 = vdupq_n_u16(offset); + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; x += 8) { + const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x); + uint16x8_t d = vshlq_u16(s, round_shift_s16); + d = vaddq_u16(d, offset_u16); + vst1q_u16(dst_ptr + y * dst_stride + x, d); + } + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src, + int src_stride, uint16_t *dst, + int dst_stride, int w, int h, + ConvolveParams *conv_params, + int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + + const int im_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + + if (conv_params->do_average) { + highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits, + round_offset); + } else { + highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits, + round_offset); + } + + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + if (bd == 12) { + highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, + w, h, conv_params, round_offset, bd); + } else { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, round_offset, bd); + } + } else { + if (bd == 12) { + highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, round_offset, bd); + } else { + highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, round_offset, bd); + } + } + } +} + +static INLINE uint16x4_t highbd_convolve6_4_2d_v( + const int16x4_t s0, 
const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter, const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve6_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter, const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + 
vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_2d_vert_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x4_t s5, s6, s7, s8; + load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); + uint16x4_t d1 = + highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); + uint16x4_t d2 = + highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); + uint16x4_t d3 = + highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, + y_filter, offset_vec); + uint16x8_t d1 = highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, + y_filter, offset_vec); + uint16x8_t d2 = highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x8_t d3 = highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * 
src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve8_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, + const int32x4_t offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve8_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, + const int32x4_t offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), 
y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w <= 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x4_t d2 = highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x4_t d3 = highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, 
src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + // The smallest block height is 4, and the horizontal convolution needs to + // process an extra (filter_taps/2 - 1) lines for the vertical convolution. 
+ assert(h >= 5); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], x_filter, offset_vec); + uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], x_filter, offset_vec); + uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6]; + load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); + + uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); +} + +static INLINE void highbd_dist_wtd_convolve_2d_horiz_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + // The smallest block height is 4, 
and the horizontal convolution needs to + // process an extra (filter_taps/2 - 1) lines for the vertical convolution. + assert(h >= 5); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], x_filter, offset_vec); + uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], x_filter, offset_vec); + uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6]; + load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); + + uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); +} + +static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + 
int w, int h, const int16_t *x_filter_ptr, const int offset) { + // The smallest block height is 4, and the horizontal convolution needs to + // process an extra (filter_taps/2 - 1) lines for the vertical convolution. + assert(h >= 5); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + // 4-tap filters are used for blocks having width == 4. + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 1); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec); + uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec); + uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec); + uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + int16x4_t s0[4]; + load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec); + vst1_u16(d, d0); + + s += src_stride; + d += dst_stride; + } while (--h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], 
&s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = + highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], + s0[6], s0[7], x_filter, offset_vec); + uint16x8_t d1 = + highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], + s1[6], s1[7], x_filter, offset_vec); + uint16x8_t d2 = + highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], + s2[6], s2[7], x_filter, offset_vec); + uint16x8_t d3 = + highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], + s3[6], s3[7], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + + uint16x8_t d0 = + highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], + s0[6], s0[7], x_filter, offset_vec); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +static INLINE void highbd_dist_wtd_convolve_2d_horiz_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + // The smallest block height is 4, and the horizontal convolution needs to + // process an extra (filter_taps/2 - 1) lines for the vertical convolution. + assert(h >= 5); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + // 4-tap filters are used for blocks having width == 4. 
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 1); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec); + uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec); + uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec); + uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + int16x4_t s0[4]; + load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + + uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec); + vst1_u16(d, d0); + + s += src_stride; + d += dst_stride; + } while (--h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = + highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], + s0[7], x_filter, offset_vec); + uint16x8_t d1 = + highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], + s1[7], x_filter, offset_vec); + uint16x8_t d2 
= + highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6], + s2[7], x_filter, offset_vec); + uint16x8_t d3 = + highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6], + s3[7], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + + uint16x8_t d0 = + highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], + s0[7], x_filter, offset_vec); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +}
+
+// Two-pass (horizontal, then vertical) high-bitdepth compound convolution.
+// Pass 1 filters im_h = h + clamped_y_taps - 1 source rows into im_block.
+// Pass 2 filters im_block vertically into conv_params->dst, or - when
+// conv_params->do_average is set - into im_block2, which is then handed with
+// dst to the comp-avg helper selected by use_dist_wtd_comp_avg and bd == 12.
+void av1_highbd_dist_wtd_convolve_2d_neon(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  DECLARE_ALIGNED(16, uint16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t,
+                  im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+  const int im_stride = MAX_SB_SIZE;
+
+  // Tap counts below 6 are treated as 6 when sizing the intermediate block
+  // and when locating the first source sample.
+  const int x_taps = get_filter_tap(filter_params_x, subpel_x_qn);
+  const int y_taps = get_filter_tap(filter_params_y, subpel_y_qn);
+  const int x_taps_min6 = x_taps > 6 ? x_taps : 6;
+  const int y_taps_min6 = y_taps > 6 ? y_taps : 6;
+
+  const int im_h = h + y_taps_min6 - 1;
+  const int vert_offset = y_taps_min6 / 2 - 1;
+  const int horiz_offset = x_taps_min6 / 2 - 1;
+
+  const int round_0 = conv_params->round_0;
+  const int round_1 = conv_params->round_1;
+  // The extra shim of (1 << (round_0 - 1)) allows us to use a faster
+  // non-rounding non-saturating left shift.
+  const int offset_conv_x =
+      (1 << (bd + FILTER_BITS - 1)) + (1 << (round_0 - 1));
+  const int y_offset_bits = bd + 2 * FILTER_BITS - round_0;
+  const int offset_conv_y = 1 << y_offset_bits;
+  const int offset_avg = (1 << (y_offset_bits - round_1)) +
+                         (1 << (y_offset_bits - round_1 - 1));
+
+  const uint16_t *src_start = src - vert_offset * src_stride - horiz_offset;
+
+  const int16_t *x_kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  const int16_t *y_kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  // Horizontal pass. The 6-tap kernels are only used for w != 4; every other
+  // case goes through the general horizontal kernels.
+  const int horiz_6tap = (x_taps <= 6) && (w != 4);
+  if (horiz_6tap) {
+    if (bd == 12) {
+      highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
+          src_start, src_stride, im_block, im_stride, w, im_h, x_kernel,
+          offset_conv_x);
+    } else {
+      highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
+          src_start, src_stride, im_block, im_stride, w, im_h, x_kernel,
+          offset_conv_x);
+    }
+  } else if (bd == 12) {
+    highbd_12_dist_wtd_convolve_2d_horiz_neon(src_start, src_stride, im_block,
+                                              im_stride, w, im_h, x_kernel,
+                                              offset_conv_x);
+  } else {
+    highbd_dist_wtd_convolve_2d_horiz_neon(src_start, src_stride, im_block,
+                                           im_stride, w, im_h, x_kernel,
+                                           offset_conv_x);
+  }
+
+  // Vertical pass. With averaging the result is staged in im_block2,
+  // otherwise it is written directly to the compound buffer.
+  const int averaging = conv_params->do_average;
+  uint16_t *vert_dst = averaging ? im_block2 : conv_params->dst;
+  const int vert_dst_stride = averaging ? im_stride : conv_params->dst_stride;
+  if (y_taps <= 6) {
+    highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, vert_dst,
+                                               vert_dst_stride, w, h,
+                                               y_kernel, offset_conv_y);
+  } else {
+    highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, vert_dst,
+                                               vert_dst_stride, w, h,
+                                               y_kernel, offset_conv_y);
+  }
+
+  // Do the compound averaging outside the filter loops to avoid branching
+  // within them.
+  if (!averaging) {
+    return;
+  }
+  if (conv_params->use_dist_wtd_comp_avg) {
+    if (bd == 12) {
+      highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride,
+                                       w, h, conv_params, offset_avg, bd);
+    } else {
+      highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
+                                    h, conv_params, offset_avg, bd);
+    }
+  } else if (bd == 12) {
+    highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
+                            conv_params, offset_avg, bd);
+  } else {
+    highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
+                         conv_params, offset_avg, bd);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c b/third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c new file mode 100644 index 0000000000..4f1c25d122 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/arm/highbd_convolve_neon.h" + +#define UPSCALE_NORMATIVE_TAPS 8 + +void av1_highbd_convolve_horiz_rs_neon(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filters, + int x0_qn, int x_step_qn, int bd) { + const int horiz_offset = UPSCALE_NORMATIVE_TAPS / 2 - 1; + + static const int32_t kIdx[4] = { 0, 1, 2, 3 }; + const int32x4_t idx = vld1q_s32(kIdx); + const int32x4_t subpel_mask = vdupq_n_s32(RS_SCALE_SUBPEL_MASK); + const int32x4_t shift_s32 = vdupq_n_s32(-FILTER_BITS); + const int32x4_t offset_s32 = vdupq_n_s32(0); + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + + const uint16_t *src_ptr = src - horiz_offset; + uint16_t *dst_ptr = dst; + + if (w <= 4) { + int height = h; + uint16_t *d = dst_ptr; + + do { + int x_qn = x0_qn; + + // Load 4 src vectors at a time, they might be the same, but we have to + // calculate the indices anyway. Doing it in SIMD and then storing the + // indices is faster than having to calculate the expression + // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times + // Ideally this should be a gather using the indices, but NEON does not + // have that, so have to emulate + const int32x4_t xqn_idx = vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn); + // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) = + // 2 + const int32x4_t src_idx = + vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1); + // Similarly for the filter vector indices, we calculate the filter + // indices for 4 columns. 
First we calculate the indices: + // x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS + // Then we calculate the actual pointers, multiplying with + // UPSCALE_UPSCALE_NORMATIVE_TAPS + // again shift left by 1 + const int32x4_t x_filter4_idx = vshlq_n_s32( + vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), 1); + // Even though pointers are unsigned 32/64-bit ints we do signed + // addition The reason for this is that x_qn can be negative, leading to + // negative offsets. Argon test + // profile0_core/streams/test10573_11003.obu was failing because of + // this. +#if AOM_ARCH_AARCH64 + uint64x2_t tmp4[2]; + tmp4[0] = vreinterpretq_u64_s64(vaddw_s32( + vdupq_n_s64((const int64_t)src_ptr), vget_low_s32(src_idx))); + tmp4[1] = vreinterpretq_u64_s64(vaddw_s32( + vdupq_n_s64((const int64_t)src_ptr), vget_high_s32(src_idx))); + int16_t *src4_ptr[4]; + uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; + vst1q_u64(tmp_ptr, tmp4[0]); + vst1q_u64(tmp_ptr + 2, tmp4[1]); + + // filter vectors + tmp4[0] = vreinterpretq_u64_s64(vmlal_s32( + vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx), + vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); + tmp4[1] = vreinterpretq_u64_s64(vmlal_s32( + vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx), + vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); + + const int16_t *x_filter4_ptr[4]; + tmp_ptr = (uint64_t *)&x_filter4_ptr; + vst1q_u64(tmp_ptr, tmp4[0]); + vst1q_u64(tmp_ptr + 2, tmp4[1]); +#else + uint32x4_t tmp4; + tmp4 = vreinterpretq_u32_s32( + vaddq_s32(vdupq_n_s32((const int32_t)src_ptr), src_idx)); + int16_t *src4_ptr[4]; + uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; + vst1q_u32(tmp_ptr, tmp4); + + // filter vectors + tmp4 = vreinterpretq_u32_s32( + vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx, + vdupq_n_s32(UPSCALE_NORMATIVE_TAPS))); + + const int16_t *x_filter4_ptr[4]; + tmp_ptr = (uint32_t *)&x_filter4_ptr; + vst1q_u32(tmp_ptr, tmp4); +#endif // AOM_ARCH_AARCH64 + // Load source + 
int16x8_t s0 = vld1q_s16(src4_ptr[0]); + int16x8_t s1 = vld1q_s16(src4_ptr[1]); + int16x8_t s2 = vld1q_s16(src4_ptr[2]); + int16x8_t s3 = vld1q_s16(src4_ptr[3]); + + // Actually load the filters + const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); + const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); + const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); + const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); + + // Group low and high parts and transpose + int16x4_t filters_lo[] = { vget_low_s16(x_filter0), + vget_low_s16(x_filter1), + vget_low_s16(x_filter2), + vget_low_s16(x_filter3) }; + int16x4_t filters_hi[] = { vget_high_s16(x_filter0), + vget_high_s16(x_filter1), + vget_high_s16(x_filter2), + vget_high_s16(x_filter3) }; + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); + + // Run the 2D Scale convolution + uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( + s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); + + d0 = vmin_u16(d0, max); + + if (w == 2) { + store_u16_2x1(d, d0); + } else { + vst1_u16(d, d0); + } + + src_ptr += src_stride; + d += dst_stride; + height--; + } while (height > 0); + } else { + int height = h; + + do { + int width = w; + int x_qn = x0_qn; + uint16_t *d = dst_ptr; + const uint16_t *s = src_ptr; + + do { + // Load 4 src vectors at a time, they might be the same, but we have to + // calculate the indices anyway. 
Doing it in SIMD and then storing the + // indices is faster than having to calculate the expression + // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times + // Ideally this should be a gather using the indices, but NEON does not + // have that, so have to emulate + const int32x4_t xqn_idx = + vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn); + // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) + // = 2 + const int32x4_t src_idx = + vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1); + + // Similarly for the filter vector indices, we calculate the filter + // indices for 4 columns. First we calculate the indices: + // x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS + // Then we calculate the actual pointers, multiplying with + // UPSCALE_UPSCALE_NORMATIVE_TAPS + // again shift left by 1 + const int32x4_t x_filter4_idx = vshlq_n_s32( + vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), + 1); + // Even though pointers are unsigned 32/64-bit ints we do signed + // addition The reason for this is that x_qn can be negative, leading to + // negative offsets. Argon test + // profile0_core/streams/test10573_11003.obu was failing because of + // this. 
+#if AOM_ARCH_AARCH64 + uint64x2_t tmp4[2]; + tmp4[0] = vreinterpretq_u64_s64( + vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx))); + tmp4[1] = vreinterpretq_u64_s64( + vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx))); + int16_t *src4_ptr[4]; + uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; + vst1q_u64(tmp_ptr, tmp4[0]); + vst1q_u64(tmp_ptr + 2, tmp4[1]); + + // filter vectors + tmp4[0] = vreinterpretq_u64_s64(vmlal_s32( + vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx), + vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); + tmp4[1] = vreinterpretq_u64_s64(vmlal_s32( + vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx), + vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); + + const int16_t *x_filter4_ptr[4]; + tmp_ptr = (uint64_t *)&x_filter4_ptr; + vst1q_u64(tmp_ptr, tmp4[0]); + vst1q_u64(tmp_ptr + 2, tmp4[1]); +#else + uint32x4_t tmp4; + tmp4 = vreinterpretq_u32_s32( + vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx)); + int16_t *src4_ptr[4]; + uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; + vst1q_u32(tmp_ptr, tmp4); + + // filter vectors + tmp4 = vreinterpretq_u32_s32( + vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx, + vdupq_n_s32(UPSCALE_NORMATIVE_TAPS))); + + const int16_t *x_filter4_ptr[4]; + tmp_ptr = (uint32_t *)&x_filter4_ptr; + vst1q_u32(tmp_ptr, tmp4); +#endif // AOM_ARCH_AARCH64 + + // Load source + int16x8_t s0 = vld1q_s16(src4_ptr[0]); + int16x8_t s1 = vld1q_s16(src4_ptr[1]); + int16x8_t s2 = vld1q_s16(src4_ptr[2]); + int16x8_t s3 = vld1q_s16(src4_ptr[3]); + + // Actually load the filters + const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); + const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); + const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); + const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); + + // Group low and high parts and transpose + int16x4_t filters_lo[] = { vget_low_s16(x_filter0), + vget_low_s16(x_filter1), + vget_low_s16(x_filter2), + vget_low_s16(x_filter3) }; 
+ int16x4_t filters_hi[] = { vget_high_s16(x_filter0), + vget_high_s16(x_filter1), + vget_high_s16(x_filter2), + vget_high_s16(x_filter3) }; + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); + + // Run the 2D Scale X convolution + uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( + s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); + + d0 = vmin_u16(d0, max); + vst1_u16(d, d0); + + x_qn += 4 * x_step_qn; + d += 4; + width -= 4; + } while (width > 0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + height--; + } while (height > 0); + } +} diff --git a/third_party/aom/av1/common/arm/highbd_convolve_neon.c b/third_party/aom/av1/common/arm/highbd_convolve_neon.c new file mode 100644 index 0000000000..3a3e33fcba --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_convolve_neon.c @@ -0,0 +1,2120 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" + +static INLINE uint16x4_t +highbd_convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter) { + // Values at indices 0 and 7 of y_filter are zero. 
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t +highbd_convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_convolve_y_sr_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int bd) { + const uint16x8_t max = 
vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + + if (w == 4) { + const int16_t *s = (const int16_t *)(src_ptr + src_stride); + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x4_t s5, s6, s7, s8; + load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7); + uint16x4_t d1 = + highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7); + uint16x4_t d2 = + highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7); + uint16x4_t d3 = + highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + // Width is a multiple of 8 and height is a multiple of 4. 
+ do { + int height = h; + const int16_t *s = (const int16_t *)(src_ptr + src_stride); + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7); + uint16x8_t d1 = + highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7); + uint16x8_t d2 = + highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7); + uint16x8_t d3 = + highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve8_4_y( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve8_8_y( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const 
int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_convolve_y_sr_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, int bd) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + 
+ uint16x4_t d0 = + highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + uint16x4_t d1 = + highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + uint16x4_t d2 = + highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + uint16x4_t d3 = + highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = + highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + uint16x8_t d1 = + highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + uint16x8_t d2 = + highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + uint16x8_t d3 = + highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve12_4_y( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const 
int16x4_t s8, + const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve12_8_y( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, + const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + sum0 = vmlal_lane_s16(sum0, 
vget_low_s16(s8), y_filter_8_11, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_convolve_y_sr_12tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, int bd) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &s10); + s += 11 * src_stride; + + do { + int16x4_t s11, s12, s13, s14; + load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14); + + uint16x4_t d0 = + highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, 
y_filter_0_7, y_filter_8_11); + uint16x4_t d1 = + highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + s12, y_filter_0_7, y_filter_8_11); + uint16x4_t d2 = + highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, y_filter_0_7, y_filter_8_11); + uint16x4_t d3 = + highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, y_filter_0_7, y_filter_8_11); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &s10); + s += 11 * src_stride; + + do { + int16x8_t s11, s12, s13, s14; + load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14); + + uint16x8_t d0 = + highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, y_filter_0_7, y_filter_8_11); + uint16x8_t d1 = + highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + s12, y_filter_0_7, y_filter_8_11); + uint16x8_t d2 = + highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, y_filter_0_7, y_filter_8_11); + uint16x8_t d3 = + highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, s14, y_filter_0_7, y_filter_8_11); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s 
+= 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, bd); + return; + } + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int vert_offset = filter_params_y->taps / 2 - 1; + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + src -= vert_offset * src_stride; + + if (y_filter_taps > 8) { + highbd_convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr, bd); + return; + } + if (y_filter_taps < 8) { + highbd_convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr, bd); + return; + } + + highbd_convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr, bd); +} + +static INLINE uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6], + const int16x8_t x_filter, + const int32x4_t offset) { + // Values at indices 0 and 7 of x_filter are zero.
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int32x4_t sum0 = offset; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2); + + int32x4_t sum1 = offset; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2); + + return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); +} + +static INLINE void highbd_convolve_x_sr_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, + int bd) { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + // This shim allows us to do only one rounding shift instead of two.
+ const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); + + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset); + uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset); + uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset); + uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + +static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4], + const int16x4_t x_filter, + const int32x4_t offset) { + int32x4_t sum = offset; + sum = vmlal_lane_s16(sum, s[0], x_filter, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter, 1); + sum = vmlal_lane_s16(sum, s[2], x_filter, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter, 3); + + return vqrshrun_n_s32(sum, FILTER_BITS); +} + +static INLINE uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8], + const int16x8_t x_filter, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int32x4_t sum0 = offset; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, 
vget_low_s16(s[2]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); + + int32x4_t sum1 = offset; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); +} + +static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr, + int src_stride, uint16_t *dst_ptr, + int dst_stride, int w, int h, + const int16_t *x_filter_ptr, + ConvolveParams *conv_params, + int bd) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + // This shim allows us to do only one rounding shift instead of two. + const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); + + if (w == 4) { + // 4-tap filters are used for blocks having width == 4.
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 2); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset); + uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset); + uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset); + uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset); + uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset); + uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset); + uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = 
vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12], + const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); + + int32x4_t sum = offset; + sum = vmlal_lane_s16(sum, s[0], x_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3); + + return vqrshrun_n_s32(sum, FILTER_BITS); +} + +static INLINE uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12], + const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); + + int32x4_t sum0 = offset; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, 
vget_low_s16(s[6]), x_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3); + + int32x4_t sum1 = offset; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); +} + +static INLINE void highbd_convolve_x_sr_12tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, + int bd) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + // This shim allows us to do only one rounding shift instead of two.
+ const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[12], s1[12], s2[12], s3[12]; + load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], + &s0[11]); + load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], + &s1[11]); + load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], + &s2[11]); + load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], + &s3[11]); + + uint16x4_t d0 = + highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset); + uint16x4_t d1 = + highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset); + uint16x4_t d2 = + highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset); + uint16x4_t d3 = + highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[12], s1[12], s2[12], s3[12]; + load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], + &s0[11]); + load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], + &s1[11]); + 
load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], + &s2[11]); + load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], + &s3[11]); + + uint16x8_t d0 = + highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset); + uint16x8_t d1 = + highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset); + uint16x8_t d2 = + highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset); + uint16x8_t d3 = + highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params, bd); + return; + } + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int horiz_offset = filter_params_x->taps / 2 - 1; + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + src -= horiz_offset; + + if (x_filter_taps > 8) { + highbd_convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr, conv_params, bd); + return; + } + if (x_filter_taps <= 6 && w != 4) { + highbd_convolve_x_sr_6tap_neon(src + 1, src_stride, dst, dst_stride, w, h, + x_filter_ptr, conv_params, bd); + return; + } + + highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h, + 
x_filter_ptr, conv_params, bd); +} +/* 6-tap vertical filter on 4 lanes: s0..s5 weighted by y_filter[1..6]. */ +static INLINE uint16x4_t highbd_convolve6_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter, const int32x4_t round_shift, + const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); + + sum = vshlq_s32(sum, round_shift); + return vqmovun_s32(sum); +} +/* Same 6-tap vertical filter on 8 lanes, done as two 4-lane halves. */ +static INLINE uint16x8_t highbd_convolve6_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter, const int32x4_t round_shift, + const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. 
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); + + sum0 = vshlq_s32(sum0, round_shift); + sum1 = vshlq_s32(sum1, round_shift); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE void highbd_convolve_2d_sr_vert_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, + int bd, const int offset) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + const int round1_shift = conv_params->round_1; + const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x4_t s5, s6, s7, s8; + load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, + round1_shift_s32, 
offset_s32); + uint16x4_t d1 = highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, + round1_shift_s32, offset_s32); + uint16x4_t d2 = highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, + round1_shift_s32, offset_s32); + uint16x4_t d3 = highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, + round1_shift_s32, offset_s32); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = highbd_convolve6_8_2d_v( + s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32); + uint16x8_t d1 = highbd_convolve6_8_2d_v( + s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32); + uint16x8_t d2 = highbd_convolve6_8_2d_v( + s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32); + uint16x8_t d3 = highbd_convolve6_8_2d_v( + s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve8_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const 
int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, + const int32x4_t round_shift, const int32x4_t offset) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3); + + sum = vshlq_s32(sum, round_shift); + return vqmovun_s32(sum); +} + +static INLINE uint16x8_t highbd_convolve8_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, + const int32x4_t round_shift, const int32x4_t offset) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0); 
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3); + + sum0 = vshlq_s32(sum0, round_shift); + sum1 = vshlq_s32(sum1, round_shift); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE void highbd_convolve_2d_sr_vert_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, + int bd, const int offset) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + const int round1_shift = conv_params->round_1; + const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = + highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round1_shift_s32, offset_s32); + uint16x4_t d1 = + highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round1_shift_s32, offset_s32); + uint16x4_t d2 = + highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round1_shift_s32, offset_s32); + uint16x4_t d3 = + highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round1_shift_s32, offset_s32); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + 
} while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = + highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round1_shift_s32, offset_s32); + uint16x8_t d1 = + highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round1_shift_s32, offset_s32); + uint16x8_t d2 = + highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round1_shift_s32, offset_s32); + uint16x8_t d3 = + highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round1_shift_s32, offset_s32); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve12_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, + const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, + const int32x4_t round_shift, const int32x4_t offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = 
vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3); + + sum = vshlq_s32(sum, round_shift); + return vqmovun_s32(sum); +} + +static INLINE uint16x8_t highbd_convolve12_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, + const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, + const int32x4_t round_shift, const int32x4_t offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = 
vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3); + + sum0 = vshlq_s32(sum0, round_shift); + sum1 = vshlq_s32(sum1, round_shift); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE void highbd_convolve_2d_sr_vert_12tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, + const int bd, const int offset) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + const int round1_shift = conv_params->round_1; + const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &s10); + s += 11 * src_stride; + + do { + int16x4_t s11, s12, s13, s14; + load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14); + + uint16x4_t d0 = highbd_convolve12_4_2d_v( + s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x4_t d1 = highbd_convolve12_4_2d_v( + s1, s2, s3, s4, s5, s6, s7, s8, 
s9, s10, s11, s12, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x4_t d2 = highbd_convolve12_4_2d_v( + s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x4_t d3 = highbd_convolve12_4_2d_v( + s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &s10); + s += 11 * src_stride; + + do { + int16x8_t s11, s12, s13, s14; + load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14); + + uint16x8_t d0 = highbd_convolve12_8_2d_v( + s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x8_t d1 = highbd_convolve12_8_2d_v( + s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x8_t d2 = highbd_convolve12_8_2d_v( + s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x8_t d3 = highbd_convolve12_8_2d_v( + s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + 
s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} +/* 6-tap horizontal filter on 8 lanes: s[0..5] weighted by x_filter[1..6]. */ +static INLINE uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6], + const int16x8_t x_filter, + const int32x4_t shift_s32, + const int32x4_t offset) { + // Values at indices 0 and 7 of x_filter are zero. + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2); + + sum0 = vqrshlq_s32(sum0, shift_s32); + sum1 = vqrshlq_s32(sum1, shift_s32); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} +/* Horizontal 6-tap pass; rows go four at a time, then the rest one by one. */ +static INLINE void highbd_convolve_2d_sr_horiz_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, + const int offset) { + // The smallest block height processed by the SIMD functions is 4, and the + // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines + 
// for the vertical convolution. + assert(h >= 5); + const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = + highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32); + uint16x8_t d1 = + highbd_convolve6_8_2d_h(s1, x_filter, shift_s32, offset_s32); + uint16x8_t d2 = + highbd_convolve6_8_2d_h(s2, x_filter, shift_s32, offset_s32); + uint16x8_t d3 = + highbd_convolve6_8_2d_h(s3, x_filter, shift_s32, offset_s32); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6]; + load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); + + uint16x8_t d0 = + highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); +} + +static INLINE uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4], + const int16x4_t x_filter, + const int32x4_t shift_s32, + const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter, 1); + sum = 
vmlal_lane_s16(sum, s[2], x_filter, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter, 3); + + sum = vqrshlq_s32(sum, shift_s32); + return vqmovun_s32(sum); +} +/* 8-tap horizontal filter on 8 lanes: s[0..7] weighted by x_filter[0..7]. */ +static INLINE uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8], + const int16x8_t x_filter, + const int32x4_t shift_s32, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); + + sum0 = vqrshlq_s32(sum0, shift_s32); + sum1 = vqrshlq_s32(sum1, shift_s32); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} +/* Horizontal pass: 4-tap kernel when w == 4, 8-tap kernel otherwise. */ +static INLINE void highbd_convolve_2d_sr_horiz_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, + const int offset) { + // The smallest block height processed by the SIMD functions is 4, 
and the + // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines + // for the vertical convolution. + assert(h >= 5); + const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + + if (w == 4) { + // 4-tap filters are used for blocks having width <= 4. + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 1); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = + highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32); + uint16x4_t d1 = + highbd_convolve4_4_2d_h(s1, x_filter, shift_s32, offset_s32); + uint16x4_t d2 = + highbd_convolve4_4_2d_h(s2, x_filter, shift_s32, offset_s32); + uint16x4_t d3 = + highbd_convolve4_4_2d_h(s3, x_filter, shift_s32, offset_s32); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + int16x4_t s0[4]; + load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + + uint16x4_t d0 = + highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32); + + vst1_u16(d, d0); + + s += src_stride; + d += dst_stride; + } while (--h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], 
&s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = + highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32); + uint16x8_t d1 = + highbd_convolve8_8_2d_h(s1, x_filter, shift_s32, offset_s32); + uint16x8_t d2 = + highbd_convolve8_8_2d_h(s2, x_filter, shift_s32, offset_s32); + uint16x8_t d3 = + highbd_convolve8_8_2d_h(s3, x_filter, shift_s32, offset_s32); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + + uint16x8_t d0 = + highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +static INLINE uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12], + const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11, + const int32x4_t shift_s32, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); + + int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0); + sum = 
vmlal_lane_s16(sum, s[9], x_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3); + + sum = vqrshlq_s32(sum, shift_s32); + return vqmovun_s32(sum); +} + +static INLINE uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12], + const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11, + const int32x4_t shift_s32, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), 
x_filter_8_11, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3); + + sum0 = vqrshlq_s32(sum0, shift_s32); + sum1 = vqrshlq_s32(sum1, shift_s32); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE void highbd_convolve_2d_sr_horiz_12tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, + const int offset) { + // The smallest block height processed by the SIMD functions is 4, and the + // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines + // for the vertical convolution. + assert(h >= 5); + const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[12], s1[12], s2[12], s3[12]; + load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], + &s0[11]); + load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], + &s1[11]); + load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], + &s2[11]); + load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], + &s3[11]); + + uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11, + shift_s32, offset_s32); + uint16x4_t d1 = highbd_convolve12_4_2d_h(s1, x_filter_0_7, x_filter_8_11, + shift_s32, offset_s32); + uint16x4_t d2 = 
highbd_convolve12_4_2d_h(s2, x_filter_0_7, x_filter_8_11, + shift_s32, offset_s32); + uint16x4_t d3 = highbd_convolve12_4_2d_h(s3, x_filter_0_7, x_filter_8_11, + shift_s32, offset_s32); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + int16x4_t s0[12]; + load_s16_4x12(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], + &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]); + + uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11, + shift_s32, offset_s32); + + vst1_u16(d, d0); + + s += src_stride; + d += dst_stride; + } while (--h != 0); + } else { + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[12], s1[12], s2[12], s3[12]; + load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], + &s0[11]); + load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], + &s1[11]); + load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], + &s2[11]); + load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], + &s3[11]); + + uint16x8_t d0 = highbd_convolve12_8_2d_h( + s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); + uint16x8_t d1 = highbd_convolve12_8_2d_h( + s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); + uint16x8_t d2 = highbd_convolve12_8_2d_h( + s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); + uint16x8_t d3 = highbd_convolve12_8_2d_h( + s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + 
} while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[12]; + load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], + &s0[11]); + + uint16x8_t d0 = highbd_convolve12_8_2d_h( + s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params, bd); + return; + } + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps; + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = clamped_x_taps / 2 - 1; + const int x_offset_initial = (1 << (bd + FILTER_BITS - 1)); + const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a + // simple shift left instead of a rounding saturating shift left. 
+ const int y_offset = + (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1)); + + const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (x_filter_taps > 8) { + highbd_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + conv_params, x_offset_initial); + + highbd_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + return; + } + if (x_filter_taps <= 6 && w != 4) { + highbd_convolve_2d_sr_horiz_6tap_neon(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + conv_params, x_offset_initial); + } else { + highbd_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, + w, im_h, x_filter_ptr, conv_params, + x_offset_initial); + } + + if (y_filter_taps <= 6) { + highbd_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + } else { + highbd_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + } +} + +// Filter used is [64, 64]. 
+void av1_highbd_convolve_x_sr_intrabc_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + assert(subpel_x_qn == 8); + assert(filter_params_x->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + (void)bd; + + if (w <= 4) { + do { + uint16x4_t s0 = vld1_u16(src); + uint16x4_t s1 = vld1_u16(src + 1); + + uint16x4_t d0 = vrhadd_u16(s0, s1); + + if (w == 2) { + store_u16_2x1(dst, d0); + } else { + vst1_u16(dst, d0); + } + + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + const uint16_t *src_ptr = src; + uint16_t *dst_ptr = dst; + int width = w; + + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + 1); + + uint16x8_t d0 = vrhaddq_u16(s0, s1); + + vst1q_u16(dst_ptr, d0); + + src_ptr += 8; + dst_ptr += 8; + width -= 8; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } +} + +// Filter used is [64, 64]. 
+void av1_highbd_convolve_y_sr_intrabc_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, + int bd) { + assert(subpel_y_qn == 8); + assert(filter_params_y->taps == 2); + (void)filter_params_y; + (void)subpel_y_qn; + (void)bd; + + if (w <= 4) { + do { + uint16x4_t s0 = vld1_u16(src); + uint16x4_t s1 = vld1_u16(src + src_stride); + + uint16x4_t d0 = vrhadd_u16(s0, s1); + + if (w == 2) { + store_u16_2x1(dst, d0); + } else { + vst1_u16(dst, d0); + } + + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + const uint16_t *src_ptr = src; + uint16_t *dst_ptr = dst; + int height = h; + + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + src_stride); + + uint16x8_t d0 = vrhaddq_u16(s0, s1); + + vst1q_u16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +// Both horizontal and vertical passes use the same 2-tap filter: [64, 64]. 
+void av1_highbd_convolve_2d_sr_intrabc_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + assert(subpel_x_qn == 8); + assert(subpel_y_qn == 8); + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + (void)filter_params_x; + (void)subpel_x_qn; + (void)filter_params_y; + (void)subpel_y_qn; + (void)conv_params; + (void)bd; + + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + 1; + int im_stride = MAX_SB_SIZE; + + uint16x8_t vert_offset = vdupq_n_u16(1); + + uint16_t *im = im_block; + + // Horizontal filter. + if (w <= 4) { + do { + uint16x4_t s0 = vld1_u16(src); + uint16x4_t s1 = vld1_u16(src + 1); + + uint16x4_t d0 = vadd_u16(s0, s1); + + // Safe to store the whole vector, the im buffer is big enough. + vst1_u16(im, d0); + + src += src_stride; + im += im_stride; + } while (--im_h != 0); + } else { + do { + const uint16_t *src_ptr = src; + uint16_t *im_ptr = im; + int width = w; + + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + 1); + + uint16x8_t d0 = vaddq_u16(s0, s1); + + vst1q_u16(im_ptr, d0); + + src_ptr += 8; + im_ptr += 8; + width -= 8; + } while (width != 0); + src += src_stride; + im += im_stride; + } while (--im_h != 0); + } + + im = im_block; + + // Vertical filter. 
+ if (w <= 4) { + do { + uint16x4_t s0 = vld1_u16(im); + uint16x4_t s1 = vld1_u16(im + im_stride); + + uint16x4_t d0 = vhadd_u16(s0, s1); + d0 = vhadd_u16(d0, vget_low_u16(vert_offset)); + + if (w == 2) { + store_u16_2x1(dst, d0); + } else { + vst1_u16(dst, d0); + } + + im += im_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint16_t *im_ptr = im; + uint16_t *dst_ptr = dst; + int height = h; + + do { + uint16x8_t s0 = vld1q_u16(im_ptr); + uint16x8_t s1 = vld1q_u16(im_ptr + im_stride); + + uint16x8_t d0 = vhaddq_u16(s0, s1); + d0 = vhaddq_u16(d0, vert_offset); + + vst1q_u16(dst_ptr, d0); + + im_ptr += im_stride; + dst_ptr += dst_stride; + } while (--height != 0); + im += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/third_party/aom/av1/common/arm/highbd_convolve_neon.h b/third_party/aom/av1/common/arm/highbd_convolve_neon.h new file mode 100644 index 0000000000..08b2bda4e5 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_convolve_neon.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_ +#define AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_ + +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/convolve.h" + +static INLINE int32x4_t highbd_convolve8_4_s32( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, + const int32x4_t offset) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3); + + return sum; +} + +static INLINE uint16x4_t highbd_convolve8_4_sr_s32_s16( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, + const int32x4_t shift_s32, const int32x4_t offset) { + int32x4_t sum = + highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset); + + sum = vqrshlq_s32(sum, shift_s32); + return vqmovun_s32(sum); +} + +// Like above but also perform round shifting and subtract correction term +static INLINE uint16x4_t highbd_convolve8_4_srsub_s32_s16( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, + const int32x4_t round_shift, const int32x4_t offset, + const int32x4_t correction) { + int32x4_t sum = + highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, 
s6, s7, y_filter, offset); + + sum = vsubq_s32(vqrshlq_s32(sum, round_shift), correction); + return vqmovun_s32(sum); +} + +static INLINE void highbd_convolve8_8_s32( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, + const int32x4_t offset, int32x4_t *sum0, int32x4_t *sum1) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + *sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3); + + *sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3); +} + +// Like above but also perform round shifting and subtract correction term +static INLINE uint16x8_t highbd_convolve8_8_srsub_s32_s16( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, + const int32x4_t round_shift, const int32x4_t 
offset, + const int32x4_t correction) { + int32x4_t sum0; + int32x4_t sum1; + highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset, + &sum0, &sum1); + + sum0 = vsubq_s32(vqrshlq_s32(sum0, round_shift), correction); + sum1 = vsubq_s32(vqrshlq_s32(sum1, round_shift), correction); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE int32x4_t highbd_convolve8_2d_scale_horiz4x8_s32( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t *filters_lo, + const int16x4_t *filters_hi, const int32x4_t offset) { + int16x4_t s_lo[] = { vget_low_s16(s0), vget_low_s16(s1), vget_low_s16(s2), + vget_low_s16(s3) }; + int16x4_t s_hi[] = { vget_high_s16(s0), vget_high_s16(s1), vget_high_s16(s2), + vget_high_s16(s3) }; + + transpose_array_inplace_u16_4x4((uint16x4_t *)s_lo); + transpose_array_inplace_u16_4x4((uint16x4_t *)s_hi); + + int32x4_t sum = vmlal_s16(offset, s_lo[0], filters_lo[0]); + sum = vmlal_s16(sum, s_lo[1], filters_lo[1]); + sum = vmlal_s16(sum, s_lo[2], filters_lo[2]); + sum = vmlal_s16(sum, s_lo[3], filters_lo[3]); + sum = vmlal_s16(sum, s_hi[0], filters_hi[0]); + sum = vmlal_s16(sum, s_hi[1], filters_hi[1]); + sum = vmlal_s16(sum, s_hi[2], filters_hi[2]); + sum = vmlal_s16(sum, s_hi[3], filters_hi[3]); + + return sum; +} + +static INLINE uint16x4_t highbd_convolve8_2d_scale_horiz4x8_s32_s16( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t *filters_lo, + const int16x4_t *filters_hi, const int32x4_t shift_s32, + const int32x4_t offset) { + int32x4_t sum = highbd_convolve8_2d_scale_horiz4x8_s32( + s0, s1, s2, s3, filters_lo, filters_hi, offset); + + sum = vqrshlq_s32(sum, shift_s32); + return vqmovun_s32(sum); +} + +#endif // AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c b/third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c new file mode 100644 index 
0000000000..702c651536 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c @@ -0,0 +1,552 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/arm/highbd_convolve_neon.h" + +static INLINE void highbd_dist_wtd_comp_avg_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, ConvolveParams *conv_params, const int round_bits, + const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const int32x4_t round_shift = vdupq_n_s32(-round_bits); + const uint32x4_t offset_vec = vdupq_n_u32(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); + uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); + + // Weighted averaging + if (w <= 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); + wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); + wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); + d0 = 
vqrshlq_s32(d0, round_shift); + + uint16x4_t d0_u16 = vqmovun_s32(d0); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + if (w == 2) { + store_u16_2x1(dst_ptr, d0_u16); + } else { + vst1_u16(dst_ptr, d0_u16); + } + + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); + wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); + wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); + d0 = vqrshlq_s32(d0, round_shift); + + uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); + wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); + wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); + int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); + d1 = vqrshlq_s32(d1, round_shift); + + uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst, d01); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, + uint16_t *dst_ptr, int dst_stride, + int w, int h, + ConvolveParams *conv_params, + const int round_bits, const int offset, + const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const int32x4_t round_shift = vdupq_n_s32(-round_bits); + const uint16x4_t offset_vec = vdup_n_u16(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w <= 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref 
= vld1_u16(ref_ptr); + + uint16x4_t avg = vhadd_u16(src, ref); + int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); + d0 = vqrshlq_s32(d0, round_shift); + + uint16x4_t d0_u16 = vqmovun_s32(d0); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + if (w == 2) { + store_u16_2x1(dst_ptr, d0_u16); + } else { + vst1_u16(dst_ptr, d0_u16); + } + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vhaddq_u16(s, r); + int32x4_t d0_lo = + vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); + int32x4_t d0_hi = + vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); + d0_lo = vqrshlq_s32(d0_lo, round_shift); + d0_hi = vqrshlq_s32(d0_hi, round_shift); + + uint16x8_t d0 = vcombine_u16(vqmovun_s32(d0_lo), vqmovun_s32(d0_hi)); + d0 = vminq_u16(d0, max); + vst1q_u16(dst, d0); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_convolve_2d_x_scale_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int subpel_x_qn, const int x_step_qn, + const InterpFilterParams *filter_params, ConvolveParams *conv_params, + const int offset) { + static const uint32_t kIdx[4] = { 0, 1, 2, 3 }; + const uint32x4_t idx = vld1q_u32(kIdx); + const uint32x4_t subpel_mask = vdupq_n_u32(SCALE_SUBPEL_MASK); + const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + + if (w <= 4) { + int height = h; + uint16_t *d = dst_ptr; + + do { + int x_qn = subpel_x_qn; + + // Load 4 src vectors at a time, they might be the same, but 
we have to + // calculate the indices anyway. Doing it in SIMD and then storing the + // indices is faster than having to calculate the expression + // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times + // Ideally this should be a gather using the indices, but NEON does not + // have that, so have to emulate + const uint32x4_t xqn_idx = vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn); + // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) = + // 2 + const uint32x4_t src_idx_u32 = + vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1); +#if AOM_ARCH_AARCH64 + uint64x2_t src4[2]; + src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr), + vget_low_u32(src_idx_u32)); + src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr), + vget_high_u32(src_idx_u32)); + int16_t *src4_ptr[4]; + uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; + vst1q_u64(tmp_ptr, src4[0]); + vst1q_u64(tmp_ptr + 2, src4[1]); +#else + uint32x4_t src4; + src4 = vaddq_u32(vdupq_n_u32((const uint32_t)src_ptr), src_idx_u32); + int16_t *src4_ptr[4]; + uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; + vst1q_u32(tmp_ptr, src4); +#endif // AOM_ARCH_AARCH64 + // Same for the filter vectors + const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32( + vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS)); + int32_t x_filter4_idx[4]; + vst1q_s32(x_filter4_idx, filter_idx_s32); + const int16_t *x_filter4_ptr[4]; + + // Load source + int16x8_t s0 = vld1q_s16(src4_ptr[0]); + int16x8_t s1 = vld1q_s16(src4_ptr[1]); + int16x8_t s2 = vld1q_s16(src4_ptr[2]); + int16x8_t s3 = vld1q_s16(src4_ptr[3]); + + // We could easily do this using SIMD as well instead of calling the + // inline function 4 times. 
+ x_filter4_ptr[0] = + av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[0]); + x_filter4_ptr[1] = + av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[1]); + x_filter4_ptr[2] = + av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[2]); + x_filter4_ptr[3] = + av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[3]); + + // Actually load the filters + const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); + const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); + const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); + const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); + + // Group low and high parts and transpose + int16x4_t filters_lo[] = { vget_low_s16(x_filter0), + vget_low_s16(x_filter1), + vget_low_s16(x_filter2), + vget_low_s16(x_filter3) }; + int16x4_t filters_hi[] = { vget_high_s16(x_filter0), + vget_high_s16(x_filter1), + vget_high_s16(x_filter2), + vget_high_s16(x_filter3) }; + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); + + // Run the 2D Scale convolution + uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( + s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); + + if (w == 2) { + store_u16_2x1(d, d0); + } else { + vst1_u16(d, d0); + } + + src_ptr += src_stride; + d += dst_stride; + height--; + } while (height > 0); + } else { + int height = h; + + do { + int width = w; + int x_qn = subpel_x_qn; + uint16_t *d = dst_ptr; + const uint16_t *s = src_ptr; + + do { + // Load 4 src vectors at a time, they might be the same, but we have to + // calculate the indices anyway. 
Doing it in SIMD and then storing the + // indices is faster than having to calculate the expression + // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times + // Ideally this should be a gather using the indices, but NEON does not + // have that, so have to emulate + const uint32x4_t xqn_idx = + vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn); + // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) + // = 2 + const uint32x4_t src_idx_u32 = + vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1); +#if AOM_ARCH_AARCH64 + uint64x2_t src4[2]; + src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)s), + vget_low_u32(src_idx_u32)); + src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)s), + vget_high_u32(src_idx_u32)); + int16_t *src4_ptr[4]; + uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; + vst1q_u64(tmp_ptr, src4[0]); + vst1q_u64(tmp_ptr + 2, src4[1]); +#else + uint32x4_t src4; + src4 = vaddq_u32(vdupq_n_u32((const uint32_t)s), src_idx_u32); + int16_t *src4_ptr[4]; + uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; + vst1q_u32(tmp_ptr, src4); +#endif // AOM_ARCH_AARCH64 + // Same for the filter vectors + const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32( + vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS)); + int32_t x_filter4_idx[4]; + vst1q_s32(x_filter4_idx, filter_idx_s32); + const int16_t *x_filter4_ptr[4]; + + // Load source + int16x8_t s0 = vld1q_s16(src4_ptr[0]); + int16x8_t s1 = vld1q_s16(src4_ptr[1]); + int16x8_t s2 = vld1q_s16(src4_ptr[2]); + int16x8_t s3 = vld1q_s16(src4_ptr[3]); + + // We could easily do this using SIMD as well instead of calling the + // inline function 4 times. 
+ x_filter4_ptr[0] = av1_get_interp_filter_subpel_kernel( + filter_params, x_filter4_idx[0]); + x_filter4_ptr[1] = av1_get_interp_filter_subpel_kernel( + filter_params, x_filter4_idx[1]); + x_filter4_ptr[2] = av1_get_interp_filter_subpel_kernel( + filter_params, x_filter4_idx[2]); + x_filter4_ptr[3] = av1_get_interp_filter_subpel_kernel( + filter_params, x_filter4_idx[3]); + + // Actually load the filters + const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); + const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); + const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); + const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); + + // Group low and high parts and transpose + int16x4_t filters_lo[] = { vget_low_s16(x_filter0), + vget_low_s16(x_filter1), + vget_low_s16(x_filter2), + vget_low_s16(x_filter3) }; + int16x4_t filters_hi[] = { vget_high_s16(x_filter0), + vget_high_s16(x_filter1), + vget_high_s16(x_filter2), + vget_high_s16(x_filter3) }; + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); + + // Run the 2D Scale X convolution + uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( + s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); + + vst1_u16(d, d0); + + x_qn += 4 * x_step_qn; + d += 4; + width -= 4; + } while (width > 0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + height--; + } while (height > 0); + } +} + +static INLINE void highbd_convolve_2d_y_scale_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int subpel_y_qn, const int y_step_qn, + const InterpFilterParams *filter_params, const int round1_bits, + const int offset) { + const int32x4_t offset_s32 = vdupq_n_s32(1 << offset); + + const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_bits); + if (w <= 4) { + int height = h; + uint16_t *d = dst_ptr; + int y_qn = subpel_y_qn; + + do { + const int16_t *s = + (const int16_t 
*)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + const int16_t *y_filter_ptr = + av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + uint16x4_t d0 = highbd_convolve8_4_srsub_s32_s16( + s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, + offset_s32, vdupq_n_s32(0)); + + if (w == 2) { + store_u16_2x1(d, d0); + } else { + vst1_u16(d, d0); + } + + y_qn += y_step_qn; + d += dst_stride; + height--; + } while (height > 0); + } else { + int width = w; + + do { + int height = h; + int y_qn = subpel_y_qn; + + uint16_t *d = dst_ptr; + + do { + const int16_t *s = + (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + const int16_t *y_filter_ptr = + av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + uint16x8_t d0 = highbd_convolve8_8_srsub_s32_s16( + s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, + offset_s32, vdupq_n_s32(0)); + vst1q_u16(d, d0); + + y_qn += y_step_qn; + d += dst_stride; + height--; + } while (height > 0); + src_ptr += 8; + dst_ptr += 8; + width -= 8; + } while (width > 0); + } +} + +static INLINE void highbd_convolve_correct_offset_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int round_bits, const int offset, const int bd) { + const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits); + const int16x4_t offset_s16 = vdup_n_s16(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w <= 4) { + for (int y = 0; y < h; 
++y) { + const int16x4_t s = vld1_s16((const int16_t *)src_ptr + y * src_stride); + const int32x4_t d0 = + vqrshlq_s32(vsubl_s16(s, offset_s16), round_shift_s32); + uint16x4_t d = vqmovun_s32(d0); + d = vmin_u16(d, vget_low_u16(max)); + if (w == 2) { + store_u16_2x1(dst_ptr + y * dst_stride, d); + } else { + vst1_u16(dst_ptr + y * dst_stride, d); + } + } + } else { + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; x += 8) { + // Subtract round offset and convolve round + const int16x8_t s = + vld1q_s16((const int16_t *)src_ptr + y * src_stride + x); + const int32x4_t d0 = vqrshlq_s32(vsubl_s16(vget_low_s16(s), offset_s16), + round_shift_s32); + const int32x4_t d1 = vqrshlq_s32( + vsubl_s16(vget_high_s16(s), offset_s16), round_shift_s32); + uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst_ptr + y * dst_stride + x, d01); + } + } + } +} + +void av1_highbd_convolve_2d_scale_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int x_step_qn, const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params, int bd) { + uint16_t *im_block = (uint16_t *)aom_memalign( + 16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP)); + if (!im_block) return; + uint16_t *im_block2 = (uint16_t *)aom_memalign( + 16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP)); + if (!im_block2) { + aom_free(im_block); // free the first block and return. 
+ return; + } + + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + const int im_stride = MAX_SB_SIZE; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); + + const int vert_offset = filter_params_y->taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const int x_offset_bits = (1 << (bd + FILTER_BITS - 1)); + const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int y_offset_correction = + ((1 << (y_offset_bits - conv_params->round_1)) + + (1 << (y_offset_bits - conv_params->round_1 - 1))); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + + const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + highbd_convolve_2d_x_scale_8tap_neon( + src_ptr, src_stride, im_block, im_stride, w, im_h, subpel_x_qn, x_step_qn, + filter_params_x, conv_params, x_offset_bits); + if (conv_params->is_compound && !conv_params->do_average) { + highbd_convolve_2d_y_scale_8tap_neon( + im_block, im_stride, dst16, dst16_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params->round_1, y_offset_bits); + } else { + highbd_convolve_2d_y_scale_8tap_neon( + im_block, im_stride, im_block2, im_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params->round_1, y_offset_bits); + } + + // Do the compound averaging outside the loop, avoids branching within the + // main loop + if (conv_params->is_compound) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, + h, conv_params, bits, y_offset_correction, + bd); + } else { + highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, + conv_params, bits, y_offset_correction, bd); + } + } + } else { + highbd_convolve_correct_offset_neon(im_block2, im_stride, dst, dst_stride, + w, h, bits, y_offset_correction, bd); 
+ } + aom_free(im_block); + aom_free(im_block2); +} diff --git a/third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c b/third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c new file mode 100644 index 0000000000..84bc8fd963 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c @@ -0,0 +1,5994 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#if AOM_ARCH_AARCH64 +#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \ + do { \ + int32x4x2_t swap_low = vtrnq_s32(x0, x1); \ + int32x4x2_t swap_high = vtrnq_s32(x2, x3); \ + y0 = vreinterpretq_s32_s64( \ + vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \ + vreinterpretq_s64_s32(swap_high.val[0]))); \ + y1 = vreinterpretq_s32_s64( \ + vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \ + vreinterpretq_s64_s32(swap_high.val[1]))); \ + y2 = vreinterpretq_s32_s64( \ + vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \ + vreinterpretq_s64_s32(swap_high.val[0]))); \ + y3 = vreinterpretq_s32_s64( \ + vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \ + vreinterpretq_s64_s32(swap_high.val[1]))); \ + } while (0) +#else +#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \ + do { \ + int32x4x2_t swap_low = vtrnq_s32(x0, x1); \ + int32x4x2_t swap_high = vtrnq_s32(x2, x3); \ + y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2), \ +
swap_high.val[0], 2); \ + y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2), \ + swap_high.val[1], 2); \ + y2 = vextq_s32(swap_low.val[0], \ + vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \ + y3 = vextq_s32(swap_low.val[1], \ + vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \ + } while (0) +#endif // AOM_ARCH_AARCH64 +/* Transposes the 4x4 block of 32-bit lanes held in in[0..3] into out[0..3] using TRANSPOSE_4X4. The macro captures all four inputs in temporaries before it assigns any output, so in and out may be the same array. */ +static INLINE void transpose_4x4(const int32x4_t *in, int32x4_t *out) { + TRANSPOSE_4X4(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]); +} +/* Transposes a 16-vector buffer as four 4x4 sub-blocks: the groups at even indices 0..6 and odd indices 9..15 stay in place, while the groups read from in[1,3,5,7] and in[8,10,12,14] are written to out[8,10,12,14] and out[1,3,5,7] respectively. in and out must not alias: the second step writes out[8..14] before the third step reads in[8..14]. */ +static INLINE void transpose_8x8(const int32x4_t *in, int32x4_t *out) { + TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]); + TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]); + TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]); + TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13], + out[15]); +} +/* Writes output[i] = vrshlq_s32(input[i], -bit) for i in [0, size): a per-lane rounding shift whose count is the negated bit argument. */ +static INLINE void round_shift_array_32_neon(int32x4_t *input, + int32x4_t *output, const int size, + const int bit) { + const int32x4_t v_bit = vdupq_n_s32(-bit); + for (int i = 0; i < size; i++) { + output[i] = vrshlq_s32(input[i], v_bit); + } +} +/* For each of size vectors: multiplies every lane by NewInvSqrt2, then applies a rounding right shift by NewSqrt2Bits. The product is formed in 32 bits (vmulq_n_s32). */ +static INLINE void round_shift_rect_array_32_neon(int32x4_t *input, + int32x4_t *output, + const int size) { + for (int i = 0; i < size; i++) { + const int32x4_t r0 = vmulq_n_s32(input[i], NewInvSqrt2); + output[i] = vrshrq_n_s32(r0, NewSqrt2Bits); + } +} +/* Returns (*rnding + *w0 * *n0 + *w1 * *n1) shifted per lane by *v_bit with vshlq_s32. Callers visible in this file build v_bit from a negated bit count. */ +static INLINE int32x4_t half_btf_neon_r(const int32_t *n0, const int32x4_t *w0, + const int32_t *n1, const int32x4_t *w1, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w0, *n0); + x = vmlaq_n_s32(x, *w1, *n1); + x = vshlq_s32(x, *v_bit); + return x; +} +/* Variant of half_btf_neon_r with both scalars negated: (*rnding - *w0 * *n0 - *w1 * *n1) shifted by *v_bit. */ +static INLINE int32x4_t half_btf_neon_mode11_r( + const int32_t *n0, const int32x4_t *w0, const int32_t *n1, + const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w0, -*n0); + x = vmlaq_n_s32(x, *w1, -*n1); + x = 
vshlq_s32(x, *v_bit); + return x; +} +/* Returns (*rnding + *w0 * *n0 - *w1 * *n1) shifted per lane by *v_bit. */ +static INLINE int32x4_t half_btf_neon_mode01_r( + const int32_t *n0, const int32x4_t *w0, const int32_t *n1, + const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w0, *n0); + x = vmlsq_n_s32(x, *w1, *n1); + x = vshlq_s32(x, *v_bit); + return x; +} +/* Returns (*rnding + *w1 * *n1 - *w0 * *n0) shifted per lane by *v_bit. */ +static INLINE int32x4_t half_btf_neon_mode10_r( + const int32_t *n0, const int32x4_t *w0, const int32_t *n1, + const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w1, *n1); + x = vmlsq_n_s32(x, *w0, *n0); + x = vshlq_s32(x, *v_bit); + return x; +} +/* Single-term form: returns (*rnding + *w0 * *n0) shifted per lane by *v_bit. */ +static INLINE int32x4_t half_btf_0_neon_r(const int32_t *n0, + const int32x4_t *w0, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w0, *n0); + x = vshlq_s32(x, *v_bit); + return x; +} +/* Negated single-term form: returns (*rnding - *w0 * *n0) shifted per lane by *v_bit. */ +static INLINE int32x4_t half_btf_0_m_neon_r(const int32_t *n0, + const int32x4_t *w0, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w0, -*n0); + x = vshlq_s32(x, *v_bit); + return x; +} +/* Copies size vectors from in to out in reverse order (out[size - 1 - i] = in[i]). Written as a plain forward loop, so it is only correct when in and out do not overlap. */ +static INLINE void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} +/* Function-pointer type for a 1-D forward transform kernel taking (in, out, bit, num_cols). */ +typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit, + const int num_cols); +/* Function-pointer type for a 1-D inverse transform kernel taking (in, out, bit, do_cols, bd, out_shift). */ +typedef void (*transform_1d_neon)(int32x4_t *in, int32x4_t *out, int32_t bit, + int32_t do_cols, int32_t bd, + int32_t out_shift); +/* Clamps *u to [*min, *max] using SIGNED 16-bit min/max on the reinterpreted lanes. A lane with its top bit set therefore compares as negative and is raised to *min rather than lowered to *max. */ +static INLINE uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min, + const uint16x8_t *max) { + int16x8_t clamped; + clamped = vminq_s16(vreinterpretq_s16_u16(*u), vreinterpretq_s16_u16(*max)); + clamped = vmaxq_s16(clamped, vreinterpretq_s16_u16(*min)); + return vreinterpretq_u16_s16(clamped); +} +/* Applies vrshlq_s32 with count -shift (a rounding shift) to in[0..3] in place; does nothing when shift is 0. */ +static INLINE void round_shift_4x4(int32x4_t *in, int shift) { + if (shift != 0) { + const int32x4_t v_shift = vdupq_n_s32(-shift); + in[0] = 
vrshlq_s32(in[0], v_shift); + in[1] = vrshlq_s32(in[1], v_shift); + in[2] = vrshlq_s32(in[2], v_shift); + in[3] = vrshlq_s32(in[3], v_shift); + } +} +/* Applies vrshlq_s32 with count -shift (a rounding shift) to all 16 vectors in[0..15] in place. Unlike round_shift_4x4 there is no zero-shift early out; shift != 0 is asserted instead. */ +static void round_shift_8x8(int32x4_t *in, int shift) { + assert(shift != 0); + const int32x4_t v_shift = vdupq_n_s32(-shift); + in[0] = vrshlq_s32(in[0], v_shift); + in[1] = vrshlq_s32(in[1], v_shift); + in[2] = vrshlq_s32(in[2], v_shift); + in[3] = vrshlq_s32(in[3], v_shift); + in[4] = vrshlq_s32(in[4], v_shift); + in[5] = vrshlq_s32(in[5], v_shift); + in[6] = vrshlq_s32(in[6], v_shift); + in[7] = vrshlq_s32(in[7], v_shift); + in[8] = vrshlq_s32(in[8], v_shift); + in[9] = vrshlq_s32(in[9], v_shift); + in[10] = vrshlq_s32(in[10], v_shift); + in[11] = vrshlq_s32(in[11], v_shift); + in[12] = vrshlq_s32(in[12], v_shift); + in[13] = vrshlq_s32(in[13], v_shift); + in[14] = vrshlq_s32(in[14], v_shift); + in[15] = vrshlq_s32(in[15], v_shift); +} +/* Clamps every lane of in[0..size) to [*clamp_lo, *clamp_hi] and stores the result in out (in == out is fine, each element is read before it is written). The loop is unrolled by four vectors, so size must be a multiple of 4 or the last iteration reads and writes past size. */ +static void highbd_clamp_s32_neon(int32x4_t *in, int32x4_t *out, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, int size) { + int32x4_t a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = vmaxq_s32(in[i], *clamp_lo); + out[i] = vminq_s32(a0, *clamp_hi); + + a1 = vmaxq_s32(in[i + 1], *clamp_lo); + out[i + 1] = vminq_s32(a1, *clamp_hi); + + a0 = vmaxq_s32(in[i + 2], *clamp_lo); + out[i + 2] = vminq_s32(a0, *clamp_hi); + + a1 = vmaxq_s32(in[i + 3], *clamp_lo); + out[i + 3] = vminq_s32(a1, *clamp_hi); + } +} +/* Reconstructs 8 pixels: widens the low/high halves of pred, adds res0/res1, clamps each 32-bit sum to [0, (1 << bd) - 1], then narrows back to 16 bits. pred is reinterpreted as signed before widening, so prediction samples are presumably below 0x8000 (confirm against callers). */ +static INLINE uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred, + int32x4_t res0, + int32x4_t res1, + const int bd) { + const uint16x8_t v_zero = vdupq_n_u16(0); + int32x4_t min_clip_val = vreinterpretq_s32_u16(v_zero); + int32x4_t max_clip_val = vdupq_n_s32((1 << bd) - 1); + uint16x8x2_t x; + x.val[0] = vreinterpretq_u16_s32( + vaddw_s16(res0, vreinterpret_s16_u16(vget_low_u16(pred)))); + x.val[1] = vreinterpretq_u16_s32( + vaddw_s16(res1, vreinterpret_s16_u16(vget_high_u16(pred)))); + x.val[0] = vreinterpretq_u16_s32( + 
vmaxq_s32(vreinterpretq_s32_u16(x.val[0]), min_clip_val)); + x.val[0] = vreinterpretq_u16_s32( + vminq_s32(vreinterpretq_s32_u16(x.val[0]), max_clip_val)); + x.val[1] = vreinterpretq_u16_s32( + vmaxq_s32(vreinterpretq_s32_u16(x.val[1]), min_clip_val)); + x.val[1] = vreinterpretq_u16_s32( + vminq_s32(vreinterpretq_s32_u16(x.val[1]), max_clip_val)); + uint16x8_t res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])), + vqmovn_u32(vreinterpretq_u32_u16(x.val[1]))); + return res; +} +/* Reconstructs 4 pixels: adds the widened pred to res0, narrows the sum to 16 bits with vmovn_s32 (truncating, not saturating), then clamps to [0, (1 << bd) - 1] through highbd_clamp_u16. The 4-lane value is duplicated into an 8-lane vector only so the shared clamp helper can be used; the low half is returned. */ +static INLINE uint16x4_t highbd_get_recon_4xn_neon(uint16x4_t pred, + int32x4_t res0, + const int bd) { + uint16x4_t x0_ = vreinterpret_u16_s16( + vmovn_s32(vaddw_s16(res0, vreinterpret_s16_u16(pred)))); + uint16x8_t x0 = vcombine_u16(x0_, x0_); + const uint16x8_t vmin = vdupq_n_u16(0); + const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1); + x0 = highbd_clamp_u16(&x0, &vmin, &vmax); + return vget_low_u16(x0); +} +/* Adds height rows of 4 residuals to the 4-pixel-wide destination in place. Destination row i (output + i * stride) receives in[j], where j counts up from 0, or down from height - 1 when flipud is set. */ +static INLINE void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + uint16x4_t v = vld1_u16(output + i * stride); + uint16x4_t u = highbd_get_recon_4xn_neon(v, in[j], bd); + + vst1_u16(output + i * stride, u); + } +} +/* 8-pixel-wide counterpart of highbd_write_buffer_4xn_neon: destination row i is rebuilt from in[j] (left four lanes) and in[j + height] (right four lanes), so in must hold 2 * height vectors. */ +static INLINE void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + uint16x8_t v = vld1q_u16(output + i * stride); + uint16x8_t u = highbd_get_recon_8x8_neon(v, in[j], in[j + height], bd); + + vst1q_u16(output + i * stride, u); + } +} +/* Loads out_size vectors of four int32 values; vector i is read from in + i * stride. */ +static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, + int32x4_t *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = vld1q_s32(in + i * stride); + } +} +/* Loads 16 consecutive coefficients into four vectors: in[k] = coeff[4k .. 4k + 3]. */ +static INLINE void load_buffer_4x4(const int32_t *coeff, int32x4_t *in) { + in[0] = vld1q_s32(coeff + 0); + in[1] = vld1q_s32(coeff + 4); + in[2] = vld1q_s32(coeff + 8); + in[3] = vld1q_s32(coeff + 12); +} +/* Butterfly: *out0 = clamp(in0 + in1) and *out1 = clamp(in0 - in1), with clamp meaning [*clamp_lo, *clamp_hi] per lane. Both inputs are passed by value, so out0/out1 may point at the storage the inputs came from. */ +static void addsub_neon(const int32x4_t in0, const int32x4_t in1, + int32x4_t *out0, int32x4_t *out1, + const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) { + int32x4_t a0 = vaddq_s32(in0, in1); + int32x4_t a1 = vsubq_s32(in0, in1); + + a0 = vmaxq_s32(a0, *clamp_lo); + a0 = vminq_s32(a0, *clamp_hi); + a1 = vmaxq_s32(a1, *clamp_lo); + a1 = vminq_s32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} +/* In place: applies the rounding shift vrshlq_s32(x, *v_shift) to *in0 and *in1, then clamps both to [*clamp_lo, *clamp_hi]. */ +static void shift_and_clamp_neon(int32x4_t *in0, int32x4_t *in1, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_shift) { + int32x4_t in0_w_offset = vrshlq_s32(*in0, *v_shift); + int32x4_t in1_w_offset = vrshlq_s32(*in1, *v_shift); + + in0_w_offset = vmaxq_s32(in0_w_offset, *clamp_lo); + in0_w_offset = vminq_s32(in0_w_offset, *clamp_hi); + in1_w_offset = vmaxq_s32(in1_w_offset, *clamp_lo); + in1_w_offset = vminq_s32(in1_w_offset, *clamp_hi); + + *in0 = in0_w_offset; + *in1 = in1_w_offset; +} +/* Stage 4 of the 32-point inverse DCT, in place on bf1: rotates the pairs (17,30), (18,29), (21,26) and (22,25) with the cospi weights 8/56 and 40/24. The first output of each pair is parked in a temporary so the second output is still computed from the unmodified inputs. */ +static INLINE void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, temp2; + temp1 = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30], + v_bit, rnding); + bf1[30] = + half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], v_bit, rnding); + bf1[17] = temp1; + + temp2 = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], 
&bf1[29], + v_bit, rnding); + bf1[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29], + v_bit, rnding); + bf1[18] = temp2; + + temp1 = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26], + v_bit, rnding); + bf1[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], v_bit, + rnding); + bf1[21] = temp1; + + temp2 = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25], + v_bit, rnding); + bf1[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25], + v_bit, rnding); + bf1[22] = temp2; +} +/* Stage 5 of the 32-point inverse DCT, in place on bf1: rotates the pairs (9,14) and (10,13) with cospi[16]/cospi[48], then runs clamped add/sub butterflies over bf1[16..31]. The in-place butterflies are safe because addsub_neon takes its inputs by value. */ +static INLINE void idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, temp2; + temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[9], &cospi[48], &bf1[14], + v_bit, rnding); + bf1[14] = + half_btf_neon_r(&cospi[48], &bf1[9], &cospi[16], &bf1[14], v_bit, rnding); + bf1[9] = temp1; + + temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[10], &cospi[16], &bf1[13], + v_bit, rnding); + bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf1[10], &cospi[48], &bf1[13], + v_bit, rnding); + bf1[10] = temp2; + + addsub_neon(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_neon(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_neon(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_neon(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_neon(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_neon(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_neon(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_neon(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} +/* Stage 6 of the 32-point inverse DCT, in place on bf1: rotates the pair (5,6) with cospi[32], runs clamped butterflies over bf1[8..15], then rotates the pairs (18,29), (19,28), (20,27) and (21,26) with cospi[16]/cospi[48]. */ +static INLINE void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, 
temp2; + temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], + v_bit, rnding); + bf1[6] = + half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], v_bit, rnding); + bf1[5] = temp1; + + addsub_neon(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_neon(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_neon(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_neon(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29], + v_bit, rnding); + bf1[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], v_bit, + rnding); + bf1[18] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28], + v_bit, rnding); + bf1[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], v_bit, + rnding); + bf1[19] = temp2; + temp1 = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27], + v_bit, rnding); + bf1[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27], + v_bit, rnding); + bf1[20] = temp1; + temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26], + v_bit, rnding); + bf1[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26], + v_bit, rnding); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, temp2; + addsub_neon(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_neon(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_neon(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_neon(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13], + v_bit, rnding); + bf1[13] = half_btf_neon_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13], v_bit, + 
rnding); + bf1[10] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12], + v_bit, rnding); + bf1[12] = half_btf_neon_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12], v_bit, + rnding); + bf1[11] = temp2; + + addsub_neon(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_neon(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_neon(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_neon(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_neon(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_neon(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_neon(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_neon(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, temp2; + addsub_neon(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_neon(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_neon(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_neon(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_neon(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_neon(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_neon(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_neon(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], + v_bit, rnding); + bf1[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], v_bit, + rnding); + bf1[20] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], + v_bit, rnding); + bf1[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], v_bit, + rnding); + 
bf1[21] = temp2; + temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], + v_bit, rnding); + bf1[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], v_bit, + rnding); + bf1[22] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], + v_bit, rnding); + bf1[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], v_bit, + rnding); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_neon(int32x4_t *bf1, int32x4_t *out, + const int do_cols, const int bd, + const int out_shift, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi) { + addsub_neon(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_neon(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_neon(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_neon(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_neon(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_neon(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_neon(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_neon(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); + addsub_neon(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_neon(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_neon(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_neon(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_neon(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_neon(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_neon(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_neon(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + for (int 
i = 0; i < 32; i += 8) { + round_shift_4x4(out + i, out_shift); + round_shift_4x4(out + i + 4, out_shift); + } + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} +/* Computes *out0 = clamp((*offset + *in0) shifted by *v_shift) and *out1 = clamp((*offset - *in1) shifted by *v_shift), i.e. the second input is negated before the shift. The shift is the plain vshlq_s32; any rounding has to come from *offset. */ +static void neg_shift_neon(const int32x4_t *in0, const int32x4_t *in1, + int32x4_t *out0, int32x4_t *out1, + const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, + const int32x4_t *v_shift, int32x4_t *offset) { + int32x4_t a0 = vaddq_s32(*offset, *in0); + int32x4_t a1 = vsubq_s32(*offset, *in1); + + a0 = vshlq_s32(a0, *v_shift); + a1 = vshlq_s32(a1, *v_shift); + + a0 = vmaxq_s32(a0, *clamp_lo); + a0 = vminq_s32(a0, *clamp_hi); + a1 = vmaxq_s32(a1, *clamp_lo); + a1 = vminq_s32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} +/* 4-point inverse DCT over the four vectors in[0..3], one transform per lane, written to out[0..3]; in and out may be the same array because the inputs are copied into locals first. Butterfly results are clamped to a signed range of max(16, bd + 6) bits when do_cols is set and max(16, bd + 8) bits otherwise. When do_cols is 0 the outputs are additionally rounding-shifted by out_shift and re-clamped to max(16, bd + 6) bits. */ +static void idct4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + int32x4_t u0, u1, u2, u3; + int32x4_t v0, v1, v2, v3, x, y; + + // Stage 0-1-2 + + u0 = in[0]; + u1 = in[1]; + u2 = in[2]; + u3 = in[3]; + + const int32x4_t v_bit = vdupq_n_s32(-bit); + + x = vmlaq_n_s32(rnding, u0, cospi[32]); + y = vmulq_n_s32(u2, cospi[32]); + v0 = vaddq_s32(x, y); + v0 = vshlq_s32(v0, v_bit); + + v1 = vsubq_s32(x, y); + v1 = vshlq_s32(v1, v_bit); + + x = vmlaq_n_s32(rnding, u1, cospi[48]); + v2 = vmlsq_n_s32(x, u3, cospi[16]); + v2 = vshlq_s32(v2, v_bit); + + x = vmlaq_n_s32(rnding, u1, cospi[16]); + v3 = vmlaq_n_s32(x, u3, cospi[48]); + v3 = vshlq_s32(v3, v_bit); + // Stage 3 + addsub_neon(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi); + addsub_neon(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi); + + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + 
const int32x4_t v_shift = vdupq_n_s32(-out_shift); + shift_and_clamp_neon(out + 0, out + 3, &clamp_lo, &clamp_hi, &v_shift); + shift_and_clamp_neon(out + 1, out + 2, &clamp_lo, &clamp_hi, &v_shift); + } +} + +static void iadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *sinpi = sinpi_arr(bit); + const int32x4_t zero = vdupq_n_s32(0); + int64x2_t rnding = vdupq_n_s64(1ll << (bit + 4 - 1)); + const int32x2_t mul = vdup_n_s32(1 << 4); + int32x4_t t; + int32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + int32x4_t x0, x1, x2, x3; + int32x4_t u0, u1, u2, u3; + + x0 = in[0]; + x1 = in[1]; + x2 = in[2]; + x3 = in[3]; + + s0 = vmulq_n_s32(x0, sinpi[1]); + s1 = vmulq_n_s32(x0, sinpi[2]); + s2 = vmulq_n_s32(x1, sinpi[3]); + s3 = vmulq_n_s32(x2, sinpi[4]); + s4 = vmulq_n_s32(x2, sinpi[1]); + s5 = vmulq_n_s32(x3, sinpi[2]); + s6 = vmulq_n_s32(x3, sinpi[4]); + t = vsubq_s32(x0, x2); + s7 = vaddq_s32(t, x3); + + t = vaddq_s32(s0, s3); + s0 = vaddq_s32(t, s5); + t = vsubq_s32(s1, s4); + s1 = vsubq_s32(t, s6); + s3 = s2; + s2 = vmulq_n_s32(s7, sinpi[3]); + + u0 = vaddq_s32(s0, s3); + u1 = vaddq_s32(s1, s3); + u2 = s2; + t = vaddq_s32(s0, s1); + u3 = vsubq_s32(t, s3); + + // u0 + int32x4x2_t u0x; + u0x.val[0] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul)); + u0x.val[0] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u0x.val[0]), rnding)); + + u0 = vextq_s32(u0, zero, 1); + u0x.val[1] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul)); + u0x.val[1] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u0x.val[1]), rnding)); + + u0x.val[0] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u0x.val[0]), vreinterpretq_s16_s32(zero), 1)); + u0x.val[1] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u0x.val[1]), vreinterpretq_s16_s32(zero), 1)); + + u0x = vzipq_s32(u0x.val[0], u0x.val[1]); +#if AOM_ARCH_AARCH64 + u0 = 
vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u0x.val[0]), + vreinterpretq_s64_s32(u0x.val[1]))); +#else + u0 = vcombine_s32(vget_low_s32(u0x.val[0]), vget_low_s32(u0x.val[1])); +#endif // AOM_ARCH_AARCH64 + // u1 + int32x4x2_t u1x; + u1x.val[0] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul)); + u1x.val[0] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u1x.val[0]), rnding)); + + u1 = vextq_s32(u1, zero, 1); + u1x.val[1] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul)); + u1x.val[1] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u1x.val[1]), rnding)); + + u1x.val[0] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u1x.val[0]), vreinterpretq_s16_s32(zero), 1)); + u1x.val[1] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u1x.val[1]), vreinterpretq_s16_s32(zero), 1)); + + u1x = vzipq_s32(u1x.val[0], u1x.val[1]); +#if AOM_ARCH_AARCH64 + u1 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u1x.val[0]), + vreinterpretq_s64_s32(u1x.val[1]))); +#else + u1 = vcombine_s32(vget_low_s32(u1x.val[0]), vget_low_s32(u1x.val[1])); +#endif // AOM_ARCH_AARCH64 + + // u2 + int32x4x2_t u2x; + u2x.val[0] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul)); + u2x.val[0] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u2x.val[0]), rnding)); + + u2 = vextq_s32(u2, zero, 1); + u2x.val[1] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul)); + u2x.val[1] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u2x.val[1]), rnding)); + + u2x.val[0] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u2x.val[0]), vreinterpretq_s16_s32(zero), 1)); + u2x.val[1] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u2x.val[1]), vreinterpretq_s16_s32(zero), 1)); + + u2x = vzipq_s32(u2x.val[0], u2x.val[1]); +#if AOM_ARCH_AARCH64 + u2 = 
vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u2x.val[0]), + vreinterpretq_s64_s32(u2x.val[1]))); +#else + u2 = vcombine_s32(vget_low_s32(u2x.val[0]), vget_low_s32(u2x.val[1])); +#endif // AOM_ARCH_AARCH64 + + // u3 + int32x4x2_t u3x; + u3x.val[0] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul)); + u3x.val[0] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u3x.val[0]), rnding)); + + u3 = vextq_s32(u3, zero, 1); + u3x.val[1] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul)); + u3x.val[1] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u3x.val[1]), rnding)); + + u3x.val[0] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u3x.val[0]), vreinterpretq_s16_s32(zero), 1)); + u3x.val[1] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u3x.val[1]), vreinterpretq_s16_s32(zero), 1)); + + u3x = vzipq_s32(u3x.val[0], u3x.val[1]); +#if AOM_ARCH_AARCH64 + u3 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u3x.val[0]), + vreinterpretq_s64_s32(u3x.val[1]))); +#else + u3 = vcombine_s32(vget_low_s32(u3x.val[0]), vget_low_s32(u3x.val[1])); +#endif // AOM_ARCH_AARCH64 + + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4); + } +} + +static void write_buffer_4x4(int32x4_t *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + uint32x4_t u0, u1, u2, u3; + uint16x4_t v0, v1, v2, v3; + round_shift_4x4(in, shift); + + v0 = vld1_u16(output + 0 * stride); + v1 = vld1_u16(output + 1 * stride); + v2 = vld1_u16(output + 2 * stride); + v3 = vld1_u16(output + 3 * stride); + + if (fliplr) { + u0 = vrev64q_u32(vreinterpretq_u32_s32(in[0])); + in[0] = 
vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); + u0 = vrev64q_u32(vreinterpretq_u32_s32(in[1])); + in[1] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); + u0 = vrev64q_u32(vreinterpretq_u32_s32(in[2])); + in[2] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); + u0 = vrev64q_u32(vreinterpretq_u32_s32(in[3])); + in[3] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); + } + + if (flipud) { + u0 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v0); + u1 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v1); + u2 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v2); + u3 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v3); + } else { + u0 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v0); + u1 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v1); + u2 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v2); + u3 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v3); + } + + uint16x8_t u4 = vcombine_u16(vqmovn_u32(u0), vqmovn_u32(u1)); + uint16x8_t u5 = vcombine_u16(vqmovn_u32(u2), vqmovn_u32(u3)); + const uint16x8_t vmin = vdupq_n_u16(0); + const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1); + u4 = highbd_clamp_u16(&u4, &vmin, &vmax); + u5 = highbd_clamp_u16(&u5, &vmin, &vmax); + + vst1_u16(output + 0 * stride, vget_low_u16(u4)); + vst1_u16(output + 1 * stride, vget_high_u16(u4)); + vst1_u16(output + 2 * stride, vget_low_u16(u5)); + vst1_u16(output + 3 * stride, vget_high_u16(u5)); +} + +static void iidentity4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + int32x4_t zero = vdupq_n_s32(0); + int32x2_t fact = vdup_n_s32(NewSqrt2); + int32x4x2_t a0; + const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1)); + + for (int i = 0; i < 4; i++) { + a0.val[0] = vreinterpretq_s32_s64( + vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact)); + a0.val[0] = vreinterpretq_s32_s64( + vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits)); + a0.val[1] = vextq_s32(in[i], zero, 1); + a0.val[1] = vreinterpretq_s32_s64( + vmlal_s32(rnding, 
vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact)); + a0.val[1] = vreinterpretq_s32_s64( + vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits)); + + a0 = vzipq_s32(a0.val[0], a0.val[1]); +#if AOM_ARCH_AARCH64 + out[i] = vreinterpretq_s32_s64(vzip1q_s64( + vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1]))); +#else + out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2); +#endif + } + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4); + } +} + +void av1_inv_txfm2d_add_4x4_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + int32x4_t in[4]; + + const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_4x4(input, in); + idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_4x4(input, in); + idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, in); + idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + 
transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case IDTX: + load_buffer_4x4(input, in); + iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_DCT: + load_buffer_4x4(input, in); + iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_DCT: + load_buffer_4x4(input, in); + idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_ADST: + load_buffer_4x4(input, in); + iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + 
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_ADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_FLIPADST: + load_buffer_4x4(input, in); + iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case H_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + default: assert(0); + } +} + +// 8x8 +/* Reads 64 consecutive int32 coefficients from coeff into in[0..15], + * four values per vector: in[i] holds coeff[4 * i .. 4 * i + 3]. + */ +static void load_buffer_8x8(const int32_t *coeff, int32x4_t *in) { + for (int i = 0; i < 16; ++i) { + in[i] = vld1q_s32(coeff + 4 * i); + } +} + +static void idct8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t u0, u1, u2, u3, u4, u5, u6, u7; + int32x4_t v0, v1, v2, v3, v4, v5, v6, v7; + int32x4_t x, y; + int col; + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + const int32x4_t v_bit = vdupq_n_s32(-bit); + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). + for (col = 0; col < 2; ++col) { + // stage 0 + // stage 1 + // stage 2 + u0 = in[0 * 2 + col]; + u1 = in[4 * 2 + col]; + u2 = in[2 * 2 + col]; + u3 = in[6 * 2 + col]; + + x = vmulq_n_s32(in[1 * 2 + col], cospi[56]); + u4 = vmlaq_n_s32(x, in[7 * 2 + col], -cospi[8]); + u4 = vaddq_s32(u4, rnding); + u4 = vshlq_s32(u4, v_bit); + + x = vmulq_n_s32(in[1 * 2 + col], cospi[8]); + u7 = vmlaq_n_s32(x, in[7 * 2 + col], cospi[56]); + u7 = vaddq_s32(u7, rnding); + u7 = vshlq_s32(u7, v_bit); + + x = vmulq_n_s32(in[5 * 2 + col], cospi[24]); + u5 = vmlaq_n_s32(x, in[3 * 2 + col], -cospi[40]); + u5 = vaddq_s32(u5, rnding); + u5 = vshlq_s32(u5, v_bit); + + x = vmulq_n_s32(in[5 * 2 + col], cospi[40]); + u6 = vmlaq_n_s32(x, in[3 * 2 + col], cospi[24]); + u6 = vaddq_s32(u6, rnding); + u6 = vshlq_s32(u6, v_bit); + + // stage 3 + x = vmulq_n_s32(u0, cospi[32]); + y = vmulq_n_s32(u1, cospi[32]); + v0 = vaddq_s32(x, y); + v0 = vaddq_s32(v0, rnding); + v0 = vshlq_s32(v0, v_bit); + + v1 = vsubq_s32(x, y); + v1 = vaddq_s32(v1, rnding); + v1 = vshlq_s32(v1, v_bit); + + x = vmulq_n_s32(u2, cospi[48]); + v2 = vmlaq_n_s32(x, u3, -cospi[16]); + v2 = vaddq_s32(v2, rnding); + v2 = vshlq_s32(v2, v_bit); + + x = vmulq_n_s32(u2, cospi[16]); + v3 = vmlaq_n_s32(x, u3, cospi[48]); + v3 = vaddq_s32(v3, rnding); + v3 = vshlq_s32(v3, v_bit); + + addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + 
addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = vmulq_n_s32(v5, cospi[32]); + y = vmulq_n_s32(v6, cospi[32]); + u6 = vaddq_s32(y, x); + u6 = vaddq_s32(u6, rnding); + u6 = vshlq_s32(u6, v_bit); + + u5 = vsubq_s32(y, x); + u5 = vaddq_s32(u5, rnding); + u5 = vshlq_s32(u5, v_bit); + + // stage 5 + addsub_neon(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_neon(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_neon(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_neon(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int32x4_t kZero = vdupq_n_s32(0); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t u[8], v[8], x; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-1-2 + // (1) + u[0] = vmlaq_n_s32(rnding, in[14], cospi[4]); + u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]); + u[0] = vshlq_s32(u[0], v_bit); + + u[1] = vmlaq_n_s32(rnding, in[14], cospi[60]); + u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]); + u[1] = vshlq_s32(u[1], v_bit); + + // (2) + u[2] = vmlaq_n_s32(rnding, in[10], cospi[20]); + u[2] = vmlaq_n_s32(u[2], in[4], cospi[44]); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vmlaq_n_s32(rnding, in[10], cospi[44]); + u[3] = vmlsq_n_s32(u[3], in[4], cospi[20]); + u[3] = vshlq_s32(u[3], v_bit); + + // (3) + u[4] = vmlaq_n_s32(rnding, in[6], cospi[36]); + u[4] = vmlaq_n_s32(u[4], in[8], cospi[28]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, in[6], cospi[28]); + u[5] = vmlsq_n_s32(u[5], in[8], cospi[36]); + u[5] = vshlq_s32(u[5], v_bit); + + // (4) + u[6] = vmlaq_n_s32(rnding, in[2], cospi[52]); + u[6] = vmlaq_n_s32(u[6], in[12], cospi[12]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(rnding, in[2], cospi[12]); + u[7] = vmlsq_n_s32(u[7], in[12], cospi[52]); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 3 + addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]); + u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]); + u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vmlaq_n_s32(rnding, v[7], 
cospi[16]); + u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(rnding, v[7], cospi[48]); + u[7] = vmlaq_n_s32(u[7], v[6], cospi[16]); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 5 + addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]); + x = vmulq_n_s32(v[3], cospi[32]); + u[2] = vaddq_s32(v[0], x); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vsubq_s32(v[0], x); + u[3] = vshlq_s32(u[3], v_bit); + + v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]); + x = vmulq_n_s32(v[7], cospi[32]); + u[6] = vaddq_s32(v[0], x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vsubq_s32(v[0], x); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[2] = vsubq_s32(kZero, u[4]); + out[4] = u[6]; + out[6] = vsubq_s32(kZero, u[2]); + out[8] = u[3]; + out[10] = vsubq_s32(kZero, u[7]); + out[12] = u[5]; + out[14] = vsubq_s32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&u[0], &u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[6], &u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[3], &u[7], out + 8, out + 10, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[5], &u[1], out + 12, out + 14, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + } + + // Odd 8 points: 1, 3, ..., 15 + // 
stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = vmlaq_n_s32(rnding, in[15], cospi[4]); + u[0] = vmlaq_n_s32(u[0], in[1], cospi[60]); + u[0] = vshlq_s32(u[0], v_bit); + + u[1] = vmlaq_n_s32(rnding, in[15], cospi[60]); + u[1] = vmlsq_n_s32(u[1], in[1], cospi[4]); + u[1] = vshlq_s32(u[1], v_bit); + + // (2) + u[2] = vmlaq_n_s32(rnding, in[11], cospi[20]); + u[2] = vmlaq_n_s32(u[2], in[5], cospi[44]); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vmlaq_n_s32(rnding, in[11], cospi[44]); + u[3] = vmlsq_n_s32(u[3], in[5], cospi[20]); + u[3] = vshlq_s32(u[3], v_bit); + + // (3) + u[4] = vmlaq_n_s32(rnding, in[7], cospi[36]); + u[4] = vmlaq_n_s32(u[4], in[9], cospi[28]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, in[7], cospi[28]); + u[5] = vmlsq_n_s32(u[5], in[9], cospi[36]); + u[5] = vshlq_s32(u[5], v_bit); + + // (4) + u[6] = vmlaq_n_s32(rnding, in[3], cospi[52]); + u[6] = vmlaq_n_s32(u[6], in[13], cospi[12]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(rnding, in[3], cospi[12]); + u[7] = vmlsq_n_s32(u[7], in[13], cospi[52]); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 3 + addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]); + u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]); + u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]); + u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]); + u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 5 + addsub_neon(u[0], u[2], 
&v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]); + x = vmulq_n_s32(v[3], cospi[32]); + u[2] = vaddq_s32(v[0], x); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vsubq_s32(v[0], x); + u[3] = vshlq_s32(u[3], v_bit); + + v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]); + x = vmulq_n_s32(v[7], cospi[32]); + u[6] = vaddq_s32(v[0], x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vsubq_s32(v[0], x); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 7 + if (do_cols) { + out[1] = u[0]; + out[3] = vsubq_s32(kZero, u[4]); + out[5] = u[6]; + out[7] = vsubq_s32(kZero, u[2]); + out[9] = u[3]; + out[11] = vsubq_s32(kZero, u[7]); + out[13] = u[5]; + out[15] = vsubq_s32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&u[0], &u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[6], &u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[3], &u[7], out + 9, out + 11, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[5], &u[1], out + 13, out + 15, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + } +} + +static void iidentity8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + out[0] = vaddq_s32(in[0], in[0]); + out[1] = vaddq_s32(in[1], in[1]); + out[2] = vaddq_s32(in[2], in[2]); + out[3] = vaddq_s32(in[3], in[3]); + out[4] = vaddq_s32(in[4], 
in[4]); + out[5] = vaddq_s32(in[5], in[5]); + out[6] = vaddq_s32(in[6], in[6]); + out[7] = vaddq_s32(in[7], in[7]); + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 8); + } +} + +/* Adds one row of residual to the eight predicted pixels in pred and returns + * the sum after highbd_clamp_u16 with bounds 0 and (1 << bd) - 1. + * Without fliplr, res_lo is added to the low four lanes of pred and res_hi to + * the high four. With fliplr, each residual vector is lane-reversed + * (vrev64q_s32 followed by vextq_s32 by 2) and the two vectors swap places, + * so the residual row is mirrored before the add. + */ +static uint16x8_t get_recon_8x8(const uint16x8_t pred, int32x4_t res_lo, + int32x4_t res_hi, int fliplr, int bd) { + uint16x8x2_t x; + + if (fliplr) { + res_lo = vrev64q_s32(res_lo); + res_lo = vextq_s32(res_lo, res_lo, 2); + res_hi = vrev64q_s32(res_hi); + res_hi = vextq_s32(res_hi, res_hi, 2); + x.val[0] = vreinterpretq_u16_s32( + vaddw_s16(res_hi, vreinterpret_s16_u16(vget_low_u16(pred)))); + x.val[1] = vreinterpretq_u16_s32( + vaddw_s16(res_lo, vreinterpret_s16_u16(vget_high_u16(pred)))); + + } else { + x.val[0] = vreinterpretq_u16_s32( + vaddw_s16(res_lo, vreinterpret_s16_u16(vget_low_u16(pred)))); + x.val[1] = vreinterpretq_u16_s32( + vaddw_s16(res_hi, vreinterpret_s16_u16(vget_high_u16(pred)))); + } + + /* NOTE(review): the 32-bit sums are signed but are narrowed with the + * unsigned vqmovn_u32, so a negative sum saturates to 0xFFFF here. Whether + * it then ends up at 0 depends on how highbd_clamp_u16 compares - confirm. + */ + uint16x8_t x2 = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])), + vqmovn_u32(vreinterpretq_u32_u16(x.val[1]))); + const uint16x8_t vmin = vdupq_n_u16(0); + const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1); + return highbd_clamp_u16(&x2, &vmin, &vmax); +} + +static void write_buffer_8x8(int32x4_t *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + uint16x8_t u0, u1, u2, u3, u4, u5, u6, u7; + uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7; + round_shift_8x8(in, shift); + + v0 = vld1q_u16(output + 0 * stride); + v1 = vld1q_u16(output + 1 * stride); + v2 = vld1q_u16(output + 2 * stride); + v3 = vld1q_u16(output + 3 * stride); + v4 = vld1q_u16(output + 4 * stride); + v5 = vld1q_u16(output + 5 * stride); + v6 = vld1q_u16(output + 6 * stride); + v7 = vld1q_u16(output + 7 * 
stride); + + if (flipud) { + u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd); + u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd); + u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd); + u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd); + u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd); + u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd); + u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd); + u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd); + } else { + u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd); + u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd); + u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd); + u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd); + u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd); + u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd); + u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd); + u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd); + } + + vst1q_u16(output + 0 * stride, u0); + vst1q_u16(output + 1 * stride, u1); + vst1q_u16(output + 2 * stride, u2); + vst1q_u16(output + 3 * stride, u3); + vst1q_u16(output + 4 * stride, u4); + vst1q_u16(output + 5 * stride, u5); + vst1q_u16(output + 6 * stride, u6); + vst1q_u16(output + 7 * stride, u7); +} + +void av1_inv_txfm2d_add_8x8_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + int32x4_t in[16], out[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, in); + idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_8x8(input, in); + idct8x8_neon(in, out, INV_COS_BIT, 
0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, in); + idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); + break; + default: assert(0); + } +} + +/* 8-point inverse DCT shortcut that reads only in[0]. + * Computes (in[0] * cospi[32] + (1 << (bit - 1))) >> bit. When do_cols is 0 + * it also applies a rounded right shift by out_shift and switches the clamp + * range to AOMMAX(16, bd + 6) bits. The clamped vector is written to all of + * out[0..7]. + */ +static void idct8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t x; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-1-2-3 + /* The rounding term must be added before the shift, as idct8x8_new_neon and + * idct16x16_low1_neon do; adding it afterwards offsets every output by + * 1 << (bit - 1). + */ + x = vmlaq_n_s32(rnding, in[0], cospi[32]); + x = vshlq_s32(x, v_bit); + + // stage 4-5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1))); + clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + x = vaddq_s32(x, offset); + x = vshlq_s32(x, vdupq_n_s32(-out_shift)); + } + + x = vmaxq_s32(x, clamp_lo); + x = vminq_s32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; +} + +static void idct8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t u0, u1, u2, u3, u4, u5, u6, u7; + int32x4_t v0, v1, v2, v3, v4, v5, v6, v7; + int32x4_t x, y; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = vmlaq_n_s32(rnding, in[1], cospi[56]); + u4 = vmlaq_n_s32(x, in[7], -cospi[8]); + u4 = vshlq_s32(u4, v_bit); + + x = vmlaq_n_s32(rnding, in[1], cospi[8]); + u7 = vmlaq_n_s32(x, in[7], cospi[56]); + u7 = vshlq_s32(u7, v_bit); + + x = vmlaq_n_s32(rnding, in[5], cospi[24]); + u5 = vmlaq_n_s32(x, in[3], -cospi[40]); + u5 = vshlq_s32(u5, v_bit); + + x = vmlaq_n_s32(rnding, in[5], cospi[40]); + u6 = vmlaq_n_s32(x, in[3], cospi[24]); + u6 = vshlq_s32(u6, v_bit); + + // stage 3 + x = vmlaq_n_s32(rnding, u0, cospi[32]); + y = vmulq_n_s32(u1, cospi[32]); + v0 = vaddq_s32(x, y); + v0 = vshlq_s32(v0, v_bit); + + v1 = vsubq_s32(x, y); + v1 = vshlq_s32(v1, v_bit); + + x = vmlaq_n_s32(rnding, u2, cospi[48]); + v2 = vmlaq_n_s32(x, u3, -cospi[16]); + v2 = vshlq_s32(v2, v_bit); + + x = vmlaq_n_s32(rnding, u2, cospi[16]); + v3 = vmlaq_n_s32(x, u3, cospi[48]); + v3 = vshlq_s32(v3, v_bit); + + addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = vmulq_n_s32(v5, cospi[32]); + y = vmlaq_n_s32(rnding, v6, cospi[32]); + u6 = vaddq_s32(y, x); + u6 = vshlq_s32(u6, v_bit); + + u5 = vsubq_s32(y, x); + u5 = vshlq_s32(u5, v_bit); + + // stage 5 + addsub_neon(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_neon(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_neon(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + 
addsub_neon(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 8); + } +} + +/* 8-point inverse ADST shortcut that reads only in[0]. + * Every butterfly is a multiply-accumulate seeded with 1 << (bit - 1) and + * followed by a right shift by bit. When do_cols is nonzero the results are + * stored to out[0..7] with the odd-indexed outputs negated; otherwise the + * same operand pairs are passed to neg_shift_neon together with out_shift + * and the AOMMAX(16, bd + 6)-bit clamp bounds. + */ +static void iadst8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + int32x4_t u[8], x; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-2 + + u[0] = vmlaq_n_s32(rnding, in[0], cospi[60]); + u[0] = vshlq_s32(u[0], v_bit); + + /* u[1] = (rnding - in[0] * cospi[4]) >> bit, matching iadst8x8_new_neon. + * Negating after the rounding term was added would subtract the rounding + * and leave the result one too low. + */ + u[1] = vmlsq_n_s32(rnding, in[0], cospi[4]); + u[1] = vshlq_s32(u[1], v_bit); + + // stage 3-4 + int32x4_t temp1, temp2; + temp1 = vmlaq_n_s32(rnding, u[0], cospi[16]); + temp1 = vmlaq_n_s32(temp1, u[1], cospi[48]); + temp1 = vshlq_s32(temp1, v_bit); + u[4] = temp1; + + temp2 = vmlaq_n_s32(rnding, u[0], cospi[48]); + u[5] = vmlsq_n_s32(temp2, u[1], cospi[16]); + u[5] = vshlq_s32(u[5], v_bit); + + // stage 5-6 + temp1 = vmlaq_n_s32(rnding, u[0], cospi[32]); + x = vmulq_n_s32(u[1], cospi[32]); + u[2] = vaddq_s32(temp1, x); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vsubq_s32(temp1, x); + u[3] = vshlq_s32(u[3], v_bit); + + temp1 = vmlaq_n_s32(rnding, u[4], cospi[32]); + x = vmulq_n_s32(u[5], cospi[32]); + u[6] = vaddq_s32(temp1, x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vsubq_s32(temp1, x); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = vnegq_s32(u[4]); + out[2] = u[6]; + out[3] = vnegq_s32(u[2]); + out[4] = u[3]; + out[5] = vnegq_s32(u[7]); + out[6] = u[5]; + out[7] = vnegq_s32(u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 
1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + } +} + +static void iadst8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + // const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t u[8], v[8], x; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-2 + + u[0] = vmlaq_n_s32(rnding, in[7], cospi[4]); + u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]); + u[0] = vshlq_s32(u[0], v_bit); + + u[1] = vmlaq_n_s32(rnding, in[7], cospi[60]); + u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]); + u[1] = vshlq_s32(u[1], v_bit); + + // (2) + u[2] = vmlaq_n_s32(rnding, in[5], cospi[20]); + u[2] = vmlaq_n_s32(u[2], in[2], cospi[44]); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vmlaq_n_s32(rnding, in[5], cospi[44]); + u[3] = vmlsq_n_s32(u[3], in[2], cospi[20]); + u[3] = vshlq_s32(u[3], v_bit); + + // (3) + u[4] = vmlaq_n_s32(rnding, in[3], cospi[36]); + u[4] = vmlaq_n_s32(u[4], in[4], cospi[28]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, in[3], cospi[28]); + u[5] = vmlsq_n_s32(u[5], in[4], cospi[36]); + u[5] = vshlq_s32(u[5], v_bit); + + // (4) + u[6] = vmulq_n_s32(in[1], cospi[52]); 
+ u[6] = vmlaq_n_s32(u[6], in[6], cospi[12]); + u[6] = vaddq_s32(u[6], rnding); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmulq_n_s32(in[1], cospi[12]); + u[7] = vmlsq_n_s32(u[7], in[6], cospi[52]); + u[7] = vaddq_s32(u[7], rnding); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 3 + addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]); + u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]); + u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vmlsq_n_s32(rnding, v[6], cospi[48]); + u[6] = vmlaq_n_s32(u[6], v[7], cospi[16]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]); + u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 5 + addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]); + x = vmulq_n_s32(v[3], cospi[32]); + u[2] = vaddq_s32(v[0], x); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vsubq_s32(v[0], x); + u[3] = vshlq_s32(u[3], v_bit); + + v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]); + x = vmulq_n_s32(v[7], cospi[32]); + u[6] = vaddq_s32(v[0], x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vsubq_s32(v[0], x); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = vnegq_s32(u[4]); + out[2] = u[6]; + out[3] = 
vnegq_s32(u[2]); + out[4] = u[3]; + out[5] = vnegq_s32(u[7]); + out[6] = u[5]; + out[7] = vnegq_s32(u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + } +} + +/* 16-point inverse DCT shortcut that reads only in[0]. + * Computes (in[0] * cospi[32] + (1 << (bit - 1))) >> bit. When do_cols is 0 + * the clamp range becomes AOMMAX(16, bd + 6) bits and, if out_shift is + * nonzero, a rounded right shift by out_shift is applied first. The clamped + * vector is copied to out[0..15]. + * Note: the intermediate results are stored back into in[0], so the caller's + * input vector is overwritten. + */ +static void idct16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-4 + in[0] = vmlaq_n_s32(rnding, in[0], cospi[32]); + in[0] = vshlq_s32(in[0], v_bit); + + // stage 5-7 + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + if (out_shift != 0) { + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + in[0] = vaddq_s32(in[0], offset); + in[0] = vshlq_s32(in[0], vdupq_n_s32(-out_shift)); + } + } + + in[0] = vmaxq_s32(in[0], clamp_lo); + in[0] = vminq_s32(in[0], clamp_hi); + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; +} + +static void idct16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + int32x4_t u[16], x, y; + // stage 0-1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); + u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); + + u[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding); + u[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding); + + u[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding); + u[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding); + + u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding); + u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding); + + // stage 3 + u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding); + u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding); + u[5] = half_btf_0_m_neon_r(&cospi[40], &u[6], &v_bit, &rnding); + u[6] = half_btf_0_neon_r(&cospi[24], &u[6], &v_bit, &rnding); + + addsub_neon(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_neon(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = vmlaq_n_s32(rnding, u[0], cospi[32]); + u[0] = vshlq_s32(x, v_bit); + u[1] = u[0]; + + u[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding); + u[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding); + + addsub_neon(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_neon(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + + x = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, + &rnding); + u[14] = + half_btf_neon_r(&cospi[48], 
&u[9], &cospi[16], &u[14], &v_bit, &rnding); + u[9] = x; + y = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit, + &rnding); + u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit, + &rnding); + u[10] = y; + + // stage 5 + addsub_neon(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = vmulq_n_s32(u[5], cospi[32]); + y = vmlaq_n_s32(rnding, u[6], cospi[32]); + u[5] = vsubq_s32(y, x); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vaddq_s32(y, x); + u[6] = vshlq_s32(u[6], v_bit); + + addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_neon(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = vmulq_n_s32(u[10], cospi[32]); + y = vmlaq_n_s32(rnding, u[13], cospi[32]); + u[10] = vsubq_s32(y, x); + u[10] = vshlq_s32(u[10], v_bit); + + u[13] = vaddq_s32(x, y); + u[13] = vshlq_s32(u[13], v_bit); + + x = vmulq_n_s32(u[11], cospi[32]); + y = vmlaq_n_s32(rnding, u[12], cospi[32]); + u[11] = vsubq_s32(y, x); + u[11] = vshlq_s32(u[11], v_bit); + + u[12] = vaddq_s32(x, y); + u[12] = vshlq_s32(u[12], v_bit); + // stage 7 + addsub_neon(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_neon(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_neon(u[6], 
u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_neon(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + int32x4_t v[16], x, y, temp1, temp2; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0 + // stage 1 + // stage 2 + v[0] = vmlaq_n_s32(rnding, in[0], cospi[62]); + v[0] = vshlq_s32(v[0], v_bit); + + v[1] = vmlsq_n_s32(rnding, in[0], cospi[2]); + v[1] = vshlq_s32(v[1], v_bit); + + // stage 3 + v[8] = v[0]; + v[9] = v[1]; + + // stage 4 + temp1 = vmlaq_n_s32(rnding, v[8], cospi[8]); + temp1 = vmlaq_n_s32(temp1, v[9], cospi[56]); + temp1 = vshlq_s32(temp1, v_bit); + + temp2 = vmlaq_n_s32(rnding, v[8], cospi[56]); + temp2 = vmlsq_n_s32(temp2, v[9], cospi[8]); + temp2 = vshlq_s32(temp2, v_bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + temp1 = vmlaq_n_s32(rnding, v[4], cospi[16]); + temp1 = vmlaq_n_s32(temp1, v[5], cospi[48]); + temp1 = vshlq_s32(temp1, v_bit); + + temp2 = vmlaq_n_s32(rnding, v[4], cospi[48]); + temp2 = vmlsq_n_s32(temp2, v[5], cospi[16]); + temp2 = vshlq_s32(temp2, v_bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = vmlaq_n_s32(rnding, v[12], cospi[16]); + temp1 = vmlaq_n_s32(temp1, v[13], cospi[48]); + temp1 = vshlq_s32(temp1, v_bit); + + temp2 = vmlaq_n_s32(rnding, v[12], cospi[48]); + temp2 = vmlsq_n_s32(temp2, v[13], cospi[16]); + temp2 = vshlq_s32(temp2, v_bit); + v[12] = temp1; + v[13] = temp2; + + // stage 7 
+ v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; + + // stage 8 + y = vmlaq_n_s32(rnding, v[2], cospi[32]); + x = vmulq_n_s32(v[3], cospi[32]); + v[2] = vaddq_s32(y, x); + v[2] = vshlq_s32(v[2], v_bit); + + v[3] = vsubq_s32(y, x); + v[3] = vshlq_s32(v[3], v_bit); + + y = vmlaq_n_s32(rnding, v[6], cospi[32]); + x = vmulq_n_s32(v[7], cospi[32]); + v[6] = vaddq_s32(y, x); + v[6] = vshlq_s32(v[6], v_bit); + + v[7] = vsubq_s32(y, x); + v[7] = vshlq_s32(v[7], v_bit); + + y = vmlaq_n_s32(rnding, v[10], cospi[32]); + x = vmulq_n_s32(v[11], cospi[32]); + v[10] = vaddq_s32(y, x); + v[10] = vshlq_s32(v[10], v_bit); + + v[11] = vsubq_s32(y, x); + v[11] = vshlq_s32(v[11], v_bit); + + y = vmlaq_n_s32(rnding, v[14], cospi[32]); + x = vmulq_n_s32(v[15], cospi[32]); + v[14] = vaddq_s32(y, x); + v[14] = vshlq_s32(v[14], v_bit); + + v[15] = vsubq_s32(y, x); + v[15] = vshlq_s32(v[15], v_bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = vnegq_s32(v[8]); + out[2] = v[12]; + out[3] = vnegq_s32(v[4]); + out[4] = v[6]; + out[5] = vnegq_s32(v[14]); + out[6] = v[10]; + out[7] = vnegq_s32(v[2]); + out[8] = v[3]; + out[9] = vnegq_s32(v[11]); + out[10] = v[15]; + out[11] = vnegq_s32(v[7]); + out[12] = v[5]; + out[13] = vnegq_s32(v[13]); + out[14] = v[9]; + out[15] = vnegq_s32(v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + 
neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + } +} + +static void iadst16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t zero = vdupq_n_s32(0); + int32x4_t u[16], x, y; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-2 + u[0] = vmlaq_n_s32(rnding, in[0], cospi[62]); + u[0] = vshlq_s32(u[0], v_bit); + + u[1] = vmlsq_n_s32(rnding, in[0], cospi[2]); + u[1] = vshlq_s32(u[1], v_bit); + + u[2] = vmlaq_n_s32(rnding, in[2], cospi[54]); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vmlsq_n_s32(rnding, in[2], cospi[10]); + u[3] = vshlq_s32(u[3], v_bit); + + u[4] = vmlaq_n_s32(rnding, in[4], cospi[46]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlsq_n_s32(rnding, in[4], cospi[18]); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vmlaq_n_s32(rnding, in[6], cospi[38]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlsq_n_s32(rnding, in[6], cospi[26]); + u[7] = vshlq_s32(u[7], v_bit); + + u[8] = vmlaq_n_s32(rnding, in[7], cospi[34]); + u[8] = vshlq_s32(u[8], v_bit); + + u[9] = vmlaq_n_s32(rnding, in[7], cospi[30]); + u[9] = vshlq_s32(u[9], v_bit); + + u[10] = vmlaq_n_s32(rnding, in[5], cospi[42]); + u[10] = vshlq_s32(u[10], v_bit); + + u[11] = vmlaq_n_s32(rnding, 
in[5], cospi[22]); + u[11] = vshlq_s32(u[11], v_bit); + + u[12] = vmlaq_n_s32(rnding, in[3], cospi[50]); + u[12] = vshlq_s32(u[12], v_bit); + + u[13] = vmlaq_n_s32(rnding, in[3], cospi[14]); + u[13] = vshlq_s32(u[13], v_bit); + + u[14] = vmlaq_n_s32(rnding, in[1], cospi[58]); + u[14] = vshlq_s32(u[14], v_bit); + + u[15] = vmlaq_n_s32(rnding, in[1], cospi[6]); + u[15] = vshlq_s32(u[15], v_bit); + + // stage 3 + addsub_neon(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_neon(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + y = vmlaq_n_s32(rnding, u[8], cospi[56]); + u[8] = vmlaq_n_s32(rnding, u[8], cospi[8]); + u[8] = vmlaq_n_s32(u[8], u[9], cospi[56]); + u[8] = vshlq_s32(u[8], v_bit); + + u[9] = vmlsq_n_s32(y, u[9], cospi[8]); + u[9] = vshlq_s32(u[9], v_bit); + + y = vmlaq_n_s32(rnding, u[10], cospi[24]); + u[10] = vmlaq_n_s32(rnding, u[10], cospi[40]); + u[10] = vmlaq_n_s32(u[10], u[11], cospi[24]); + u[10] = vshlq_s32(u[10], v_bit); + + u[11] = vmlsq_n_s32(y, u[11], cospi[40]); + u[11] = vshlq_s32(u[11], v_bit); + + y = vmlaq_n_s32(rnding, u[12], cospi[8]); + u[12] = vmlsq_n_s32(rnding, u[12], cospi[56]); + u[12] = vmlaq_n_s32(u[12], u[13], cospi[8]); + u[12] = vshlq_s32(u[12], v_bit); + + u[13] = vmlaq_n_s32(y, u[13], cospi[56]); + u[13] = vshlq_s32(u[13], v_bit); + + y = vmlaq_n_s32(rnding, u[14], cospi[40]); + u[14] = vmlsq_n_s32(rnding, u[14], cospi[24]); + u[14] = vmlaq_n_s32(u[14], u[15], cospi[40]); + u[14] = vshlq_s32(u[14], v_bit); + + u[15] = vmlaq_n_s32(y, u[15], cospi[24]); + u[15] = vshlq_s32(u[15], v_bit); + + // stage 5 + addsub_neon(u[0], 
u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_neon(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_neon(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + y = vmlaq_n_s32(rnding, u[4], cospi[48]); + u[4] = vmlaq_n_s32(rnding, u[4], cospi[16]); + u[4] = vmlaq_n_s32(u[4], u[5], cospi[48]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlsq_n_s32(y, u[5], cospi[16]); + u[5] = vshlq_s32(u[5], v_bit); + + y = vmlaq_n_s32(rnding, u[6], cospi[16]); + u[6] = vmlsq_n_s32(rnding, u[6], cospi[48]); + u[6] = vmlaq_n_s32(u[6], u[7], cospi[16]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(y, u[7], cospi[48]); + u[7] = vshlq_s32(u[7], v_bit); + + y = vmlaq_n_s32(rnding, u[12], cospi[48]); + u[12] = vmulq_n_s32(u[12], cospi[16]); + u[12] = vmlaq_n_s32(u[12], u[13], cospi[48]); + u[12] = vshlq_s32(u[12], v_bit); + + u[13] = vmlsq_n_s32(y, u[13], cospi[16]); + u[13] = vshlq_s32(u[13], v_bit); + + y = vmlaq_n_s32(rnding, u[14], cospi[16]); + u[14] = vmlsq_n_s32(rnding, u[14], cospi[48]); + u[14] = vmlaq_n_s32(u[14], u[15], cospi[16]); + u[14] = vshlq_s32(u[14], v_bit); + + u[15] = vmlaq_n_s32(y, u[15], cospi[48]); + u[15] = vshlq_s32(u[15], v_bit); + + // stage 7 + addsub_neon(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_neon(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_neon(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + 
addsub_neon(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = vmlaq_n_s32(rnding, u[2], cospi[32]); + x = vmulq_n_s32(u[3], cospi[32]); + u[2] = vaddq_s32(y, x); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vsubq_s32(y, x); + u[3] = vshlq_s32(u[3], v_bit); + y = vmlaq_n_s32(rnding, u[6], cospi[32]); + x = vmulq_n_s32(u[7], cospi[32]); + u[6] = vaddq_s32(y, x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vsubq_s32(y, x); + u[7] = vshlq_s32(u[7], v_bit); + + y = vmlaq_n_s32(rnding, u[10], cospi[32]); + x = vmulq_n_s32(u[11], cospi[32]); + u[10] = vaddq_s32(y, x); + u[10] = vshlq_s32(u[10], v_bit); + + u[11] = vsubq_s32(y, x); + u[11] = vshlq_s32(u[11], v_bit); + + y = vmlaq_n_s32(rnding, u[14], cospi[32]); + x = vmulq_n_s32(u[15], cospi[32]); + u[14] = vaddq_s32(y, x); + u[14] = vshlq_s32(u[14], v_bit); + + u[15] = vsubq_s32(y, x); + u[15] = vshlq_s32(u[15], v_bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = vsubq_s32(zero, u[8]); + out[2] = u[12]; + out[3] = vsubq_s32(zero, u[4]); + out[4] = u[6]; + out[5] = vsubq_s32(zero, u[14]); + out[6] = u[10]; + out[7] = vsubq_s32(zero, u[2]); + out[8] = u[3]; + out[9] = vsubq_s32(zero, u[11]); + out[10] = u[15]; + out[11] = vsubq_s32(zero, u[7]); + out[12] = u[5]; + out[13] = vsubq_s32(zero, u[13]); + out[14] = u[9]; + out[15] = vsubq_s32(zero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&u[0], &u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[12], &u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[6], &u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + 
neg_shift_neon(&u[10], &u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[3], &u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[15], &u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[5], &u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[9], &u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + } +} + +static void idct16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t u[16], v[16], x, y; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + { + // stage 0-1 + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = half_btf_neon_mode01_r(&cospi[60], &u[8], &cospi[4], &u[15], &v_bit, + &rnding); + v[9] = half_btf_neon_mode01_r(&cospi[28], &u[9], &cospi[36], &u[14], &v_bit, + &rnding); + v[10] = half_btf_neon_mode01_r(&cospi[44], &u[10], &cospi[20], &u[13], + &v_bit, &rnding); + v[11] = half_btf_neon_mode01_r(&cospi[12], &u[11], &cospi[52], &u[12], + &v_bit, &rnding); + v[12] = half_btf_neon_r(&cospi[52], &u[11], &cospi[12], &u[12], &v_bit, + &rnding); + v[13] = half_btf_neon_r(&cospi[20], &u[10], &cospi[44], &u[13], &v_bit, + &rnding); + v[14] = + 
half_btf_neon_r(&cospi[36], &u[9], &cospi[28], &u[14], &v_bit, &rnding); + v[15] = + half_btf_neon_r(&cospi[4], &u[8], &cospi[60], &u[15], &v_bit, &rnding); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_neon_mode01_r(&cospi[56], &v[4], &cospi[8], &v[7], &v_bit, + &rnding); + u[5] = half_btf_neon_mode01_r(&cospi[24], &v[5], &cospi[40], &v[6], &v_bit, + &rnding); + u[6] = + half_btf_neon_r(&cospi[40], &v[5], &cospi[24], &v[6], &v_bit, &rnding); + u[7] = + half_btf_neon_r(&cospi[8], &v[4], &cospi[56], &v[7], &v_bit, &rnding); + addsub_neon(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_neon(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = vmlaq_n_s32(rnding, u[0], cospi[32]); + y = vmulq_n_s32(u[1], cospi[32]); + v[0] = vaddq_s32(x, y); + v[0] = vshlq_s32(v[0], v_bit); + + v[1] = vsubq_s32(x, y); + v[1] = vshlq_s32(v[1], v_bit); + + v[2] = half_btf_neon_mode01_r(&cospi[48], &u[2], &cospi[16], &u[3], &v_bit, + &rnding); + v[3] = + half_btf_neon_r(&cospi[16], &u[2], &cospi[48], &u[3], &v_bit, &rnding); + addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, + &rnding); + v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], + &v_bit, &rnding); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], + &v_bit, &rnding); + v[14] = + half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); + v[15] = u[15]; + + // stage 5 + addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = vmulq_n_s32(v[5], cospi[32]); + y = 
vmlaq_n_s32(rnding, v[6], cospi[32]); + u[5] = vsubq_s32(y, x); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vaddq_s32(y, x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = v[7]; + addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_neon(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = vmulq_n_s32(u[10], cospi[32]); + y = vmlaq_n_s32(rnding, u[13], cospi[32]); + v[10] = vsubq_s32(y, x); + v[10] = vshlq_s32(v[10], v_bit); + + v[13] = vaddq_s32(x, y); + v[13] = vshlq_s32(v[13], v_bit); + + x = vmulq_n_s32(u[11], cospi[32]); + y = vmlaq_n_s32(rnding, u[12], cospi[32]); + v[11] = vsubq_s32(y, x); + v[11] = vshlq_s32(v[11], v_bit); + + v[12] = vaddq_s32(x, y); + v[12] = vshlq_s32(v[12], v_bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + addsub_neon(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_neon(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_neon(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_neon(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_neon(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_neon(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_neon(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = + vdupq_n_s32((1 << (log_range_out - 1)) - 1); + 
round_shift_8x8(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } + } +} + +static void iadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + int32x4_t u[16], v[16], x, y; + // Calculate the column 0, 1, 2, 3 + // stage 0 + // stage 1 + // stage 2 + v[0] = vmlaq_n_s32(rnding, in[15], cospi[2]); + v[0] = vmlaq_n_s32(v[0], in[0], cospi[62]); + v[0] = vshlq_s32(v[0], v_bit); + + v[1] = vmlaq_n_s32(rnding, in[15], cospi[62]); + v[1] = vmlsq_n_s32(v[1], in[0], cospi[2]); + v[1] = vshlq_s32(v[1], v_bit); + + v[2] = vmlaq_n_s32(rnding, in[13], cospi[10]); + v[2] = vmlaq_n_s32(v[2], in[2], cospi[54]); + v[2] = vshlq_s32(v[2], v_bit); + + v[3] = vmlaq_n_s32(rnding, in[13], cospi[54]); + v[3] = vmlsq_n_s32(v[3], in[2], cospi[10]); + v[3] = vshlq_s32(v[3], v_bit); + + v[4] = vmlaq_n_s32(rnding, in[11], cospi[18]); + v[4] = vmlaq_n_s32(v[4], in[4], cospi[46]); + v[4] = vshlq_s32(v[4], v_bit); + + v[5] = vmlaq_n_s32(rnding, in[11], cospi[46]); + v[5] = vmlsq_n_s32(v[5], in[4], cospi[18]); + v[5] = vshlq_s32(v[5], v_bit); + + v[6] = vmlaq_n_s32(rnding, in[9], cospi[26]); + v[6] = vmlaq_n_s32(v[6], in[6], cospi[38]); + v[6] = vshlq_s32(v[6], v_bit); + + v[7] = vmlaq_n_s32(rnding, in[9], cospi[38]); + v[7] = vmlsq_n_s32(v[7], in[6], cospi[26]); + v[7] = vshlq_s32(v[7], v_bit); + + v[8] = vmlaq_n_s32(rnding, in[7], cospi[34]); + v[8] = vmlaq_n_s32(v[8], in[8], cospi[30]); + v[8] = vshlq_s32(v[8], v_bit); + + v[9] = vmlaq_n_s32(rnding, in[7], cospi[30]); + v[9] = vmlsq_n_s32(v[9], in[8], cospi[34]); + v[9] = vshlq_s32(v[9], 
v_bit); + + v[10] = vmlaq_n_s32(rnding, in[5], cospi[42]); + v[10] = vmlaq_n_s32(v[10], in[10], cospi[22]); + v[10] = vshlq_s32(v[10], v_bit); + + v[11] = vmlaq_n_s32(rnding, in[5], cospi[22]); + v[11] = vmlsq_n_s32(v[11], in[10], cospi[42]); + v[11] = vshlq_s32(v[11], v_bit); + + v[12] = vmlaq_n_s32(rnding, in[3], cospi[50]); + v[12] = vmlaq_n_s32(v[12], in[12], cospi[14]); + v[12] = vshlq_s32(v[12], v_bit); + + v[13] = vmlaq_n_s32(rnding, in[3], cospi[14]); + v[13] = vmlsq_n_s32(v[13], in[12], cospi[50]); + v[13] = vshlq_s32(v[13], v_bit); + + v[14] = vmlaq_n_s32(rnding, in[1], cospi[58]); + v[14] = vmlaq_n_s32(v[14], in[14], cospi[6]); + v[14] = vshlq_s32(v[14], v_bit); + + v[15] = vmlaq_n_s32(rnding, in[1], cospi[6]); + v[15] = vmlsq_n_s32(v[15], in[14], cospi[58]); + v[15] = vshlq_s32(v[15], v_bit); + + // stage 3 + addsub_neon(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_neon(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_neon(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = vmlaq_n_s32(rnding, u[8], cospi[8]); + v[8] = vmlaq_n_s32(v[8], u[9], cospi[56]); + v[8] = vshlq_s32(v[8], v_bit); + + v[9] = vmlaq_n_s32(rnding, u[8], cospi[56]); + v[9] = vmlsq_n_s32(v[9], u[9], cospi[8]); + v[9] = vshlq_s32(v[9], v_bit); + + v[10] = vmlaq_n_s32(rnding, u[10], cospi[40]); + v[10] = vmlaq_n_s32(v[10], u[11], cospi[24]); + v[10] = vshlq_s32(v[10], v_bit); + + v[11] = vmlaq_n_s32(rnding, u[10], cospi[24]); + v[11] = vmlsq_n_s32(v[11], u[11], cospi[40]); + v[11] = vshlq_s32(v[11], 
v_bit); + + v[12] = vmlaq_n_s32(rnding, u[12], -cospi[56]); + v[12] = vmlaq_n_s32(v[12], u[13], cospi[8]); + v[12] = vshlq_s32(v[12], v_bit); + + v[13] = vmlaq_n_s32(rnding, u[12], cospi[8]); + v[13] = vmlsq_n_s32(v[13], u[13], -cospi[56]); + v[13] = vshlq_s32(v[13], v_bit); + + v[14] = vmlaq_n_s32(rnding, u[14], -cospi[24]); + v[14] = vmlaq_n_s32(v[14], u[15], cospi[40]); + v[14] = vshlq_s32(v[14], v_bit); + + v[15] = vmlaq_n_s32(rnding, u[14], cospi[40]); + v[15] = vmlsq_n_s32(v[15], u[15], -cospi[24]); + v[15] = vshlq_s32(v[15], v_bit); + + // stage 5 + addsub_neon(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_neon(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_neon(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_neon(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_neon(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = vmlaq_n_s32(rnding, u[4], cospi[16]); + v[4] = vmlaq_n_s32(v[4], u[5], cospi[48]); + v[4] = vshlq_s32(v[4], v_bit); + + v[5] = vmlaq_n_s32(rnding, u[4], cospi[48]); + v[5] = vmlsq_n_s32(v[5], u[5], cospi[16]); + v[5] = vshlq_s32(v[5], v_bit); + + v[6] = vmlaq_n_s32(rnding, u[6], -cospi[48]); + v[6] = vmlaq_n_s32(v[6], u[7], cospi[16]); + v[6] = vshlq_s32(v[6], v_bit); + + v[7] = vmlaq_n_s32(rnding, u[6], cospi[16]); + v[7] = vmlsq_n_s32(v[7], u[7], -cospi[48]); + v[7] = vshlq_s32(v[7], v_bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = vmlaq_n_s32(rnding, u[12], cospi[16]); + v[12] = vmlaq_n_s32(v[12], u[13], cospi[48]); + v[12] = vshlq_s32(v[12], v_bit); + + v[13] = vmlaq_n_s32(rnding, u[12], cospi[48]); + v[13] = vmlsq_n_s32(v[13], u[13], cospi[16]); + v[13] = vshlq_s32(v[13], v_bit); + + 
v[14] = vmlaq_n_s32(rnding, u[14], -cospi[48]); + v[14] = vmlaq_n_s32(v[14], u[15], cospi[16]); + v[14] = vshlq_s32(v[14], v_bit); + + v[15] = vmlaq_n_s32(rnding, u[14], cospi[16]); + v[15] = vmlsq_n_s32(v[15], u[15], -cospi[48]); + v[15] = vshlq_s32(v[15], v_bit); + + // stage 7 + addsub_neon(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_neon(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_neon(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_neon(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_neon(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = vmlaq_n_s32(rnding, u[2], cospi[32]); + x = vmulq_n_s32(u[3], cospi[32]); + v[2] = vaddq_s32(y, x); + v[2] = vshlq_s32(v[2], v_bit); + + v[3] = vsubq_s32(y, x); + v[3] = vshlq_s32(v[3], v_bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = vmlaq_n_s32(rnding, u[6], cospi[32]); + x = vmulq_n_s32(u[7], cospi[32]); + v[6] = vaddq_s32(y, x); + v[6] = vshlq_s32(v[6], v_bit); + + v[7] = vsubq_s32(y, x); + v[7] = vshlq_s32(v[7], v_bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = vmlaq_n_s32(rnding, u[10], cospi[32]); + x = vmulq_n_s32(u[11], cospi[32]); + v[10] = vaddq_s32(y, x); + v[10] = vshlq_s32(v[10], v_bit); + + v[11] = vsubq_s32(y, x); + v[11] = vshlq_s32(v[11], v_bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = vmlaq_n_s32(rnding, u[14], cospi[32]); + x = vmulq_n_s32(u[15], cospi[32]); + v[14] = vaddq_s32(y, x); + v[14] = vshlq_s32(v[14], v_bit); + + v[15] = vsubq_s32(y, x); + v[15] = vshlq_s32(v[15], v_bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = vsubq_s32(zero, v[8]); + out[2] = v[12]; + out[3] = vsubq_s32(zero, v[4]); + out[4] = v[6]; + out[5] = vsubq_s32(zero, v[14]); + out[6] = v[10]; + out[7] = vsubq_s32(zero, 
v[2]); + out[8] = v[3]; + out[9] = vsubq_s32(zero, v[11]); + out[10] = v[15]; + out[11] = vsubq_s32(zero, v[7]); + out[12] = v[5]; + out[13] = vsubq_s32(zero, v[13]); + out[14] = v[9]; + out[15] = vsubq_s32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + } +} + +static void iidentity16_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + (void)bit; + int32x2_t fact = vdup_n_s32(2 * NewSqrt2); + int32x4x2_t a0; + int32x4_t zero = vdupq_n_s32(0); + const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1)); + for (int i = 0; i < 16; i++) { + a0.val[0] = vreinterpretq_s32_s64( + vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact)); + a0.val[0] = vreinterpretq_s32_s64( + vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits)); + a0.val[1] = vextq_s32(in[i], zero, 1); + a0.val[1] = vreinterpretq_s32_s64( + vmlal_s32(rnding, 
vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact)); + a0.val[1] = vreinterpretq_s32_s64( + vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits)); + a0 = vzipq_s32(a0.val[0], a0.val[1]); +#if AOM_ARCH_AARCH64 + out[i] = vreinterpretq_s32_s64(vzip1q_s64( + vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1]))); +#else + out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2); +#endif + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 16); + } +} + +static INLINE void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int i; + int32x4_t temp1, temp2, temp3, temp4; + temp1 = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit, + rnding); + u[13] = + half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit, rnding); + u[10] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit, + rnding); + u[12] = + half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit, rnding); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + addsub_neon(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_neon(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi); + } + + temp1 = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59], v_bit, + rnding); + temp2 = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58], v_bit, + rnding); + temp3 = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57], v_bit, + rnding); + temp4 = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56], v_bit, + rnding); + u[56] = + half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], v_bit, rnding); + u[57] = + 
half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], v_bit, rnding); + u[58] = + half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], v_bit, rnding); + u[59] = + half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], v_bit, rnding); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55], v_bit, + rnding); + temp2 = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54], v_bit, + rnding); + temp3 = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53], v_bit, + rnding); + temp4 = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52], v_bit, + rnding); + u[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52], v_bit, + rnding); + u[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53], v_bit, + rnding); + u[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54], v_bit, + rnding); + u[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55], v_bit, + rnding); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int i; + int32x4_t temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_neon(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + temp1 = half_btf_neon_mode10_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit, + rnding); + temp2 = half_btf_neon_mode10_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit, + rnding); + temp3 = half_btf_neon_mode10_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit, + rnding); + temp4 = half_btf_neon_mode10_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit, + rnding); + u[24] = + half_btf_neon_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit, rnding); + u[25] = + half_btf_neon_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit, rnding); + u[26] = + 
half_btf_neon_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit, rnding); + u[27] = + half_btf_neon_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit, rnding); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { /* i ^ 15 maps 32..39 to 47..40 */ + addsub_neon(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + /* i ^ 15 maps 48..55 to 63..56; here the higher index is the first operand */ + for (i = 48; i < 56; i++) { + addsub_neon(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} +/* Stage 10 helper called by the idct64x64_low8/low16 variants; updates u[] in place: addsub_neon over u[0..31], cospi[32] half_btf pairs on u[40..55]. */ +static INLINE void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, temp2, temp3, temp4; /* hold the lower-index outputs until the paired upper-index outputs are computed */ + for (int i = 0; i < 16; i++) { /* pairs u[i] with u[31 - i] */ + addsub_neon(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + temp1 = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit, + rnding); + temp2 = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit, + rnding); + temp3 = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit, + rnding); + temp4 = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit, + rnding); + u[52] = + half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit, rnding); + u[53] = + half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit, rnding); + u[54] = + half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit, rnding); + u[55] = + half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit, rnding); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; + /* same pattern for u[44..47] paired with u[51..48] */ + temp1 = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit, + rnding); + temp2 = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit, + rnding); + temp3 = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit, + rnding); + temp4 = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit, + rnding); + u[48] = + half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit, rnding); +
u[49] = + half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit, rnding); + u[50] = + half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit, rnding); + u[51] = + half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit, rnding); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +/* Stage 11 helper: fills out[0..63] by running addsub_neon on u[k] and */ +/* u[63 - k]. When do_cols is zero, each group of four output vectors is then */ +/* passed through round_shift_4x4(out_shift) and clamped to the signed range */ +/* of AOMMAX(16, bd + 6) bits. */ +static INLINE void idct64_stage11_neon(int32x4_t *u, int32x4_t *out, + int do_cols, int bd, int out_shift, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi) { + for (int k = 0; k < 32; ++k) { + addsub_neon(u[k], u[63 - k], &out[k], &out[63 - k], clamp_lo, clamp_hi); + } + + /* Nothing further to do when do_cols is set. */ + if (do_cols) return; + + const int out_bits = AOMMAX(16, bd + 6); + const int32x4_t out_min = vdupq_n_s32(-(1 << (out_bits - 1))); + const int32x4_t out_max = vdupq_n_s32((1 << (out_bits - 1)) - 1); + for (int k = 0; k < 64; k += 4) { + int32x4_t *const quad = out + k; + round_shift_4x4(quad, out_shift); + highbd_clamp_s32_neon(quad, quad, &out_min, &out_max, 4); + } +} + +/* idct64 variant that reads only in[0] and stores one resulting vector to all */ +/* of out[0..63]. */ +static void idct64x64_low1_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + /* Stages 1-6 reduce to a single half_btf_0_neon_r call on in[0] with */ + /* cospi[32]; stages 8-10 have nothing to do. */ + int32x4_t dc = half_btf_0_neon_r(&cospi[32], &in[0], &v_bit, &rnding); + + /* Stage 11: when do_cols is zero, switch to the AOMMAX(16, bd + 6)-bit */ + /* clamp range and apply the rounded right shift by out_shift. */ + if (!do_cols) { + const int out_bits = AOMMAX(16, bd + 6); + clamp_lo = vdupq_n_s32(-(1 << (out_bits - 1))); + clamp_hi = vdupq_n_s32((1 << (out_bits - 1)) - 1); + if (out_shift != 0) { + const int32x4_t half = vdupq_n_s32((1 << out_shift) >> 1); + dc = vshlq_s32(vaddq_s32(dc, half), vdupq_n_s32(-out_shift)); + } + } + dc = vminq_s32(vmaxq_s32(dc, clamp_lo), clamp_hi); + + /* Every one of the 64 outputs receives the same vector. */ + for (int k = 0; k < 64; ++k) { + out[k] = dc; + } +} + +/* idct64 variant whose stage 1 reads only in[0..7]. */ +static void idct64x64_low8_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const int 
log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + { + int32x4_t u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding); + u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding); + u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding); + u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding); + u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding); + u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding); + u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding); + u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding); + + // stage 3 + u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding); + u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding); + u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding); + u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + u[62] = u[63]; + + // stage 4 + int32x4_t temp1, temp2; + u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); + u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + temp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], + &v_bit, &rnding); + u[62] = + half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding); + u[33] = temp1; + + temp2 = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57], + &v_bit, &rnding); + u[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], 
&cospi[36], &u[57], + &v_bit, &rnding); + u[57] = temp2; + + temp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54], + &v_bit, &rnding); + u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit, + &rnding); + u[41] = temp1; + + temp2 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49], + &v_bit, &rnding); + u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49], + &v_bit, &rnding); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], + &v_bit, &rnding); + u[30] = + half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding); + u[17] = temp1; + + temp2 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25], + &v_bit, &rnding); + u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25], + &v_bit, &rnding); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + + // stage 6 + temp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); + u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); + u[0] = temp1; + + temp2 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], + &v_bit, &rnding); + u[14] = + half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], + &v_bit, &rnding); + u[61] = + half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding); + u[34] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], + &v_bit, &rnding); + u[60] = + 
half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding); + u[35] = temp2; + temp1 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], + &v_bit, &rnding); + u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59], + &v_bit, &rnding); + u[36] = temp1; + temp2 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], + &v_bit, &rnding); + u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58], + &v_bit, &rnding); + u[37] = temp2; + temp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53], + &v_bit, &rnding); + u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit, + &rnding); + u[42] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52], + &v_bit, &rnding); + u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit, + &rnding); + u[43] = temp2; + temp1 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51], + &v_bit, &rnding); + u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51], + &v_bit, &rnding); + u[44] = temp1; + temp2 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50], + &v_bit, &rnding); + u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50], + &v_bit, &rnding); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29], + &v_bit, &rnding); + u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit, + &rnding); + u[18] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28], + &v_bit, &rnding); + u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit, + &rnding); + u[19] = temp2; + temp1 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27], + &v_bit, &rnding); + u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27], + &v_bit, &rnding); + u[20] = temp1; + 
temp2 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26], + &v_bit, &rnding); + u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26], + &v_bit, &rnding); + u[21] = temp2; + /* j ^ 7 pairs j..j+3 with j+7..j+4; j ^ 15 and j ^ 8 pair j+15..j+12 with */ + /* j+8..j+11, for the two 16-wide groups starting at 32 and 48. */ + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + /* u[4..7] are filled as mirrored copies of u[3..0]; the remaining stage-8 */ + /* work is done by idct64_stage8_neon. */ + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + + idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 9 + idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 10 + idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 11 + idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +/* idct64 variant whose stage 1 reads only in[0..15]. */ +static void idct64x64_low16_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + { + int32x4_t u[64]; + int32x4_t tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding); + u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding); + u[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding); + u[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding); + u[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding); + u[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding); + u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding); + u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding); + u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding); + u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding); + u[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding); + u[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding); + u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding); + u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding); + u[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding); + u[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding); + + // stage 3 + u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding); + u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding); + u[19] = half_btf_0_m_neon_r(&cospi[50], &u[28], &v_bit, &rnding); + u[28] = half_btf_0_neon_r(&cospi[14], &u[28], &v_bit, &rnding); + u[27] = half_btf_0_neon_r(&cospi[10], &u[20], 
&v_bit, &rnding); + u[20] = half_btf_0_neon_r(&cospi[54], &u[20], &v_bit, &rnding); + u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding); + u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); + u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); + u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding); + u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + /* tmp1..tmp4 hold the new lower-index values until the paired upper-index */ + /* values have been computed from the old inputs. */ + tmp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], &v_bit, + &rnding); + tmp2 = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], &v_bit, + &rnding); + tmp3 = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58], + &v_bit, &rnding); + tmp4 = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57], + &v_bit, &rnding); + u[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57], + &v_bit, &rnding); + u[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit, + &rnding); + u[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61], + &v_bit, &rnding); + u[62] = + half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54], + &v_bit, &rnding); + tmp2 = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53], + &v_bit, &rnding); + /* New u[45]: must use the same helper as v[45] in idct64x64_neon stage 4 */ + /* (mode10), like the other lower-index outputs of this stage. */ + tmp3 = half_btf_neon_mode10_r(&cospi[52], &u[45], &cospi[12], &u[50], + &v_bit, &rnding); + tmp4 = 
half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49], + &v_bit, &rnding); + u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49], + &v_bit, &rnding); + u[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit, + &rnding); + u[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53], + &v_bit, &rnding); + u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit, + &rnding); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding); + u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], &v_bit, + &rnding); + tmp2 = half_btf_neon_mode11_r(&cospi[56], &u[18], &cospi[8], &u[29], &v_bit, + &rnding); + tmp3 = half_btf_neon_mode10_r(&cospi[40], &u[21], &cospi[24], &u[26], + &v_bit, &rnding); + tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25], + &v_bit, &rnding); + u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25], + &v_bit, &rnding); + u[26] = half_btf_neon_r(&cospi[24], &u[21], &cospi[40], &u[26], &v_bit, + &rnding); + u[29] = half_btf_neon_mode10_r(&cospi[8], &u[18], &cospi[56], &u[29], + &v_bit, &rnding); + u[30] = + half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); + u[1] = half_btf_0_neon_r(&cospi[32], &u[0], 
&v_bit, &rnding); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, + &rnding); + u[14] = + half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); + u[9] = tmp1; + /* New u[10]: must use the same helper as v[10] in idct64x64_neon stage 6 */ + /* (mode11) so that this variant agrees with the full transform. */ + tmp2 = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], + &v_bit, &rnding); + u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], + &v_bit, &rnding); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + /* tmp1..tmp4 hold the new u[34..37] until u[58..61] have been computed */ + /* from the old inputs. */ + tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], &v_bit, + &rnding); + tmp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], &v_bit, + &rnding); + tmp3 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], &v_bit, + &rnding); + tmp4 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], &v_bit, + &rnding); + u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58], + &v_bit, &rnding); + u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59], + &v_bit, &rnding); + u[60] = + half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding); + u[61] = + half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53], + &v_bit, &rnding); + tmp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52], + &v_bit, &rnding); + tmp3 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51], + &v_bit, &rnding); + tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50], + &v_bit, &rnding); + u[50] = 
half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50], + &v_bit, &rnding); + u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51], + &v_bit, &rnding); + u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit, + &rnding); + u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit, + &rnding); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_neon_mode10_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, + &rnding); + u[6] = + half_btf_neon_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, &rnding); + u[5] = tmp1; + addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29], + &v_bit, &rnding); + tmp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28], + &v_bit, &rnding); + tmp3 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27], + &v_bit, &rnding); + tmp4 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26], + &v_bit, &rnding); + u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26], + &v_bit, &rnding); + u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27], + &v_bit, &rnding); + u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit, + &rnding); + u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit, + &rnding); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_neon(u[i], u[7 - 
i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 9 + idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 10 + idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 11 + idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct64x64_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + + { + int32x4_t u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding); + v[33] = half_btf_0_m_neon_r(&cospi[33], &u[62], &v_bit, &rnding); + v[34] = half_btf_0_neon_r(&cospi[47], &u[34], &v_bit, &rnding); + v[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding); + v[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding); + v[37] = half_btf_0_m_neon_r(&cospi[41], &u[58], &v_bit, &rnding); + v[38] = half_btf_0_neon_r(&cospi[39], &u[38], &v_bit, &rnding); + v[39] = half_btf_0_m_neon_r(&cospi[57], 
&u[56], &v_bit, &rnding); + v[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding); + v[41] = half_btf_0_m_neon_r(&cospi[37], &u[54], &v_bit, &rnding); + v[42] = half_btf_0_neon_r(&cospi[43], &u[42], &v_bit, &rnding); + v[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding); + v[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding); + v[45] = half_btf_0_m_neon_r(&cospi[45], &u[50], &v_bit, &rnding); + v[46] = half_btf_0_neon_r(&cospi[35], &u[46], &v_bit, &rnding); + v[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding); + v[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding); + v[49] = half_btf_0_neon_r(&cospi[29], &u[46], &v_bit, &rnding); + v[50] = half_btf_0_neon_r(&cospi[19], &u[50], &v_bit, &rnding); + v[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding); + v[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding); + v[53] = half_btf_0_neon_r(&cospi[21], &u[42], &v_bit, &rnding); + v[54] = half_btf_0_neon_r(&cospi[27], &u[54], &v_bit, &rnding); + v[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding); + v[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding); + v[57] = half_btf_0_neon_r(&cospi[25], &u[38], &v_bit, &rnding); + v[58] = half_btf_0_neon_r(&cospi[23], &u[58], &v_bit, &rnding); + v[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding); + v[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding); + v[61] = half_btf_0_neon_r(&cospi[17], &u[34], &v_bit, &rnding); + v[62] = half_btf_0_neon_r(&cospi[31], &u[62], &v_bit, &rnding); + v[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding); + + // stage 3 + u[16] = half_btf_0_neon_r(&cospi[62], &v[16], &v_bit, &rnding); + u[17] = half_btf_0_m_neon_r(&cospi[34], &v[30], &v_bit, &rnding); + u[18] = half_btf_0_neon_r(&cospi[46], &v[18], &v_bit, &rnding); + u[19] = half_btf_0_m_neon_r(&cospi[50], &v[28], &v_bit, &rnding); + u[20] = half_btf_0_neon_r(&cospi[54], &v[20], &v_bit, &rnding); + u[21] = 
half_btf_0_m_neon_r(&cospi[42], &v[26], &v_bit, &rnding); + u[22] = half_btf_0_neon_r(&cospi[38], &v[22], &v_bit, &rnding); + u[23] = half_btf_0_m_neon_r(&cospi[58], &v[24], &v_bit, &rnding); + u[24] = half_btf_0_neon_r(&cospi[6], &v[24], &v_bit, &rnding); + u[25] = half_btf_0_neon_r(&cospi[26], &v[22], &v_bit, &rnding); + u[26] = half_btf_0_neon_r(&cospi[22], &v[26], &v_bit, &rnding); + u[27] = half_btf_0_neon_r(&cospi[10], &v[20], &v_bit, &rnding); + u[28] = half_btf_0_neon_r(&cospi[14], &v[28], &v_bit, &rnding); + u[29] = half_btf_0_neon_r(&cospi[18], &v[18], &v_bit, &rnding); + u[30] = half_btf_0_neon_r(&cospi[30], &v[30], &v_bit, &rnding); + u[31] = half_btf_0_neon_r(&cospi[2], &v[16], &v_bit, &rnding); + + for (i = 32; i < 64; i += 4) { + addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); + v[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding); + v[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding); + v[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding); + v[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding); + v[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding); + v[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding); + v[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); + + for (i = 16; i < 32; i += 4) { + addsub_neon(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], + &v_bit, &rnding); + v[34] = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], + &v_bit, &rnding); + v[37] = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], 
&u[58], + &v_bit, &rnding); + v[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57], + &v_bit, &rnding); + v[41] = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54], + &v_bit, &rnding); + v[42] = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53], + &v_bit, &rnding); + v[45] = half_btf_neon_mode10_r(&cospi[52], &u[45], &cospi[12], &u[50], + &v_bit, &rnding); + v[46] = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49], + &v_bit, &rnding); + v[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49], + &v_bit, &rnding); + v[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit, + &rnding); + v[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53], + &v_bit, &rnding); + v[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit, + &rnding); + v[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57], + &v_bit, &rnding); + v[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit, + &rnding); + v[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61], + &v_bit, &rnding); + v[62] = + half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding); + + // stage 5 + u[4] = half_btf_0_neon_r(&cospi[56], &v[4], &v_bit, &rnding); + u[5] = half_btf_0_m_neon_r(&cospi[40], &v[6], &v_bit, &rnding); + u[6] = half_btf_0_neon_r(&cospi[24], &v[6], &v_bit, &rnding); + u[7] = half_btf_0_neon_r(&cospi[8], &v[4], &v_bit, &rnding); + + for (i = 8; i < 16; i += 4) { + addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_neon_mode10_r(&cospi[8], &v[17], &cospi[56], &v[30], + &v_bit, &rnding); + u[18] = half_btf_neon_mode11_r(&cospi[56], &v[18], &cospi[8], &v[29], + &v_bit, &rnding); + u[21] = half_btf_neon_mode10_r(&cospi[40], 
&v[21], &cospi[24], &v[26], + &v_bit, &rnding); + u[22] = half_btf_neon_mode11_r(&cospi[24], &v[22], &cospi[40], &v[25], + &v_bit, &rnding); + u[25] = half_btf_neon_mode10_r(&cospi[40], &v[22], &cospi[24], &v[25], + &v_bit, &rnding); + u[26] = half_btf_neon_r(&cospi[24], &v[21], &cospi[40], &v[26], &v_bit, + &rnding); + u[29] = half_btf_neon_mode10_r(&cospi[8], &v[18], &cospi[56], &v[29], + &v_bit, &rnding); + u[30] = + half_btf_neon_r(&cospi[56], &v[17], &cospi[8], &v[30], &v_bit, &rnding); + + for (i = 32; i < 64; i += 8) { + addsub_neon(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_neon(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_neon(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_neon(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); + v[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); + v[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding); + v[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding); + + addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, + &rnding); + v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], + &v_bit, &rnding); + v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], + &v_bit, &rnding); + v[14] = + half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); + + for (i = 16; i < 32; i += 8) { + addsub_neon(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_neon(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + 
addsub_neon(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], + &v_bit, &rnding); + v[35] = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], + &v_bit, &rnding); + v[36] = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], + &v_bit, &rnding); + v[37] = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], + &v_bit, &rnding); + v[42] = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53], + &v_bit, &rnding); + v[43] = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52], + &v_bit, &rnding); + v[44] = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51], + &v_bit, &rnding); + v[45] = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50], + &v_bit, &rnding); + v[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50], + &v_bit, &rnding); + v[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51], + &v_bit, &rnding); + v[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit, + &rnding); + v[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit, + &rnding); + v[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58], + &v_bit, &rnding); + v[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59], + &v_bit, &rnding); + v[60] = + half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding); + v[61] = + half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding); + + // stage 7 + addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_neon_mode10_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit, + &rnding); + u[6] = + half_btf_neon_r(&cospi[32], &v[5], &cospi[32], &v[6], 
&v_bit, &rnding); + + addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_neon_mode10_r(&cospi[16], &v[18], &cospi[48], &v[29], + &v_bit, &rnding); + u[19] = half_btf_neon_mode10_r(&cospi[16], &v[19], &cospi[48], &v[28], + &v_bit, &rnding); + u[20] = half_btf_neon_mode11_r(&cospi[48], &v[20], &cospi[16], &v[27], + &v_bit, &rnding); + u[21] = half_btf_neon_mode11_r(&cospi[48], &v[21], &cospi[16], &v[26], + &v_bit, &rnding); + u[26] = half_btf_neon_mode10_r(&cospi[16], &v[21], &cospi[48], &v[26], + &v_bit, &rnding); + u[27] = half_btf_neon_mode10_r(&cospi[16], &v[20], &cospi[48], &v[27], + &v_bit, &rnding); + u[28] = half_btf_neon_r(&cospi[48], &v[19], &cospi[16], &v[28], &v_bit, + &rnding); + u[29] = half_btf_neon_r(&cospi[48], &v[18], &cospi[16], &v[29], &v_bit, + &rnding); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_neon(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_neon(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_neon(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13], + &v_bit, &rnding); + v[11] = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12], + &v_bit, &rnding); + v[12] = half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], &v_bit, + &rnding); + v[13] = half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], &v_bit, + &rnding); + + for (i = 16; i < 20; ++i) { + addsub_neon(u[i], u[i ^ 7], &v[i], &v[i ^ 
7], &clamp_lo, &clamp_hi); + addsub_neon(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59], + &v_bit, &rnding); + v[37] = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58], + &v_bit, &rnding); + v[38] = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57], + &v_bit, &rnding); + v[39] = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56], + &v_bit, &rnding); + v[40] = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55], + &v_bit, &rnding); + v[41] = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54], + &v_bit, &rnding); + v[42] = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53], + &v_bit, &rnding); + v[43] = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52], + &v_bit, &rnding); + v[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52], + &v_bit, &rnding); + v[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53], + &v_bit, &rnding); + v[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54], + &v_bit, &rnding); + v[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55], + &v_bit, &rnding); + v[56] = half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], &v_bit, + &rnding); + v[57] = half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], &v_bit, + &rnding); + v[58] = half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], &v_bit, + &rnding); + v[59] = half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], &v_bit, + &rnding); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_neon(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_neon_mode10_r(&cospi[32], &v[20], &cospi[32], &v[27], + &v_bit, &rnding); + 
u[21] = half_btf_neon_mode10_r(&cospi[32], &v[21], &cospi[32], &v[26], + &v_bit, &rnding); + u[22] = half_btf_neon_mode10_r(&cospi[32], &v[22], &cospi[32], &v[25], + &v_bit, &rnding); + u[23] = half_btf_neon_mode10_r(&cospi[32], &v[23], &cospi[32], &v[24], + &v_bit, &rnding); + u[24] = half_btf_neon_r(&cospi[32], &v[23], &cospi[32], &v[24], &v_bit, + &rnding); + u[25] = half_btf_neon_r(&cospi[32], &v[22], &cospi[32], &v[25], &v_bit, + &rnding); + u[26] = half_btf_neon_r(&cospi[32], &v[21], &cospi[32], &v[26], &v_bit, + &rnding); + u[27] = half_btf_neon_r(&cospi[32], &v[20], &cospi[32], &v[27], &v_bit, + &rnding); + + for (i = 32; i < 40; i++) { + addsub_neon(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_neon(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_neon(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55], + &v_bit, &rnding); + v[41] = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54], + &v_bit, &rnding); + v[42] = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53], + &v_bit, &rnding); + v[43] = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52], + &v_bit, &rnding); + v[44] = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51], + &v_bit, &rnding); + v[45] = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50], + &v_bit, &rnding); + v[46] = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49], + &v_bit, &rnding); + v[47] = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48], + &v_bit, &rnding); + v[48] = half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], &v_bit, + &rnding); + v[49] = half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], &v_bit, + &rnding); + v[50] = half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], &v_bit, + 
&rnding); + v[51] = half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], &v_bit, + &rnding); + v[52] = half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], &v_bit, + &rnding); + v[53] = half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], &v_bit, + &rnding); + v[54] = half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], &v_bit, + &rnding); + v[55] = half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], &v_bit, + &rnding); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + for (i = 0; i < 32; i++) { + addsub_neon(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = + vdupq_n_s32((1 << (log_range_out - 1)) - 1); + for (i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out, + 4); + } + } + } +} + +static void idct32x32_low1_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t bf1; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-1 + bf1 = in[0]; + + // stage 2-5 + bf1 = half_btf_0_neon_r(&cospi[32], &bf1, &v_bit, &rnding); + + // stage 6-9 + if (do_cols) { + bf1 = vmaxq_s32(bf1, clamp_lo); + bf1 = vminq_s32(bf1, clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1))); + clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + bf1 = vrshlq_s32(bf1, vdupq_n_s32(-out_shift)); + } + } + + bf1 = vmaxq_s32(bf1, clamp_lo); + bf1 = vminq_s32(bf1, clamp_hi); + + for (int i = 0; i < 32; i++) out[i] = bf1; +} + +static void idct32x32_low8_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t bf1[32]; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + // stage 0-1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding); + bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding); + bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding); + bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding); + bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding); + bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding); + bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding); + bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding); + + // stage 3 + bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding); + bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding); + + bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding); + bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 : + bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding); + bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_neon(bf1, cospi, &v_bit, &rnding); + + // stage 5 + bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 6 + 
bf1[3] = bf1[0]; + bf1[2] = bf1[1]; + + idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 7 + idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 8 + idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 9 + idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_low16_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t bf1[32]; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + // stage 0-1 + + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding); + bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding); + bf1[17] = half_btf_0_m_neon_r(&cospi[34], &bf1[30], &v_bit, &rnding); + bf1[30] = half_btf_0_neon_r(&cospi[30], &bf1[30], &v_bit, &rnding); + bf1[29] = half_btf_0_neon_r(&cospi[18], &bf1[18], &v_bit, &rnding); + bf1[18] = half_btf_0_neon_r(&cospi[46], &bf1[18], &v_bit, &rnding); + bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding); + bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding); + bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding); + bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding); + bf1[21] = half_btf_0_m_neon_r(&cospi[42], &bf1[26], &v_bit, &rnding); + bf1[26] = 
half_btf_0_neon_r(&cospi[22], &bf1[26], &v_bit, &rnding); + bf1[25] = half_btf_0_neon_r(&cospi[26], &bf1[22], &v_bit, &rnding); + bf1[22] = half_btf_0_neon_r(&cospi[38], &bf1[22], &v_bit, &rnding); + bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding); + bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding); + + // stage 3 + bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding); + bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding); + bf1[9] = half_btf_0_m_neon_r(&cospi[36], &bf1[14], &v_bit, &rnding); + bf1[14] = half_btf_0_neon_r(&cospi[28], &bf1[14], &v_bit, &rnding); + bf1[13] = half_btf_0_neon_r(&cospi[20], &bf1[10], &v_bit, &rnding); + bf1[10] = half_btf_0_neon_r(&cospi[44], &bf1[10], &v_bit, &rnding); + bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding); + bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding); + + addsub_neon(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_neon(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_neon(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_neon(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_neon(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_neon(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_neon(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_neon(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + // stage 4 + bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding); + bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding); + bf1[5] = half_btf_0_m_neon_r(&cospi[40], &bf1[6], &v_bit, &rnding); + bf1[6] = half_btf_0_neon_r(&cospi[24], &bf1[6], &v_bit, &rnding); + + addsub_neon(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_neon(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_neon(bf1[12], bf1[13], bf1 + 12, bf1 + 
13, &clamp_lo, &clamp_hi); + addsub_neon(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_neon(bf1, cospi, &v_bit, &rnding); + + // stage 5 + bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_neon_r(&cospi[16], &bf1[2], &v_bit, &rnding); + bf1[2] = half_btf_0_neon_r(&cospi[48], &bf1[2], &v_bit, &rnding); + + addsub_neon(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_neon(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 6 + addsub_neon(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_neon(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 7 + idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 8 + idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + // stage 9 + idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t bf1[32], bf0[32]; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; + + // stage 2 + for (int i = 0; i < 16; i++) bf0[i] = bf1[i]; + + bf0[16] = half_btf_neon_mode01_r(&cospi[62], &bf1[16], &cospi[2], &bf1[31], + &v_bit, &rnding); + bf0[17] = half_btf_neon_mode01_r(&cospi[30], &bf1[17], &cospi[34], &bf1[30], + &v_bit, &rnding); + bf0[18] = half_btf_neon_mode01_r(&cospi[46], &bf1[18], &cospi[18], &bf1[29], + &v_bit, &rnding); + bf0[19] = half_btf_neon_mode01_r(&cospi[14], &bf1[19], &cospi[50], &bf1[28], + &v_bit, &rnding); + bf0[20] = half_btf_neon_mode01_r(&cospi[54], &bf1[20], &cospi[10], &bf1[27], + &v_bit, &rnding); + bf0[21] = half_btf_neon_mode01_r(&cospi[22], &bf1[21], &cospi[42], &bf1[26], + &v_bit, &rnding); + bf0[22] = half_btf_neon_mode01_r(&cospi[38], &bf1[22], &cospi[26], &bf1[25], + &v_bit, &rnding); + bf0[23] = half_btf_neon_mode01_r(&cospi[6], &bf1[23], &cospi[58], &bf1[24], + &v_bit, &rnding); + bf0[24] = half_btf_neon_r(&cospi[58], &bf1[23], &cospi[6], &bf1[24], &v_bit, + &rnding); + bf0[25] = half_btf_neon_r(&cospi[26], &bf1[22], &cospi[38], &bf1[25], &v_bit, + &rnding); + bf0[26] = 
half_btf_neon_r(&cospi[42], &bf1[21], &cospi[22], &bf1[26], &v_bit, + &rnding); + bf0[27] = half_btf_neon_r(&cospi[10], &bf1[20], &cospi[54], &bf1[27], &v_bit, + &rnding); + bf0[28] = half_btf_neon_r(&cospi[50], &bf1[19], &cospi[14], &bf1[28], &v_bit, + &rnding); + bf0[29] = half_btf_neon_r(&cospi[18], &bf1[18], &cospi[46], &bf1[29], &v_bit, + &rnding); + bf0[30] = half_btf_neon_r(&cospi[34], &bf1[17], &cospi[30], &bf1[30], &v_bit, + &rnding); + bf0[31] = half_btf_neon_r(&cospi[2], &bf1[16], &cospi[62], &bf1[31], &v_bit, + &rnding); + + // stage 3 + for (int i = 0; i < 8; i++) bf1[i] = bf0[i]; + + bf1[8] = half_btf_neon_mode01_r(&cospi[60], &bf0[8], &cospi[4], &bf0[15], + &v_bit, &rnding); + bf1[9] = half_btf_neon_mode01_r(&cospi[28], &bf0[9], &cospi[36], &bf0[14], + &v_bit, &rnding); + bf1[10] = half_btf_neon_mode01_r(&cospi[44], &bf0[10], &cospi[20], &bf0[13], + &v_bit, &rnding); + bf1[11] = half_btf_neon_mode01_r(&cospi[12], &bf0[11], &cospi[52], &bf0[12], + &v_bit, &rnding); + bf1[12] = half_btf_neon_r(&cospi[52], &bf0[11], &cospi[12], &bf0[12], &v_bit, + &rnding); + bf1[13] = half_btf_neon_r(&cospi[20], &bf0[10], &cospi[44], &bf0[13], &v_bit, + &rnding); + bf1[14] = half_btf_neon_r(&cospi[36], &bf0[9], &cospi[28], &bf0[14], &v_bit, + &rnding); + bf1[15] = half_btf_neon_r(&cospi[4], &bf0[8], &cospi[60], &bf0[15], &v_bit, + &rnding); + + addsub_neon(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_neon(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_neon(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_neon(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_neon(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_neon(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_neon(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_neon(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf0[0] = bf1[0]; + 
bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = half_btf_neon_mode01_r(&cospi[56], &bf1[4], &cospi[8], &bf1[7], + &v_bit, &rnding); + bf0[5] = half_btf_neon_mode01_r(&cospi[24], &bf1[5], &cospi[40], &bf1[6], + &v_bit, &rnding); + bf0[6] = half_btf_neon_r(&cospi[40], &bf1[5], &cospi[24], &bf1[6], &v_bit, + &rnding); + bf0[7] = + half_btf_neon_r(&cospi[8], &bf1[4], &cospi[56], &bf1[7], &v_bit, &rnding); + + addsub_neon(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_neon(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_neon(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_neon(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30], + &v_bit, &rnding); + bf0[18] = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29], + &v_bit, &rnding); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26], + &v_bit, &rnding); + bf0[22] = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25], + &v_bit, &rnding); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25], + &v_bit, &rnding); + bf0[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], &v_bit, + &rnding); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29], + &v_bit, &rnding); + bf0[30] = half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], &v_bit, + &rnding); + bf0[31] = bf1[31]; + + // stage 5 + bf1[0] = half_btf_neon_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], &v_bit, + &rnding); + bf1[1] = half_btf_neon_mode01_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], + &v_bit, &rnding); + bf1[2] = half_btf_neon_mode01_r(&cospi[48], &bf0[2], &cospi[16], &bf0[3], + &v_bit, &rnding); + bf1[3] = half_btf_neon_r(&cospi[16], 
&bf0[2], &cospi[48], &bf0[3], &v_bit, + &rnding); + addsub_neon(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_neon(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = half_btf_neon_mode10_r(&cospi[16], &bf0[9], &cospi[48], &bf0[14], + &v_bit, &rnding); + bf1[10] = half_btf_neon_mode11_r(&cospi[48], &bf0[10], &cospi[16], &bf0[13], + &v_bit, &rnding); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf0[10], &cospi[48], &bf0[13], + &v_bit, &rnding); + bf1[14] = half_btf_neon_r(&cospi[48], &bf0[9], &cospi[16], &bf0[14], &v_bit, + &rnding); + bf1[15] = bf0[15]; + addsub_neon(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); + addsub_neon(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_neon(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_neon(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_neon(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); + addsub_neon(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_neon(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); + addsub_neon(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); + + // stage 6 + addsub_neon(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); + addsub_neon(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); + bf0[4] = bf1[4]; + bf0[5] = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], + &v_bit, &rnding); + bf0[6] = half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], &v_bit, + &rnding); + bf0[7] = bf1[7]; + addsub_neon(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_neon(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_neon(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_neon(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = 
half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29], + &v_bit, &rnding); + bf0[19] = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28], + &v_bit, &rnding); + bf0[20] = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27], + &v_bit, &rnding); + bf0[21] = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26], + &v_bit, &rnding); + bf0[22] = bf1[22]; + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = bf1[25]; + bf0[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26], + &v_bit, &rnding); + bf0[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27], + &v_bit, &rnding); + bf0[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], &v_bit, + &rnding); + bf0[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], &v_bit, + &rnding); + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 7 + addsub_neon(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); + addsub_neon(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); + addsub_neon(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_neon(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf_neon_mode10_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], + &v_bit, &rnding); + bf1[11] = half_btf_neon_mode10_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], + &v_bit, &rnding); + bf1[12] = half_btf_neon_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], &v_bit, + &rnding); + bf1[13] = half_btf_neon_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], &v_bit, + &rnding); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + addsub_neon(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); + addsub_neon(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_neon(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_neon(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_neon(bf0[31], bf0[24], bf1 + 31, bf1 
+ 24, &clamp_lo, &clamp_hi); + addsub_neon(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_neon(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_neon(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); + + // stage 8 + addsub_neon(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); + addsub_neon(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); + addsub_neon(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_neon(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_neon(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_neon(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_neon(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_neon(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = bf1[18]; + bf0[19] = bf1[19]; + bf0[20] = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], + &v_bit, &rnding); + bf0[21] = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], + &v_bit, &rnding); + bf0[22] = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], + &v_bit, &rnding); + bf0[23] = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], + &v_bit, &rnding); + bf0[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], &v_bit, + &rnding); + bf0[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], &v_bit, + &rnding); + bf0[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], &v_bit, + &rnding); + bf0[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], &v_bit, + &rnding); + bf0[28] = bf1[28]; + bf0[29] = bf1[29]; + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 9 + addsub_neon(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_neon(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); + addsub_neon(bf0[2], bf0[29], out + 2, out + 29, 
&clamp_lo, &clamp_hi); + addsub_neon(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_neon(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_neon(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_neon(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_neon(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_neon(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); + addsub_neon(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_neon(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_neon(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_neon(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_neon(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); + addsub_neon(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + addsub_neon(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +static void iidentity32_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + (void)bit; + for (int i = 0; i < 32; i += 16) { + out[i] = vshlq_n_s32(in[i], 2); + out[i + 1] = vshlq_n_s32(in[i + 1], 2); + out[i + 2] = vshlq_n_s32(in[i + 2], 2); + out[i + 3] = vshlq_n_s32(in[i + 3], 2); + out[i + 4] = vshlq_n_s32(in[i + 4], 2); + out[i + 5] = vshlq_n_s32(in[i + 5], 2); + out[i + 6] = vshlq_n_s32(in[i + 6], 2); + out[i + 7] = vshlq_n_s32(in[i + 7], 2); + out[i + 8] = vshlq_n_s32(in[i + 8], 2); + out[i + 9] = vshlq_n_s32(in[i + 9], 2); + out[i + 10] = vshlq_n_s32(in[i + 10], 2); + out[i + 11] = 
vshlq_n_s32(in[i + 11], 2); + out[i + 12] = vshlq_n_s32(in[i + 12], 2); + out[i + 13] = vshlq_n_s32(in[i + 13], 2); + out[i + 14] = vshlq_n_s32(in[i + 14], 2); + out[i + 15] = vshlq_n_s32(in[i + 15], 2); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +// 1D itx types +typedef enum ATTRIBUTE_PACKED { + IDCT_1D, + IADST_1D, + IFLIPADST_1D = IADST_1D, + IIDENTITY_1D, + ITX_TYPES_1D, +} ITX_TYPE_1D; + +static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { + IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, + IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, + IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, + IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, +}; +static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { + IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, + IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, + IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, + IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, +}; + +static const transform_1d_neon + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { idct4x4_neon, NULL, NULL, NULL }, + { iadst4x4_neon, NULL, NULL, NULL }, + { iidentity4_neon, iidentity4_neon, iidentity4_neon, NULL }, + }, + { { idct8x8_low1_neon, idct8x8_new_neon, NULL, NULL }, + { iadst8x8_low1_neon, iadst8x8_new_neon, NULL, NULL }, + { iidentity8_neon, iidentity8_neon, NULL, NULL } }, + { + { idct16x16_low1_neon, idct16x16_low8_neon, idct16x16_neon, NULL }, + { iadst16x16_low1_neon, iadst16x16_low8_neon, iadst16x16_neon, NULL }, + { iidentity16_neon, NULL, iidentity16_neon, NULL }, + }, + { { idct32x32_low1_neon, idct32x32_low8_neon, idct32x32_low16_neon, + idct32x32_neon }, + { NULL, NULL, NULL, NULL }, + { iidentity32_neon, 
NULL, NULL, NULL } }, + { { idct64x64_low1_neon, idct64x64_low8_neon, idct64x64_low16_neon, + idct64x64_neon }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +void av1_inv_txfm2d_add_4x8_neon(const tran_low_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, const int bd) { + TX_SIZE tx_size = TX_4X8; + int32x4_t buf1[32] = { vdupq_n_s32(0) }; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1]; + const int input_stride = AOMMIN(32, txfm_size_row); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[8]; + load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col); + load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col); + round_shift_rect_array_32_neon(buf0, buf0, txfm_size_row); + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]); + + if (lr_flip) { + TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6], + buf1[7]); + } else { + TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); + + round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + 
highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +void av1_inv_txfm2d_add_8x4_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, const int bd) { + TX_SIZE tx_size = TX_8X4; + int32x4_t buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[8]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + round_shift_rect_array_32_neon(buf0, buf0, txfm_size_col); + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *buf1_ptr; + if (lr_flip) { + flip_buf_neon(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < 2; i++) { + int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row; + transpose_4x4(buf1_cur, buf1_cur); + col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); + } + round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + // write to buffer + highbd_write_buffer_8xn_neon(buf1_ptr, output, stride, ud_flip, txfm_size_row, + bd); +} + +void av1_inv_txfm2d_add_4x16_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, const int bd) { + TX_SIZE tx_size = TX_4X16; + int32x4_t buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int 
txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_h_div8 = txfm_size_row >> 2; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; + const int input_stride = AOMMIN(32, txfm_size_row); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[16]; + for (int i = 0; i < (txfm_size_row >> 2); i++) { + const int32_t *input_row = input + i * 4; + int32x4_t *buf0_cur = buf0 + i * 4; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col); + row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]); + } + + if (lr_flip) { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], + buf1[4 * j + 3]); + } + } else { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], + buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], + buf1[4 * j + 2], buf1[4 * j + 3]); + } + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); + + round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +void av1_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, const int bd) { + TX_SIZE tx_size = TX_16X4; + int32x4_t buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + 
const int buf_size_w_div8 = txfm_size_col >> 2; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[16]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *buf1_ptr; + if (lr_flip) { + flip_buf_neon(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row; + transpose_4x4(buf1_cur, buf1_cur); + col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); + } + round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, txfm_size_row, + bd); + } +} + +static void highbd_inv_txfm2d_add_4x16_neon(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, int eob, + const int bd) { + (void)eob; + TX_SIZE tx_size = TX_4X16; + int32x4_t buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_h_div8 = txfm_size_row >> 2; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; + const int input_stride = 
AOMMIN(32, txfm_size_col); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[16]; + const int32_t *input_row = input; + int32x4_t *buf0_cur = buf0; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); + for (int i = 0; i < (txfm_size_row >> 2); i++) { + row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]); + } + + if (lr_flip) { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], + buf1[4 * j + 3]); + } + } else { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], + buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], + buf1[4 * j + 2], buf1[4 * j + 3]); + } + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); + + round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_16x4_neon(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, int eob, + const int bd) { + (void)eob; + TX_SIZE tx_size = TX_16X4; + int32x4_t buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + 
get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[16]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + for (int j = 0; j < buf_size_w_div8; j++) { + TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j], + buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); + } + row_txfm(buf1, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *buf1_ptr; + if (lr_flip) { + flip_buf_neon(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, + INV_COS_BIT, 1, bd, 0); + } + round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, txfm_size_row, + bd); + } +} + +static const int lowbd_txfm_all_1d_zeros_idx[32] = { + 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +// Transform block width in log2 for eob (size of 64 map to 32) +static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { + 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x16_default[16]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x32_default[32]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 
0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { + 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x32_default[32]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x16_default[16]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, + 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { + 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, +}; + +DECLARE_ALIGNED(16, static const int16_t *, + av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { + NULL, + av1_eob_to_eobxy_8x8_default, + av1_eob_to_eobxy_16x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x16_default, + av1_eob_to_eobxy_16x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, + av1_eob_to_eobxy_32x32_default, + 
av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x32_default, + av1_eob_to_eobxy_32x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, +}; + +static INLINE void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + if (eob == 1) { + *eobx = 0; + *eoby = 0; + return; + } + + const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; + const int eob_row = (eob - 1) >> tx_w_log2; + const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; + *eobx = eobxy & 0xFF; + *eoby = eobxy >> 8; +} + +static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size) { + if (tx_size == 2) { + *eoby = 15, *eobx = 15; + } else if (tx_size == 3) { + *eoby = 31, *eobx = 31; + } else if (tx_size == 4) { + *eoby = 31, *eobx = 31; + } else if (tx_size == 7) { + *eoby = 15, *eobx = 7; + } else if (tx_size == 8) { + *eoby = 7, *eobx = 15; + } else if (tx_size == 9) { + *eoby = 31, *eobx = 15; + } else if (tx_size == 10) { + *eoby = 15, *eobx = 31; + } else if (tx_size == 11) { + *eoby = 31, *eobx = 31; + } else if (tx_size == 12) { + *eoby = 31, *eobx = 31; + } else if (tx_size == 15) { + *eoby = 31, *eobx = 7; + } else if (tx_size == 16) { + *eoby = 7, *eobx = 31; + } else if (tx_size == 17) { + *eoby = 31, *eobx = 15; + } else if (tx_size == 18) { + *eoby = 15, *eobx = 31; + } else { + *eoby = 0, *eobx = 0; + } +} + +static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, + TX_SIZE tx_size) { + const int txfm_size_row = tx_size_high[tx_size]; + *eoby = AOMMIN(32, txfm_size_row) - 1; + *eobx = 0; +} + +static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, + TX_SIZE tx_size) { + const int txfm_size_col = tx_size_wide[tx_size]; + *eobx = AOMMIN(32, txfm_size_col) - 1; + *eoby = 0; +} + +static void inv_txfm2d_add_h_identity_neon(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + const int bd) { + int32x4_t buf1[64]; + 
int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = buf_size_w >> 2; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + assert(row_txfm != NULL); + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + assert(col_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (buf_size_h_div8 << 1); ++i) { + int32x4_t buf0[16]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_neon(buf0, buf0, buf_size_w); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *_buf1 = buf1 + i * 4; + + for (int j = 0; j < buf_size_w_div4; ++j) { + int32x4_t *buf0_cur = buf0 + j * 4; + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + _buf1[j * txfm_size_row + 0] = buf0_cur[0]; + _buf1[j * txfm_size_row + 1] = buf0_cur[1]; + _buf1[j * txfm_size_row + 2] = buf0_cur[2]; + _buf1[j * txfm_size_row + 3] = buf0_cur[3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_neon(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, 
txfm_size_row, + -shift[1]); + } + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } +} + +static void inv_txfm2d_add_v_identity_neon(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + const int bd) { + int32x4_t buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2; + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + assert(row_txfm != NULL); + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + assert(col_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (row_max >> 2); ++i) { + int32x4_t buf0[16]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * 
(buf_size_w_div4 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_neon(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } + } +} + +static void inv_txfm2d_add_idtx_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, const int bd) { + int32x4_t buf1[64 * 4]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int buf_size_w = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = buf_size_w >> 2; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + assert(row_txfm != NULL); + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + assert(col_txfm != NULL); + for (int i = 0; i < (row_max >> 2); ++i) { + int32x4_t buf0[32]; + load_buffer_32bit_input(input + i * 4, 
input_stride, buf0, buf_size_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_neon(buf0, buf0, buf_size_w); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *_buf1 = buf1 + i * 4; + for (int j = 0; j < buf_size_w_div4; ++j) { + int32x4_t *buf0_cur = buf0 + j * 4; + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + _buf1[j * txfm_size_row + 0] = buf0_cur[0]; + _buf1[j * txfm_size_row + 1] = buf0_cur[1]; + _buf1[j * txfm_size_row + 2] = buf0_cur[2]; + _buf1[j * txfm_size_row + 3] = buf0_cur[3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_neon(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, 0, txfm_size_row, bd); + } + } +} + +static void inv_txfm2d_add_no_identity_neon(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + const int bd) { + int32x4_t buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div4 = txfm_size_col >> 2; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const 
transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // 1st stage: column transform + for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { + int32x4_t buf0[64]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *_buf1 = &buf1[i * 4]; + + if (lr_flip) { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_neon(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } + } +} + +static void highbd_inv_txfm2d_add_no_identity_neon(const int32_t *input, + uint16_t *output, int 
stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + int32x4_t buf1[64 * 16]; + int eobx, eoby; + highbd_get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // 1st stage: column transform + for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { + int32x4_t buf0[64]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) { + int32x4_t *buf0_cur = &buf0[j * 4]; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w_div8 << 3); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *_buf1 = &buf1[i * 4]; + + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], 
buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_neon(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } + } +} + +static void highbd_inv_txfm2d_add_universe_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + case IDTX: + inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + default: assert(0); break; + } +} + 
+/* Routes an inverse 2D transform-and-add to one of four kernels chosen by tx_type: the nine DCT/ADST/FLIPADST combinations use the no-identity kernel, V_* types the h-identity kernel, H_* types the v-identity kernel and IDTX the idtx kernel. output is passed through CONVERT_TO_SHORTPTR before it reaches the kernel; any other tx_type hits assert(0). */ static void inv_txfm2d_add_universe_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, bd); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + case IDTX: + inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + default: assert(0); break; + } +} +/* 8x8 entry point: tx types that contain an identity pass (IDTX, H_*, V_*) go through highbd_inv_txfm2d_add_universe_neon together with the eob; every other type calls av1_inv_txfm2d_add_8x8_neon on the int32 view of input. */ +void av1_highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + case IDTX: + case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, + txfm_param->tx_size, txfm_param->eob, + bd); + break; + default: + av1_inv_txfm2d_add_8x8_neon(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + } +} +/* 4x4 entry point: lossless blocks are handed to av1_highbd_iwht4x4_add (tx_type is asserted to be DCT_DCT in that case); otherwise av1_inv_txfm2d_add_4x4_neon runs on the int32 view of input. */ +void av1_highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + int eob = txfm_param->eob; + int bd = txfm_param->bd; + int lossless = txfm_param->lossless; + const int32_t *src = cast_to_int32(input); + const TX_TYPE tx_type = txfm_param->tx_type; + if (lossless) { + assert(tx_type == DCT_DCT); + 
av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); + return; + } + av1_inv_txfm2d_add_4x4_neon(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} +/* The 4x8 and 8x4 entry points read tx_type and bd from txfm_param and forward to the matching av1_inv_txfm2d_add_*_neon kernel with CONVERT_TO_SHORTPTR(dest). */ +void av1_highbd_inv_txfm_add_4x8_neon(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x4_neon(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} +/* Each av1_inv_txfm2d_add_WxH_neon wrapper in this run casts dest to uint8_t * and calls inv_txfm2d_add_universe_neon with its fixed TX_WXH constant; that callee applies CONVERT_TO_SHORTPTR to the pointer it receives. */ +void av1_inv_txfm2d_add_8x16_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X16, + bd); +} +/* The 4x16 and 16x4 entry points pass eob in addition to tx_type and bd to their highbd_inv_txfm2d_add_*_neon kernels. */ +void av1_highbd_inv_txfm_add_4x16_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, eob, bd); +} + +void av1_highbd_inv_txfm_add_16x4_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, eob, bd); +} +/* Each av1_highbd_inv_txfm_add_WxH_neon wrapper from here on forwards tx_type, its fixed TX_WXH size, eob and bd from txfm_param to highbd_inv_txfm2d_add_universe_neon, passing dest through unchanged. */ +void av1_highbd_inv_txfm_add_8x16_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_8X16, txfm_param->eob, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x8_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_16X8, 
txfm_param->eob, txfm_param->bd); +} + +void av1_inv_txfm2d_add_16x8_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X8, + bd); +} + +void av1_highbd_inv_txfm_add_16x32_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_16X32, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_16x32_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_16X32, bd); +} + +void av1_highbd_inv_txfm_add_32x16_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_32X16, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_32x16_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_32X16, bd); +} + +void av1_highbd_inv_txfm_add_32x32_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_32X32, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_32x32_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_32X32, bd); +} + +void av1_highbd_inv_txfm_add_64x64_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_64X64, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_64x64_neon(const 
tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_64X64, bd); +} + +void av1_highbd_inv_txfm_add_32x64_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_32X64, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_32x64_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_32X64, bd); +} + +void av1_highbd_inv_txfm_add_64x32_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_64X32, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_64x32_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_64X32, bd); +} + +void av1_highbd_inv_txfm_add_64x16_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_64X16, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_64x16_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_64X16, bd); +} + +void av1_highbd_inv_txfm_add_16x64_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_16X64, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_16x64_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + 
inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_16X64, bd); +} + +void av1_highbd_inv_txfm_add_16x16_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_16X16, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_16x16_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_16X16, bd); +} + +void av1_highbd_inv_txfm_add_32x8_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_32X8, txfm_param->eob, txfm_param->bd); +} + +void av1_inv_txfm2d_add_32x8_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X8, + bd); +} + +void av1_highbd_inv_txfm_add_8x32_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_8X32, txfm_param->eob, txfm_param->bd); +} + +void av1_inv_txfm2d_add_8x32_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X32, + bd); +} +/* Top-level dispatcher on txfm_param->tx_size. TX_8X8 and TX_4X4 go to their av1_highbd_inv_txfm_add_* entry points; TX_4X8, TX_8X4, TX_16X4 and TX_4X16 call av1_inv_txfm2d_add_*_neon with CONVERT_TO_SHORTPTR(dest); every remaining size calls av1_inv_txfm2d_add_*_neon with a plain (uint16_t *) cast of dest. The switch has no default case, so a tx_size not listed here does nothing. */ +void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const TX_SIZE tx_size = txfm_param->tx_size; + + TX_TYPE tx_type = txfm_param->tx_type; + int bd = txfm_param->bd; + switch (tx_size) { + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_neon(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + 
break; + case TX_8X4: + av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_neon(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + break; + case TX_4X16: + av1_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + break; + case TX_8X16: + av1_inv_txfm2d_add_8x16_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_16X8: + av1_inv_txfm2d_add_16x8_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_16X32: + av1_inv_txfm2d_add_16x32_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_32X16: + av1_inv_txfm2d_add_32x16_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_16X16: + av1_inv_txfm2d_add_16x16_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_32X32: + av1_inv_txfm2d_add_32x32_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_64X64: + av1_inv_txfm2d_add_64x64_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_32X64: + av1_inv_txfm2d_add_32x64_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_64X32: + av1_inv_txfm2d_add_64x32_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_16X64: + av1_inv_txfm2d_add_16x64_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_64X16: + av1_inv_txfm2d_add_64x16_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_32X8: + av1_inv_txfm2d_add_32x8_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_8X32: + av1_inv_txfm2d_add_8x32_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + } +} diff --git a/third_party/aom/av1/common/arm/highbd_reconinter_neon.c
b/third_party/aom/av1/common/arm/highbd_reconinter_neon.c new file mode 100644 index 0000000000..da7f6c57d0 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_reconinter_neon.c @@ -0,0 +1,327 @@ +/* + * + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +/* NOTE(review): the three #include directives above carry no header name in this copy; the angle-bracket operands look stripped in extraction. Presumably arm_neon.h, assert.h and stdbool.h (NEON intrinsics, assert and bool are all used below) - confirm against upstream. */ +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/blend.h" +#include "aom_ports/mem.h" +#include "config/av1_rtcd.h" +/* Builds a difference-weighted mask from two 16-bit sources. Per pixel, |src0 - src1| is narrowed to 8 bits with a right shift of DIFF_FACTOR_LOG2 (bd == 8), 2 + DIFF_FACTOR_LOG2 (bd == 10) or 4 + DIFF_FACTOR_LOG2 (any other bd, asserted to be 12). The mask byte is min(diff + 38, AOM_BLEND_A64_MAX_ALPHA), or, when inverse is set, the saturating difference (AOM_BLEND_A64_MAX_ALPHA - 38) - diff. Each bit depth has three width paths: w >= 16 steps 16 pixels at a time until the remaining width is exactly 0, w == 8 handles one vector per row, and w == 4 processes two rows per iteration. Mask rows are written w bytes apart. The three bit-depth branches are otherwise identical; only the immediate shift count differs. */ +static INLINE void diffwtd_mask_highbd_neon(uint8_t *mask, bool inverse, + const uint16_t *src0, + int src0_stride, + const uint16_t *src1, + int src1_stride, int h, int w, + const unsigned int bd) { + assert(DIFF_FACTOR > 0); + uint8x16_t max_alpha = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA); + uint8x16_t mask_base = vdupq_n_u8(38); + uint8x16_t mask_diff = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA - 38); + + if (bd == 8) { + if (w >= 16) { + do { + uint8_t *mask_ptr = mask; + const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0_lo = vld1q_u16(src0_ptr); + uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8); + uint16x8_t s1_lo = vld1q_u16(src1_ptr); + uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8); + + uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo); + uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi); + uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2); + uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2); + uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8); + + uint8x16_t m; + if (inverse) { + m = 
vqsubq_u8(mask_diff, diff); + } else { + m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha); + } + + vst1q_u8(mask_ptr, m); + + src0_ptr += 16; + src1_ptr += 16; + mask_ptr += 16; + width -= 16; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 8) { + do { + uint8_t *mask_ptr = mask; + const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0 = vld1q_u16(src0_ptr); + uint16x8_t s1 = vld1q_u16(src1_ptr); + + uint16x8_t diff_u16 = vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + vst1_u8(mask_ptr, m); + + src0_ptr += 8; + src1_ptr += 8; + mask_ptr += 8; + width -= 8; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 4) { + do { + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t diff_u16 = vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + store_u8x4_strided_x2(mask, w, m); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + mask += 2 * w; + h -= 2; + } while (h != 0); + } + } else if (bd == 10) { + if (w >= 16) { + do { + uint8_t *mask_ptr = mask; + const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0_lo = vld1q_u16(src0_ptr); + uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8); + uint16x8_t s1_lo = vld1q_u16(src1_ptr); + uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8); + + uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, 
s1_lo); + uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi); + uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 2 + DIFF_FACTOR_LOG2); + uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 2 + DIFF_FACTOR_LOG2); + uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8); + + uint8x16_t m; + if (inverse) { + m = vqsubq_u8(mask_diff, diff); + } else { + m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha); + } + + vst1q_u8(mask_ptr, m); + + src0_ptr += 16; + src1_ptr += 16; + mask_ptr += 16; + width -= 16; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 8) { + do { + uint8_t *mask_ptr = mask; + const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0 = vld1q_u16(src0_ptr); + uint16x8_t s1 = vld1q_u16(src1_ptr); + + uint16x8_t diff_u16 = vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + vst1_u8(mask_ptr, m); + + src0_ptr += 8; + src1_ptr += 8; + mask_ptr += 8; + width -= 8; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 4) { + do { + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t diff_u16 = vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + store_u8x4_strided_x2(mask, w, m); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + mask += 2 * w; + h -= 2; + } while (h != 0); + } + } else { + assert(bd == 12); + if (w >= 16) { + do { + uint8_t *mask_ptr = mask; + 
const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0_lo = vld1q_u16(src0_ptr); + uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8); + uint16x8_t s1_lo = vld1q_u16(src1_ptr); + uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8); + + uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo); + uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi); + uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 4 + DIFF_FACTOR_LOG2); + uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 4 + DIFF_FACTOR_LOG2); + uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8); + + uint8x16_t m; + if (inverse) { + m = vqsubq_u8(mask_diff, diff); + } else { + m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha); + } + + vst1q_u8(mask_ptr, m); + + src0_ptr += 16; + src1_ptr += 16; + mask_ptr += 16; + width -= 16; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 8) { + do { + uint8_t *mask_ptr = mask; + const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0 = vld1q_u16(src0_ptr); + uint16x8_t s1 = vld1q_u16(src1_ptr); + + uint16x8_t diff_u16 = vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + vst1_u8(mask_ptr, m); + + src0_ptr += 8; + src1_ptr += 8; + mask_ptr += 8; + width -= 8; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 4) { + do { + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t diff_u16 = vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = 
vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + store_u8x4_strided_x2(mask, w, m); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + mask += 2 * w; + h -= 2; + } while (h != 0); + } + } +} +/* Public entry point. Asserts that h and w are multiples of 4 and that mask_type is DIFFWTD_38 or DIFFWTD_38_INV, then runs diffwtd_mask_highbd_neon on the CONVERT_TO_SHORTPTR views of src0 and src1, with inverse false for DIFFWTD_38 and true otherwise. */ +void av1_build_compound_diffwtd_mask_highbd_neon( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + assert(h % 4 == 0); + assert(w % 4 == 0); + assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38); + + if (mask_type == DIFFWTD_38) { + diffwtd_mask_highbd_neon(mask, /*inverse=*/false, CONVERT_TO_SHORTPTR(src0), + src0_stride, CONVERT_TO_SHORTPTR(src1), + src1_stride, h, w, bd); + } else { // mask_type == DIFFWTD_38_INV + diffwtd_mask_highbd_neon(mask, /*inverse=*/true, CONVERT_TO_SHORTPTR(src0), + src0_stride, CONVERT_TO_SHORTPTR(src1), + src1_stride, h, w, bd); + } +} diff --git a/third_party/aom/av1/common/arm/highbd_reconintra_neon.c b/third_party/aom/av1/common/arm/highbd_reconintra_neon.c new file mode 100644 index 0000000000..170491b504 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_reconintra_neon.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include +/* NOTE(review): the two #include directives above carry no header name in this copy; the angle-bracket operands look stripped in extraction. Presumably arm_neon.h and assert.h - confirm against upstream. memcpy is also used below without a visible string.h include. */ +#include "aom_dsp/arm/sum_neon.h" + +#define MAX_UPSAMPLE_SZ 16 +/* Smooths the edge p[0..sz-1] in place, leaving p[0] untouched; returns immediately when strength is 0. The edge is first copied into a local buffer with its first sample replicated once in front and its last sample replicated twice behind. strength 1 applies taps {4, 8, 4}, strength 2 taps {5, 6, 5}, and any other non-zero strength taps {2, 4, 4, 4, 2}. Full groups of 8 outputs are stored directly; the final partial group is computed as a whole vector and then blended with the current dst contents through a lane mask, so lanes at or beyond the remaining count are written back unchanged. NOTE(review): that tail still loads and stores a full 8-lane vector at dst, so p presumably has readable and writable space past sz - confirm with callers. */ +void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength) { + if (!strength) return; + assert(sz >= 0 && sz <= 129); + + DECLARE_ALIGNED(16, static const uint16_t, + idx[8]) = { 0, 1, 2, 3, 4, 5, 6, 7 }; + const uint16x8_t index = vld1q_u16(idx); + + uint16_t edge[160]; // Max value of sz + enough padding for vector accesses. + memcpy(edge + 1, p, sz * sizeof(*p)); + + // Populate extra space appropriately. + edge[0] = edge[1]; + edge[sz + 1] = edge[sz]; + edge[sz + 2] = edge[sz]; + + // Don't overwrite first pixel. + uint16_t *dst = p + 1; + sz--; + + if (strength == 1) { // Filter: {4, 8, 4}. + const uint16_t *src = edge + 1; + + while (sz >= 8) { + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + + // Make use of the identity: + // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2 + uint16x8_t t0 = vaddq_u16(s0, s2); + uint16x8_t t1 = vaddq_u16(s1, s1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint16x8_t res = vrshrq_n_u16(sum, 2); + + vst1q_u16(dst, res); + + src += 8; + dst += 8; + sz -= 8; + } + + if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + + // Make use of the identity: + // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2 + uint16x8_t t0 = vaddq_u16(s0, s2); + uint16x8_t t1 = vaddq_u16(s1, s1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint16x8_t res = vrshrq_n_u16(sum, 2); + + // Mask off out-of-bounds indices. + uint16x8_t current_dst = vld1q_u16(dst); + uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index); + res = vbslq_u16(mask, res, current_dst); + + vst1q_u16(dst, res); + } + } else if (strength == 2) { // Filter: {5, 6, 5}.
+ const uint16_t *src = edge + 1; + + const uint16x8x3_t filter = { { vdupq_n_u16(5), vdupq_n_u16(6), + vdupq_n_u16(5) } }; + while (sz >= 8) { + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + + uint16x8_t accum = vmulq_u16(s0, filter.val[0]); + accum = vmlaq_u16(accum, s1, filter.val[1]); + accum = vmlaq_u16(accum, s2, filter.val[2]); + uint16x8_t res = vrshrq_n_u16(accum, 4); + + vst1q_u16(dst, res); + + src += 8; + dst += 8; + sz -= 8; + } + + if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + + uint16x8_t accum = vmulq_u16(s0, filter.val[0]); + accum = vmlaq_u16(accum, s1, filter.val[1]); + accum = vmlaq_u16(accum, s2, filter.val[2]); + uint16x8_t res = vrshrq_n_u16(accum, 4); + + // Mask off out-of-bounds indices. + uint16x8_t current_dst = vld1q_u16(dst); + uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index); + res = vbslq_u16(mask, res, current_dst); + + vst1q_u16(dst, res); + } + } else { // Filter {2, 4, 4, 4, 2}. + const uint16_t *src = edge; + + while (sz >= 8) { + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + uint16x8_t s3 = vld1q_u16(src + 3); + uint16x8_t s4 = vld1q_u16(src + 4); + + // Make use of the identity: + // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3 + uint16x8_t t0 = vaddq_u16(s0, s4); + uint16x8_t t1 = vaddq_u16(s1, s2); + t1 = vaddq_u16(t1, s3); + t1 = vaddq_u16(t1, t1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint16x8_t res = vrshrq_n_u16(sum, 3); + + vst1q_u16(dst, res); + + src += 8; + dst += 8; + sz -= 8; + } + + if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values.
+ uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + uint16x8_t s3 = vld1q_u16(src + 3); + uint16x8_t s4 = vld1q_u16(src + 4); + + // Make use of the identity: + // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3 + uint16x8_t t0 = vaddq_u16(s0, s4); + uint16x8_t t1 = vaddq_u16(s1, s2); + t1 = vaddq_u16(t1, s3); + t1 = vaddq_u16(t1, t1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint16x8_t res = vrshrq_n_u16(sum, 3); + + // Mask off out-of-bounds indices. + uint16x8_t current_dst = vld1q_u16(dst); + uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index); + res = vbslq_u16(mask, res, current_dst); + + vst1q_u16(dst, res); + } + } +} +/* Doubles the resolution of an edge in place; returns immediately when sz is 0. A padded local copy is built from p[-1], p[-1], p[0..sz-1], p[sz-1], and p[-2] is overwritten with p[-1]. Each loop iteration computes 8 interpolated samples as 9 * (s1 + s2) minus (s0 + s3) with a saturating subtraction, a rounding right shift by 4 and a clamp to (1 << bd) - 1, interleaves them with the 8 source samples s2 and stores the 16 results starting at p[-1]. The bd == 12 branch accumulates in 32 bits; the other branch stays in 16 bits. NOTE(review): stores always cover whole 16-sample groups and loads may read local-buffer entries that were never written when sz is not a multiple of 8, so callers presumably provide space past the 2 * sz outputs and ignore the surplus - confirm. */ +void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd) { + if (!sz) return; + + assert(sz <= MAX_UPSAMPLE_SZ); + + uint16_t edge[MAX_UPSAMPLE_SZ + 3]; + const uint16_t *src = edge; + + // Copy p[-1..(sz-1)] and pad out both ends. + edge[0] = p[-1]; + edge[1] = p[-1]; + memcpy(edge + 2, p, sz * 2); + edge[sz + 2] = p[sz - 1]; + p[-2] = p[-1]; + + uint16x8_t pixel_val_max = vdupq_n_u16((1 << bd) - 1); + + uint16_t *dst = p - 1; + + if (bd == 12) { + do { + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + uint16x8_t s3 = vld1q_u16(src + 3); + + uint16x8_t t0 = vaddq_u16(s1, s2); + uint16x8_t t1 = vaddq_u16(s0, s3); + uint32x4_t acc0 = vmull_n_u16(vget_low_u16(t0), 9); + acc0 = vqsubq_u32(acc0, vmovl_u16(vget_low_u16(t1))); + uint32x4_t acc1 = vmull_n_u16(vget_high_u16(t0), 9); + acc1 = vqsubq_u32(acc1, vmovl_u16(vget_high_u16(t1))); + + uint16x8x2_t res; + res.val[0] = vcombine_u16(vrshrn_n_u32(acc0, 4), vrshrn_n_u32(acc1, 4)); + // Clamp pixel values at bitdepth maximum. + res.val[0] = vminq_u16(res.val[0], pixel_val_max); + res.val[1] = s2; + + vst2q_u16(dst, res); + + src += 8; + dst += 16; + sz -= 8; + } while (sz > 0); + } else { // Bit depth is 8 or 10.
+ do { + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + uint16x8_t s3 = vld1q_u16(src + 3); + + uint16x8_t t0 = vaddq_u16(s0, s3); + uint16x8_t t1 = vaddq_u16(s1, s2); + t1 = vmulq_n_u16(t1, 9); + t1 = vqsubq_u16(t1, t0); + + uint16x8x2_t res; + res.val[0] = vrshrq_n_u16(t1, 4); + // Clamp pixel values at bitdepth maximum. + res.val[0] = vminq_u16(res.val[0], pixel_val_max); + res.val[1] = s2; + + vst2q_u16(dst, res); + + src += 8; + dst += 16; + sz -= 8; + } while (sz > 0); + } +} diff --git a/third_party/aom/av1/common/arm/highbd_warp_plane_neon.c b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.c new file mode 100644 index 0000000000..c6f1e3ad92 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/scale.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" +#include "highbd_warp_plane_neon.h" + +static INLINE int16x8_t highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, + int sx, int alpha) { + int16x8_t f[4]; + load_filters_4(f, sx, alpha); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + + int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res = horizontal_add_4d_s32x4(m0123); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); +} + +static INLINE int16x8_t highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, + int sx, int alpha) { + int16x8_t f[8]; + load_filters_8(f, sx, alpha); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 4); + int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 5); + int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 6); + int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 7); + + int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3)); + int32x4_t m4 = vmull_s16(vget_low_s16(f[4]), vget_low_s16(rv4)); + m4 = vmlal_s16(m4, vget_high_s16(f[4]), vget_high_s16(rv4)); + int32x4_t m5 = vmull_s16(vget_low_s16(f[5]), vget_low_s16(rv5)); + 
m5 = vmlal_s16(m5, vget_high_s16(f[5]), vget_high_s16(rv5)); + int32x4_t m6 = vmull_s16(vget_low_s16(f[6]), vget_low_s16(rv6)); + m6 = vmlal_s16(m6, vget_high_s16(f[6]), vget_high_s16(rv6)); + int32x4_t m7 = vmull_s16(vget_low_s16(f[7]), vget_low_s16(rv7)); + m7 = vmlal_s16(m7, vget_high_s16(f[7]), vget_high_s16(rv7)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + int32x4_t m4567[] = { m4, m5, m6, m7 }; + + const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = horizontal_add_4d_s32x4(m0123); + int32x4_t res1 = horizontal_add_4d_s32x4(m4567); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); +} + +static INLINE int16x8_t highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, + int sx) { + int16x8_t f = load_filters_1(sx); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + + int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; 
+ + const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res = horizontal_add_4d_s32x4(m0123); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); +} + +static INLINE int16x8_t highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd, + int sx) { + int16x8_t f = load_filters_1(sx); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 4); + int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 5); + int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 6); + int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 7); + + int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3)); + int32x4_t m4 = vmull_s16(vget_low_s16(f), vget_low_s16(rv4)); + m4 = vmlal_s16(m4, vget_high_s16(f), vget_high_s16(rv4)); + int32x4_t m5 = vmull_s16(vget_low_s16(f), vget_low_s16(rv5)); + m5 = vmlal_s16(m5, 
vget_high_s16(f), vget_high_s16(rv5)); + int32x4_t m6 = vmull_s16(vget_low_s16(f), vget_low_s16(rv6)); + m6 = vmlal_s16(m6, vget_high_s16(f), vget_high_s16(rv6)); + int32x4_t m7 = vmull_s16(vget_low_s16(f), vget_low_s16(rv7)); + m7 = vmlal_s16(m7, vget_high_s16(f), vget_high_s16(rv7)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + int32x4_t m4567[] = { m4, m5, m6, m7 }; + + const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = horizontal_add_4d_s32x4(m0123); + int32x4_t res1 = horizontal_add_4d_s32x4(m4567); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); +} + /* Vertical pass for four columns that share the single filter selected by sy (load_filters_1): the low halves of tmp[0..7] are each scaled by the matching filter tap and accumulated into 32-bit lanes. No offset or rounding is applied here. */ +static INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy) { + const int16x8_t f = load_filters_1(sy); + const int16x4_t f0123 = vget_low_s16(f); + const int16x4_t f4567 = vget_high_s16(f); + + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); + return m0123; +} + /* Eight-column form of vertical_filter_4x1_f1: val[0] accumulates the low halves of tmp[0..7] and val[1] the high halves, every row scaled by the matching tap of the single filter selected by sy. */ +static INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy) { + const int16x8_t f = load_filters_1(sy); + const int16x4_t f0123 = vget_low_s16(f); + const int16x4_t f4567 = vget_high_s16(f); + + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); + m0123 = 
vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); + + int32x4_t m4567 = vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3); + return (int32x4x2_t){ { m0123, m4567 } }; +} + /* Vertical pass for four columns that each use their own filter. The low halves of tmp[0..7] go through transpose_elems_s16_4x8 into s0..s3 (presumably one vector of eight samples per column; confirm against that helper), four filters are fetched with load_filters_4(f, sy, gamma), and each s/f product pair is reduced with horizontal_add_4d_s32x4. No offset or rounding is applied here. */ +static INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, int sy, + int gamma) { + int16x8_t s0, s1, s2, s3; + transpose_elems_s16_4x8( + vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]), + vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]), + vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3); + + int16x8_t f[4]; + load_filters_4(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + return horizontal_add_4d_s32x4(m0123); +} + +static INLINE int32x4x2_t 
vertical_filter_8x1_f8(const int16x8_t *tmp, int sy, + int gamma) { /* Vertical pass for eight columns with one filter per column: tmp[0..7] are copied and transposed in place, eight filters are fetched with load_filters_8(f, sy, gamma), and the eight products are reduced four at a time into val[0] and val[1]. */ + int16x8_t s0 = tmp[0]; + int16x8_t s1 = tmp[1]; + int16x8_t s2 = tmp[2]; + int16x8_t s3 = tmp[3]; + int16x8_t s4 = tmp[4]; + int16x8_t s5 = tmp[5]; + int16x8_t s6 = tmp[6]; + int16x8_t s7 = tmp[7]; + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t f[8]; + load_filters_8(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4])); + m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4])); + int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5])); + m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5])); + int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6])); + m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6])); + int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7])); + m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7])); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + int32x4_t m4567[] = { m4, m5, m6, m7 }; + + int32x4x2_t ret; + ret.val[0] = horizontal_add_4d_s32x4(m0123); + ret.val[1] = horizontal_add_4d_s32x4(m4567); + return ret; +} + /* NEON entry point for the high bit-depth affine warp; every argument is forwarded unchanged to highbd_warp_affine_common. */ +void av1_highbd_warp_affine_neon(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + 
highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); +} diff --git a/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h new file mode 100644 index 0000000000..3b8982898e --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h @@ -0,0 +1,424 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ +#define AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ + +#include <arm_neon.h> +#include <assert.h> +#include <stdbool.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/scale.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" + /* Per-row filter kernels called by the helpers in this header. They are only declared here; the definitions are expected to come from the source file that includes this header. */ +static INLINE int16x8_t highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, + int sx, int alpha); + +static INLINE int16x8_t highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, + int sx, int alpha); + +static INLINE int16x8_t highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, + int sx); + +static INLINE int16x8_t highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd, + int sx); + +static INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy); + +static INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy); + +static INLINE int32x4_t 
vertical_filter_4x1_f4(const int16x8_t *tmp, int sy, + int gamma); + +static INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, int sy, + int gamma); + /* Loads the eight taps of one warp filter. ofs is turned into a row index with ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS); base already points WARPEDPIXEL_PREC_SHIFTS rows (eight taps each) into av1_warped_filter, so a rounded index of 0 selects that row. */ +static INLINE int16x8_t load_filters_1(int ofs) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (const int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + return vld1q_s16(base + ofs0 * 8); +} + /* Loads four filters into out[0..3]; filter n is looked up like load_filters_1 but with the offset advanced by n times stride. */ +static INLINE void load_filters_4(int16x8_t out[], int ofs, int stride) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); + const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); + const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); + const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (const int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + out[0] = vld1q_s16(base + ofs0 * 8); + out[1] = vld1q_s16(base + ofs1 * 8); + out[2] = vld1q_s16(base + ofs2 * 8); + out[3] = vld1q_s16(base + ofs3 * 8); +} + /* Loads eight filters into out[0..7]; filter n is looked up like load_filters_1 but with the offset advanced by n times stride. */ +static INLINE void load_filters_8(int16x8_t out[], int ofs, int stride) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); + const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); + const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); + const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); + const int ofs4 = ROUND_POWER_OF_TWO(ofs + stride * 4, WARPEDDIFF_PREC_BITS); + const int ofs5 = ROUND_POWER_OF_TWO(ofs + stride * 5, WARPEDDIFF_PREC_BITS); + const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS); + const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (const int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + out[0] = vld1q_s16(base + ofs0 * 8); + out[1] = vld1q_s16(base + ofs1 * 8); + out[2] = vld1q_s16(base + ofs2 * 8); + out[3] = vld1q_s16(base + ofs3 * 
8); + out[4] = vld1q_s16(base + ofs4 * 8); + out[5] = vld1q_s16(base + ofs5 * 8); + out[6] = vld1q_s16(base + ofs6 * 8); + out[7] = vld1q_s16(base + ofs7 * 8); +} + /* Clamps four signed 32-bit values to [0, (1 << bd) - 1]: vminq_s32 applies the upper limit and the unsigned saturating narrow clamps negative values to zero. */ +static INLINE uint16x4_t clip_pixel_highbd_vec(int32x4_t val, int bd) { + const int limit = (1 << bd) - 1; + return vqmovun_s32(vminq_s32(val, vdupq_n_s32(limit))); +} + /* Horizontal filtering stage: fills tmp[0..14], one filtered row per entry, where entry k reads row clamp(iy4 + k - 7, 0, height - 1) of ref. If ix4 <= -7 or ix4 >= width + 6 every entry is a constant computed from the first or last pixel of its row and no filter is run. Otherwise sixteen samples are loaded starting at column ix4 - 7, lanes outside the frame are replaced by the edge pixel, and a kernel chosen from p_width, alpha and beta is applied. */ +static INLINE void warp_affine_horizontal(const uint16_t *ref, int width, + int height, int stride, int p_width, + int16_t alpha, int16_t beta, int iy4, + int sx4, int ix4, int16x8_t tmp[], + int bd) { + const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + + if (ix4 <= -7) { + for (int k = 0; k < 15; ++k) { + int iy = clamp(iy4 + k - 7, 0, height - 1); + int32_t dup_val = (1 << (bd + FILTER_BITS - round0 - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - round0)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } else if (ix4 >= width + 6) { + for (int k = 0; k < 15; ++k) { + int iy = clamp(iy4 + k - 7, 0, height - 1); + int32_t dup_val = + (1 << (bd + FILTER_BITS - round0 - 1)) + + ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - round0)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } + /* Lane indices 0..15 of the sixteen loaded samples, compared against the boundary positions below. */ + static const uint16_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; + const uint16x8_t indx0 = vld1q_u16(kIotaArr); + const uint16x8_t indx1 = vld1q_u16(kIotaArr + 8); + + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + +#define APPLY_HORIZONTAL_SHIFT(fn, ...) 
\ + do { \ + if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + /* Lanes with index <= out_of_boundary_left take the first pixel of the row; lanes with index >= 15 - out_of_boundary_right take the last pixel of the row. */ \ + if (out_of_boundary_left >= 0) { \ + uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left); \ + uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]); \ + uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec); \ + uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec); \ + src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); \ + src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); \ + } \ + if (out_of_boundary_right >= 0) { \ + uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right); \ + uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]); \ + uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec); \ + uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec); \ + src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); \ + src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); \ + } \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } else { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } \ + } while (0) + /* Kernel selection: 4-wide kernels when p_width == 4 and 8-wide otherwise, the single-filter _f1 form when alpha == 0, and a per-row sx of sx4 + beta * (k - 3) when beta != 0. The k in those arguments is the row counter declared inside APPLY_HORIZONTAL_SHIFT. */ + if (p_width == 4) { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f1, bd, sx4); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f4, bd, sx4, alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f1, bd, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f4, bd, + (sx4 + beta * (k - 3)), alpha); + } + } + } else { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f1, bd, sx4); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f8, bd, sx4, alpha); + } + } else { + if (alpha == 0) { + 
APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f1, bd, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f8, bd, + (sx4 + beta * (k - 3)), alpha); + } + } + } +} + /* Filters one output row of four pixels vertically and stores it. The filter sum (single-filter kernel when gamma == 0) is offset by 1 << (bd + 2 * FILTER_BITS - round0). Non-compound: rounding shift by 2 * FILTER_BITS - round0, subtract (1 << (bd - 1)) + (1 << bd), clip to bd bits, store to pred. Compound: rounding shift by COMPOUND_ROUND1_BITS, then either store the saturated value to dst (do_average false) or combine it with the value already in dst (fwd/bwd weighting or a halving add), subtract the offset constant, round, clip and store to pred. */ +static INLINE void highbd_vertical_filter_4x1_f4( + uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, + bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, + int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { + int32x4_t sum0 = gamma == 0 ? vertical_filter_4x1_f1(tmp, sy) + : vertical_filter_4x1_f4(tmp, sy, gamma); + + const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; + + sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); + + uint16_t *dst16 = &pred[i * p_stride + j]; + + if (!is_compound) { + const int reduce_bits_vert = 2 * FILTER_BITS - round0; + sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); + + const int res_sub_const = (1 << (bd - 1)) + (1 << bd); + sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); + uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); + vst1_u16(dst16, res0); + return; + } + + sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); + + uint16_t *p = &dst[i * dst_stride + j]; + + if (!do_average) { + vst1_u16(p, vqmovun_s32(sum0)); + return; + } + + uint16x4_t p0 = vld1_u16(p); + int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(p0)); + if (use_dist_wtd_comp_avg) { + p_vec0 = vmulq_n_s32(p_vec0, fwd); + p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); + p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); + } else { + p_vec0 = vhaddq_s32(p_vec0, sum0); + } + + const int offset_bits = bd + 2 * FILTER_BITS - round0; + const int round1 = COMPOUND_ROUND1_BITS; + const int res_sub_const = + (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); + const int round_bits = 2 * FILTER_BITS - round0 - round1; + + p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); + p_vec0 = vrshlq_s32(p_vec0, 
vdupq_n_s32(-round_bits)); + uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); + vst1_u16(dst16, res0); +} + /* Eight-pixel counterpart of highbd_vertical_filter_4x1_f4: the two four-lane sums from vertical_filter_8x1_f1 (gamma == 0) or vertical_filter_8x1_f8 get the same offset, rounding, optional compound combination with dst, and clipping, and are stored as two groups of four pixels. */ +static INLINE void highbd_vertical_filter_8x1_f8( + uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, + bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, + int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { + int32x4x2_t sums = gamma == 0 ? vertical_filter_8x1_f1(tmp, sy) + : vertical_filter_8x1_f8(tmp, sy, gamma); + int32x4_t sum0 = sums.val[0]; + int32x4_t sum1 = sums.val[1]; + + const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; + + sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); + sum1 = vaddq_s32(sum1, vdupq_n_s32(1 << offset_bits_vert)); + + uint16_t *dst16 = &pred[i * p_stride + j]; + + if (!is_compound) { + const int reduce_bits_vert = 2 * FILTER_BITS - round0; + sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); + sum1 = vrshlq_s32(sum1, vdupq_n_s32(-reduce_bits_vert)); + + const int res_sub_const = (1 << (bd - 1)) + (1 << bd); + sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); + sum1 = vsubq_s32(sum1, vdupq_n_s32(res_sub_const)); + uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); + uint16x4_t res1 = clip_pixel_highbd_vec(sum1, bd); + vst1_u16(dst16, res0); + vst1_u16(dst16 + 4, res1); + return; + } + + sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); + sum1 = vrshrq_n_s32(sum1, COMPOUND_ROUND1_BITS); + + uint16_t *p = &dst[i * dst_stride + j]; + + if (!do_average) { + vst1_u16(p, vqmovun_s32(sum0)); + vst1_u16(p + 4, vqmovun_s32(sum1)); + return; + } + + uint16x8_t p0 = vld1q_u16(p); + int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(p0))); + int32x4_t p_vec1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(p0))); + if (use_dist_wtd_comp_avg) { + p_vec0 = vmulq_n_s32(p_vec0, fwd); + p_vec1 = vmulq_n_s32(p_vec1, fwd); + p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); + p_vec1 = 
vmlaq_n_s32(p_vec1, sum1, bwd); + p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); + p_vec1 = vshrq_n_s32(p_vec1, DIST_PRECISION_BITS); + } else { + p_vec0 = vhaddq_s32(p_vec0, sum0); + p_vec1 = vhaddq_s32(p_vec1, sum1); + } + + const int offset_bits = bd + 2 * FILTER_BITS - round0; + const int round1 = COMPOUND_ROUND1_BITS; + const int res_sub_const = + (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); + const int round_bits = 2 * FILTER_BITS - round0 - round1; + + p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); + p_vec1 = vsubq_s32(p_vec1, vdupq_n_s32(res_sub_const)); + + p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); + p_vec1 = vrshlq_s32(p_vec1, vdupq_n_s32(-round_bits)); + uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); + uint16x4_t res1 = clip_pixel_highbd_vec(p_vec1, bd); + vst1_u16(dst16, res0); + vst1_u16(dst16 + 4, res1); +} + /* Vertical filtering stage for one block: produces limit_height output rows (8 when p_height > 4, otherwise 4). Output row k reads the eight intermediate rows starting at tmp[k], writes image row i + k at column j, and uses the offset sy4 + delta * k. The 8-wide kernel is used when p_width > 4 and the 4-wide kernel otherwise. */ +static INLINE void warp_affine_vertical( + uint16_t *pred, int p_width, int p_height, int p_stride, int bd, + uint16_t *dst, int dst_stride, bool is_compound, bool do_average, + bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, int16_t delta, + const int16x8_t *tmp, int i, int sy4, int j) { + int limit_height = p_height > 4 ? 
8 : 4; + + if (p_width > 4) { + // p_width == 8 + for (int k = 0; k < limit_height; ++k) { + int sy = sy4 + delta * k; + highbd_vertical_filter_8x1_f8( + pred, p_stride, bd, dst, dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); + } + } else { + // p_width == 4 + for (int k = 0; k < limit_height; ++k) { + int sy = sy4 + delta * k; + highbd_vertical_filter_4x1_f4( + pred, p_stride, bd, dst, dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); + } + } +} + /* Shared body of the high bit-depth affine warp. The prediction block is walked in steps of eight rows and eight columns; for each step the integer source position (ix4, iy4) and the fractional offsets (sx4, sy4) are derived from mat, warp_affine_horizontal fills a 15-row intermediate buffer, and warp_affine_vertical writes the output rows. The compound settings are read once from conv_params, and dst must be non-NULL when is_compound is set (asserted). */ +static INLINE void highbd_warp_affine_common( + const int32_t *mat, const uint16_t *ref, int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + uint16_t *const dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const bool is_compound = conv_params->is_compound; + const bool do_average = conv_params->do_average; + const bool use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int fwd = conv_params->fwd_offset; + const int bwd = conv_params->bck_offset; + + assert(IMPLIES(is_compound, dst != NULL)); + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4 + p_col) << subsampling_x; + const int32_t src_y = (i + 4 + p_row) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> 
subsampling_y; + /* Split each coordinate into an integer pixel position and a fraction of WARPEDMODEL_PREC_BITS bits. */ + const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + /* Step the offsets back from the block centre by four units of (alpha, beta) and of (gamma, delta). */ + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + /* Clear the low WARP_PARAM_REDUCE_BITS bits of both offsets. */ + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Each horizontal filter result is formed by the sum of up to eight + // multiplications by filter values and then a shift. Although both the + // inputs and filters are loaded as int16, the input data is at most bd + // bits and the filters are at most 8 bits each. Additionally since we + // know all possible filter values we know that the sum of absolute + // filter values will fit in at most 9 bits. With this in mind we can + // conclude that the sum of each filter application will fit in bd + 9 + // bits. The shift following the summation is ROUND0_BITS (which is 3), + // +2 for 12-bit, which gives us a final storage of: + // bd == 8: ( 8 + 9) - 3 => 14 bits + // bd == 10: (10 + 9) - 3 => 16 bits + // bd == 12: (12 + 9) - 5 => 16 bits + // So it is safe to use int16x8_t as the intermediate storage type here. + int16x8_t tmp[15]; + + warp_affine_horizontal(ref, width, height, stride, p_width, alpha, beta, + iy4, sx4, ix4, tmp, bd); + warp_affine_vertical(pred, p_width, p_height, p_stride, bd, dst, + dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, delta, tmp, + i, sy4, j); + } + } +} + +#endif // AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c b/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c new file mode 100644 index 0000000000..a6bd6d38e4 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + /* NOTE(review): both system header names above were missing from this copy; arm_neon.h is required by the NEON types used below, assert.h is inferred, so confirm against upstream. */ +#include "aom_dsp/arm/mem_neon.h" +#include "av1/common/convolve.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + /* Generates name##_wiener_convolve5_8_2d_h, the 5-tap horizontal kernel producing eight pixels, and name##_convolve_add_src_5tap_horiz, which runs it over a w x h region. shift is the rounding right shift applied to the 32-bit sums before they are clamped to im_max_val. */ +#define HBD_WIENER_5TAP_HORIZ(name, shift) \ + static INLINE uint16x8_t name##_wiener_convolve5_8_2d_h( \ + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ + const int16x8_t s3, const int16x8_t s4, const int16x4_t x_filter, \ + const int32x4_t round_vec, const uint16x8_t im_max_val) { \ + /* Wiener filter is symmetric so add mirrored source elements. */ \ + int16x8_t s04 = vaddq_s16(s0, s4); \ + int16x8_t s13 = vaddq_s16(s1, s3); \ + \ + /* x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) 
*/ \ + int32x4_t sum_lo = \ + vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3); \ + \ + int32x4_t sum_hi = \ + vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3); \ + \ + uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ + uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ + \ + return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \ + } \ + /* Row driver: eight output pixels per inner step; the inner loop only ends when width reaches exactly 0, so w is expected to be a positive multiple of 8 and h at least 1. */ \ + static INLINE void name##_convolve_add_src_5tap_horiz( \ + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \ + const int32x4_t round_vec, const uint16x8_t im_max_val) { \ + do { \ + /* Keep the const qualifier of src_ptr when reinterpreting as signed. */ \ + const int16_t *s = (const int16_t *)src_ptr; \ + uint16_t *d = dst_ptr; \ + int width = w; \ + \ + do { \ + int16x8_t s0, s1, s2, s3, s4; \ + load_s16_8x5(s, 1, &s0, &s1, &s2, &s3, &s4); \ + \ + uint16x8_t d0 = name##_wiener_convolve5_8_2d_h( \ + s0, s1, s2, s3, s4, x_filter, round_vec, im_max_val); \ + \ + vst1q_u16(d, d0); \ + \ + s += 8; \ + d += 8; \ + width -= 8; \ + } while (width != 0); \ + src_ptr += src_stride; \ + dst_ptr += dst_stride; \ + } while (--h != 0); \ + } + /* The highbd_12 instantiation shifts by two more bits than the highbd one. */ +HBD_WIENER_5TAP_HORIZ(highbd, WIENER_ROUND0_BITS) +HBD_WIENER_5TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2) + +#undef HBD_WIENER_5TAP_HORIZ + /* 7-tap counterpart of HBD_WIENER_5TAP_HORIZ: generates name##_wiener_convolve7_8_2d_h and name##_convolve_add_src_7tap_horiz. */ +#define HBD_WIENER_7TAP_HORIZ(name, shift) \ + static INLINE uint16x8_t name##_wiener_convolve7_8_2d_h( \ + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \ + const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec, \ + const uint16x8_t im_max_val) { \ + /* Wiener filter is symmetric so add mirrored source elements. 
*/ \ + int16x8_t s06 = vaddq_s16(s0, s6); \ + int16x8_t s15 = vaddq_s16(s1, s5); \ + int16x8_t s24 = vaddq_s16(s2, s4); \ + \ + int32x4_t sum_lo = \ + vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); \ + \ + int32x4_t sum_hi = \ + vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); \ + \ + uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ + uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ + \ + return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \ + } \ + /* Row driver: eight output pixels per inner step; the inner loop only ends when width reaches exactly 0, so w is expected to be a positive multiple of 8 and h at least 1. */ \ + static INLINE void name##_convolve_add_src_7tap_horiz( \ + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \ + const int32x4_t round_vec, const uint16x8_t im_max_val) { \ + do { \ + /* Keep the const qualifier of src_ptr when reinterpreting as signed. */ \ + const int16_t *s = (const int16_t *)src_ptr; \ + uint16_t *d = dst_ptr; \ + int width = w; \ + \ + do { \ + int16x8_t s0, s1, s2, s3, s4, s5, s6; \ + load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \ + \ + uint16x8_t d0 = name##_wiener_convolve7_8_2d_h( \ + s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, im_max_val); \ + \ + vst1q_u16(d, d0); \ + \ + s += 8; \ + d += 8; \ + width -= 8; \ + } while (width != 0); \ + src_ptr += src_stride; \ + dst_ptr += dst_stride; \ + } while (--h != 0); \ + } + /* The highbd_12 instantiation shifts by two more bits than the highbd one. */ +HBD_WIENER_7TAP_HORIZ(highbd, WIENER_ROUND0_BITS) +HBD_WIENER_7TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2) + +#undef HBD_WIENER_7TAP_HORIZ + /* Generates name##_wiener_convolve5_8_2d_v, the 5-tap vertical kernel producing eight pixels from five rows, and name##_convolve_add_src_5tap_vert, which runs it over a w x h region. shift is the rounding right shift applied before the result is clamped to res_max_val. */ +#define HBD_WIENER_5TAP_VERT(name, shift) \ + static INLINE uint16x8_t name##_wiener_convolve5_8_2d_v( \ + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ + 
const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter, \ + const int32x4_t round_vec, const uint16x8_t res_max_val) { \ + const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \ + const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \ + /* Wiener filter is symmetric so add mirrored source elements. */ \ + int32x4_t s04_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s4)); \ + int32x4_t s13_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s3)); \ + \ + /* y_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) */ \ + int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s04_lo, y_filter_lo, 1); \ + sum_lo = vmlaq_lane_s32(sum_lo, s13_lo, y_filter_hi, 0); \ + sum_lo = \ + vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s2)), y_filter_hi, 1); \ + \ + int32x4_t s04_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s4)); \ + int32x4_t s13_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s3)); \ + \ + int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s04_hi, y_filter_lo, 1); \ + sum_hi = vmlaq_lane_s32(sum_hi, s13_hi, y_filter_hi, 0); \ + sum_hi = \ + vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s2)), y_filter_hi, 1); \ + \ + uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ + uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ + \ + return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \ + } \ + \ + static INLINE void name##_convolve_add_src_5tap_vert( \ + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ + ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \ + const int32x4_t round_vec, const uint16x8_t res_max_val) { \ + do { \ + const int16_t *s = (int16_t *)src_ptr; \ + uint16_t *d = dst_ptr; \ + int height = h; \ + \ + while (height > 3) { \ + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; \ + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); \ + \ + uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \ + s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \ + uint16x8_t d1 = 
name##_wiener_convolve5_8_2d_v( \ + s1, s2, s3, s4, s5, y_filter, round_vec, res_max_val); \ + uint16x8_t d2 = name##_wiener_convolve5_8_2d_v( \ + s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \ + uint16x8_t d3 = name##_wiener_convolve5_8_2d_v( \ + s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \ + \ + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \ + \ + s += 4 * src_stride; \ + d += 4 * dst_stride; \ + height -= 4; \ + } \ + \ + while (height-- != 0) { \ + int16x8_t s0, s1, s2, s3, s4; \ + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); \ + \ + uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \ + s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \ + \ + vst1q_u16(d, d0); \ + \ + s += src_stride; \ + d += dst_stride; \ + } \ + \ + src_ptr += 8; \ + dst_ptr += 8; \ + w -= 8; \ + } while (w != 0); \ + } + +HBD_WIENER_5TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS) +HBD_WIENER_5TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2) + +#undef HBD_WIENER_5TAP_VERT + +#define HBD_WIENER_7TAP_VERT(name, shift) \ + static INLINE uint16x8_t name##_wiener_convolve7_8_2d_v( \ + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \ + const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec, \ + const uint16x8_t res_max_val) { \ + const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \ + const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \ + /* Wiener filter is symmetric so add mirrored source elements. 
*/ \ + int32x4_t s06_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s6)); \ + int32x4_t s15_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s5)); \ + int32x4_t s24_lo = vaddl_s16(vget_low_s16(s2), vget_low_s16(s4)); \ + \ + int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s06_lo, y_filter_lo, 0); \ + sum_lo = vmlaq_lane_s32(sum_lo, s15_lo, y_filter_lo, 1); \ + sum_lo = vmlaq_lane_s32(sum_lo, s24_lo, y_filter_hi, 0); \ + sum_lo = \ + vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s3)), y_filter_hi, 1); \ + \ + int32x4_t s06_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s6)); \ + int32x4_t s15_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s5)); \ + int32x4_t s24_hi = vaddl_s16(vget_high_s16(s2), vget_high_s16(s4)); \ + \ + int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s06_hi, y_filter_lo, 0); \ + sum_hi = vmlaq_lane_s32(sum_hi, s15_hi, y_filter_lo, 1); \ + sum_hi = vmlaq_lane_s32(sum_hi, s24_hi, y_filter_hi, 0); \ + sum_hi = \ + vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s3)), y_filter_hi, 1); \ + \ + uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ + uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ + \ + return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \ + } \ + \ + static INLINE void name##_convolve_add_src_7tap_vert( \ + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ + ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \ + const int32x4_t round_vec, const uint16x8_t res_max_val) { \ + do { \ + const int16_t *s = (int16_t *)src_ptr; \ + uint16_t *d = dst_ptr; \ + int height = h; \ + \ + while (height > 3) { \ + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; \ + load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, \ + &s8, &s9); \ + \ + uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \ + s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \ + uint16x8_t d1 = name##_wiener_convolve7_8_2d_v( \ + s1, s2, s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \ + uint16x8_t d2 = 
name##_wiener_convolve7_8_2d_v( \ + s2, s3, s4, s5, s6, s7, s8, y_filter, round_vec, res_max_val); \ + uint16x8_t d3 = name##_wiener_convolve7_8_2d_v( \ + s3, s4, s5, s6, s7, s8, s9, y_filter, round_vec, res_max_val); \ + \ + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \ + \ + s += 4 * src_stride; \ + d += 4 * dst_stride; \ + height -= 4; \ + } \ + \ + while (height-- != 0) { \ + int16x8_t s0, s1, s2, s3, s4, s5, s6; \ + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \ + \ + uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \ + s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \ + \ + vst1q_u16(d, d0); \ + \ + s += src_stride; \ + d += dst_stride; \ + } \ + \ + src_ptr += 8; \ + dst_ptr += 8; \ + w -= 8; \ + } while (w != 0); \ + } + +HBD_WIENER_7TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS) +HBD_WIENER_7TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2) + +#undef HBD_WIENER_7TAP_VERT + +static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) { + assert(filter[7] == 0); + if (filter[0] == 0 && filter[6] == 0) { + return WIENER_WIN_REDUCED; + } + return WIENER_WIN; +} + +void av1_highbd_wiener_convolve_add_src_neon( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *x_filter, int x_step_q4, + const int16_t *y_filter, int y_step_q4, int w, int h, + const WienerConvolveParams *conv_params, int bd) { + (void)x_step_q4; + (void)y_step_q4; + + assert(w % 8 == 0); + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(x_filter[7] == 0 && y_filter[7] == 0); + + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]); + + const int x_filter_taps = get_wiener_filter_taps(x_filter); + const int y_filter_taps = get_wiener_filter_taps(y_filter); + int16x4_t x_filter_s16 = vld1_s16(x_filter); + int16x4_t y_filter_s16 = vld1_s16(y_filter); + // Add 128 to tap 3. (Needed for rounding.) 
+ x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48)); + y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48)); + + const int im_stride = MAX_SB_SIZE; + const int im_h = h + y_filter_taps - 1; + const int horiz_offset = x_filter_taps / 2; + const int vert_offset = (y_filter_taps / 2) * (int)src_stride; + + const int extraprec_clamp_limit = + WIENER_CLAMP_LIMIT(conv_params->round_0, bd); + const uint16x8_t im_max_val = vdupq_n_u16(extraprec_clamp_limit - 1); + const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1)); + + const uint16x8_t res_max_val = vdupq_n_u16((1 << bd) - 1); + const int32x4_t vert_round_vec = + vdupq_n_s32(-(1 << (bd + conv_params->round_1 - 1))); + + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + if (bd == 12) { + if (x_filter_taps == WIENER_WIN_REDUCED) { + highbd_12_convolve_add_src_5tap_horiz( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, x_filter_s16, horiz_round_vec, im_max_val); + } else { + highbd_12_convolve_add_src_7tap_horiz( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, x_filter_s16, horiz_round_vec, im_max_val); + } + + if (y_filter_taps == WIENER_WIN_REDUCED) { + highbd_12_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride, + w, h, y_filter_s16, vert_round_vec, + res_max_val); + } else { + highbd_12_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride, + w, h, y_filter_s16, vert_round_vec, + res_max_val); + } + + } else { + if (x_filter_taps == WIENER_WIN_REDUCED) { + highbd_convolve_add_src_5tap_horiz( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, x_filter_s16, horiz_round_vec, im_max_val); + } else { + highbd_convolve_add_src_7tap_horiz( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, x_filter_s16, horiz_round_vec, im_max_val); + } + + if (y_filter_taps == WIENER_WIN_REDUCED) { + 
highbd_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride, w, + h, y_filter_s16, vert_round_vec, + res_max_val); + } else { + highbd_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride, w, + h, y_filter_s16, vert_round_vec, + res_max_val); + } + } +} diff --git a/third_party/aom/av1/common/arm/reconinter_neon.c b/third_party/aom/av1/common/arm/reconinter_neon.c new file mode 100644 index 0000000000..2b0274cc64 --- /dev/null +++ b/third_party/aom/av1/common/arm/reconinter_neon.c @@ -0,0 +1,217 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
+
+// Builds a difference-weighted mask from two CONV_BUF_TYPE (16-bit) sources.
+//
+// For every pixel: diff = (round_shift(|src0 - src1|, round)) >>
+// DIFF_FACTOR_LOG2, narrowed to 8 bits. The mask value is min(38 + diff, 64),
+// or, when `inverse` is set, 26 - diff saturated at 0 (i.e. 64 minus the
+// non-inverted value).
+//
+// The mask is written densely: `w` bytes per row with no padding between
+// rows. Only w == 4, w == 8 and w >= 16 are handled; the w >= 16 path steps
+// 16 pixels at a time and the w == 4 path two rows at a time.
+// NOTE(review): presumably callers guarantee w is a multiple of 16 when
+// w >= 16 and h is even when w == 4 - confirm against the call sites.
+static AOM_INLINE void diffwtd_mask_d16_neon(
+    uint8_t *mask, const bool inverse, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  const int round =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+  // A negative per-lane shift count turns vrshlq_u16 into a rounding right
+  // shift by `round`.
+  const int16x8_t round_vec = vdupq_n_s16((int16_t)(-round));
+
+  if (w >= 16) {
+    int i = 0;
+    do {
+      int j = 0;
+      do {
+        uint16x8_t s0_lo = vld1q_u16(src0 + j);
+        uint16x8_t s1_lo = vld1q_u16(src1 + j);
+        uint16x8_t s0_hi = vld1q_u16(src0 + j + 8);
+        uint16x8_t s1_hi = vld1q_u16(src1 + j + 8);
+
+        uint16x8_t diff_lo_u16 = vrshlq_u16(vabdq_u16(s0_lo, s1_lo), round_vec);
+        uint16x8_t diff_hi_u16 = vrshlq_u16(vabdq_u16(s0_hi, s1_hi), round_vec);
+        uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2);
+        uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2);
+        uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8);
+
+        uint8x16_t m;
+        if (inverse) {
+          m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
+        } else {
+          m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+        }
+
+        vst1q_u8(mask, m);
+
+        mask += 16;
+        j += 16;
+      } while (j < w);
+      src0 += src0_stride;
+      src1 += src1_stride;
+    } while (++i < h);
+  } else if (w == 8) {
+    int i = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src0);
+      uint16x8_t s1 = vld1q_u16(src1);
+
+      uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec);
+      uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+      uint8x8_t m;
+      if (inverse) {
+        m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8);  // Saturating to 0
+      } else {
+        m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64));
+      }
+
+      vst1_u8(mask, m);
+
+      mask += 8;
+      src0 += src0_stride;
+      src1 += src1_stride;
+    } while (++i < h);
+  } else if (w == 4) {
+    int i = 0;
+    do {
+      // Two 4-pixel rows are packed into one 8-lane vector.
+      uint16x8_t s0 =
+          vcombine_u16(vld1_u16(src0), vld1_u16(src0 + src0_stride));
+      uint16x8_t s1 =
+          vcombine_u16(vld1_u16(src1), vld1_u16(src1 + src1_stride));
+
+      uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec);
+      uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2);
+      uint8x8_t m;
+      if (inverse) {
+        m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8);  // Saturating to 0
+      } else {
+        m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64));
+      }
+
+      vst1_u8(mask, m);
+
+      mask += 8;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      i += 2;
+    } while (i < h);
+  }
+}
+
+// Public entry point for the 16-bit (CONV_BUF_TYPE) difference-weighted mask.
+// Dispatches on `mask_type` so that `inverse` is a compile-time constant in
+// each call to the inlined helper.
+void av1_build_compound_diffwtd_mask_d16_neon(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  assert(h >= 4);
+  assert(w >= 4);
+  assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
+
+  if (mask_type == DIFFWTD_38) {
+    diffwtd_mask_d16_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
+                          src1_stride, h, w, conv_params, bd);
+  } else {  // mask_type == DIFFWTD_38_INV
+    diffwtd_mask_d16_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
+                          src1_stride, h, w, conv_params, bd);
+  }
+}
+
+// Builds a difference-weighted mask from two 8-bit sources.
+//
+// For every pixel: diff = |src0 - src1| >> DIFF_FACTOR_LOG2. The mask value
+// is min(38 + diff, 64), or, when `inverse` is set, 26 - diff saturated at 0.
+//
+// The mask is written densely (`w` bytes per row). Every store is 16 bytes:
+// one row chunk for w >= 16, two rows for w == 8, four rows for w == 4.
+// NOTE(review): presumably h is a multiple of 2 (w == 8) / 4 (w == 4) as the
+// public wrapper asserts h % 4 == 0 - confirm for any other caller.
+static AOM_INLINE void diffwtd_mask_neon(uint8_t *mask, const bool inverse,
+                                         const uint8_t *src0, int src0_stride,
+                                         const uint8_t *src1, int src1_stride,
+                                         int h, int w) {
+  if (w >= 16) {
+    int i = 0;
+    do {
+      int j = 0;
+      do {
+        uint8x16_t s0 = vld1q_u8(src0 + j);
+        uint8x16_t s1 = vld1q_u8(src1 + j);
+
+        uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+        uint8x16_t m;
+        if (inverse) {
+          m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
+        } else {
+          m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+        }
+
+        vst1q_u8(mask, m);
+
+        mask += 16;
+        j += 16;
+      } while (j < w);
+      src0 += src0_stride;
+      src1 += src1_stride;
+    } while (++i < h);
+  } else if (w == 8) {
+    int i = 0;
+    do {
+      // Each source is advanced with its own stride; the second row of src1
+      // must not be addressed with src0_stride, as the strides may differ.
+      uint8x16_t s0 = vcombine_u8(vld1_u8(src0), vld1_u8(src0 + src0_stride));
+      uint8x16_t s1 = vcombine_u8(vld1_u8(src1), vld1_u8(src1 + src1_stride));
+
+      uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+      uint8x16_t m;
+      if (inverse) {
+        m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
+      } else {
+        m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+      }
+
+      vst1q_u8(mask, m);
+
+      mask += 16;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      i += 2;
+    } while (i < h);
+  } else if (w == 4) {
+    int i = 0;
+    do {
+      uint8x16_t s0 = load_unaligned_u8q(src0, src0_stride);
+      uint8x16_t s1 = load_unaligned_u8q(src1, src1_stride);
+
+      uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
+      uint8x16_t m;
+      if (inverse) {
+        m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
+      } else {
+        m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
+      }
+
+      vst1q_u8(mask, m);
+
+      mask += 16;
+      src0 += 4 * src0_stride;
+      src1 += 4 * src1_stride;
+      i += 4;
+    } while (i < h);
+  }
+}
+
+// Public entry point for the 8-bit difference-weighted mask. Dispatches on
+// `mask_type` so that `inverse` is a compile-time constant in each call to
+// the inlined helper.
+void av1_build_compound_diffwtd_mask_neon(uint8_t *mask,
+                                          DIFFWTD_MASK_TYPE mask_type,
+                                          const uint8_t *src0, int src0_stride,
+                                          const uint8_t *src1, int src1_stride,
+                                          int h, int w) {
+  assert(h % 4 == 0);
+  assert(w % 4 == 0);
+  assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38);
+
+  if (mask_type == DIFFWTD_38) {
+    diffwtd_mask_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
+                      src1_stride, h, w);
+  } else {  // mask_type == DIFFWTD_38_INV
+    diffwtd_mask_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
+                      src1_stride, h, w);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/reconintra_neon.c b/third_party/aom/av1/common/arm/reconintra_neon.c
new file mode 100644
index 0000000000..3db39987a6
--- /dev/null
+++ b/third_party/aom/av1/common/arm/reconintra_neon.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media.
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+#define MAX_UPSAMPLE_SZ 16
+
+// These kernels are a transposed version of those defined in reconintra.c,
+// with the absolute value of the negatives taken in the top row.
+//
+// Layout is [mode][tap][lane]: for each mode there are 7 tap rows, each
+// holding one coefficient per output lane of an 8-lane vector. Tap row 0 is
+// stored as magnitudes and is subtracted (vmlsl) by the predictor below; tap
+// rows 1..6 are accumulated (vmull/vmlal).
+DECLARE_ALIGNED(16, const uint8_t,
+                av1_filter_intra_taps_neon[FILTER_INTRA_MODES][7][8]) = {
+  // clang-format off
+  {
+      {  6,  5,  3,  3,  4,  3,  3,  3 },
+      { 10,  2,  1,  1,  6,  2,  2,  1 },
+      {  0, 10,  1,  1,  0,  6,  2,  2 },
+      {  0,  0, 10,  2,  0,  0,  6,  2 },
+      {  0,  0,  0, 10,  0,  0,  0,  6 },
+      { 12,  9,  7,  5,  2,  2,  2,  3 },
+      {  0,  0,  0,  0, 12,  9,  7,  5 }
+  },
+  {
+      { 10,  6,  4,  2, 10,  6,  4,  2 },
+      { 16,  0,  0,  0, 16,  0,  0,  0 },
+      {  0, 16,  0,  0,  0, 16,  0,  0 },
+      {  0,  0, 16,  0,  0,  0, 16,  0 },
+      {  0,  0,  0, 16,  0,  0,  0, 16 },
+      { 10,  6,  4,  2,  0,  0,  0,  0 },
+      {  0,  0,  0,  0, 10,  6,  4,  2 }
+  },
+  {
+      {  8,  8,  8,  8,  4,  4,  4,  4 },
+      {  8,  0,  0,  0,  4,  0,  0,  0 },
+      {  0,  8,  0,  0,  0,  4,  0,  0 },
+      {  0,  0,  8,  0,  0,  0,  4,  0 },
+      {  0,  0,  0,  8,  0,  0,  0,  4 },
+      { 16, 16, 16, 16,  0,  0,  0,  0 },
+      {  0,  0,  0,  0, 16, 16, 16, 16 }
+  },
+  {
+      {  2,  1,  1,  0,  1,  1,  1,  1 },
+      {  8,  3,  2,  1,  4,  3,  2,  2 },
+      {  0,  8,  3,  2,  0,  4,  3,  2 },
+      {  0,  0,  8,  3,  0,  0,  4,  3 },
+      {  0,  0,  0,  8,  0,  0,  0,  4 },
+      { 10,  6,  4,  2,  3,  4,  4,  3 },
+      {  0,  0,  0,  0, 10,  6,  4,  3 }
+  },
+  {
+      { 12, 10,  9,  8, 10,  9,  8,  7 },
+      { 14,  0,  0,  0, 12,  1,  0,  0 },
+      {  0, 14,  0,  0,  0, 12,  0,  0 },
+      {  0,  0, 14,  0,  0,  0, 12,  1 },
+      {  0,  0,  0, 14,  0,  0,  0, 12 },
+      { 14, 12, 11, 10,  0,  0,  1,  1 },
+      {  0,  0,  0,  0, 14, 12, 11,  9 }
+  }
+  // clang-format on
+};
+
+#define FILTER_INTRA_SCALE_BITS 4
+
+// Filter-intra prediction of a width x height block (both <= 32) into `dst`.
+//
+// A 33x33 scratch buffer holds the block plus a one-pixel border: row 0 is
+// above[-1..width-1] and column 0 is left[0..height-1]. The block is then
+// produced two rows at a time. Each 8-lane result is a 4-wide, 2-high patch
+// computed from seven neighbours: s0 (above-left), s1..s4 (the four pixels
+// above) and s5/s6 (the pixels to the left of the two rows). Every result is
+// written both to the scratch buffer, where it feeds later patches, and to
+// `dst`.
+//
+// `mode` indexes av1_filter_intra_taps_neon and is not range-checked here.
+void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride,
+                                     TX_SIZE tx_size, const uint8_t *above,
+                                     const uint8_t *left, int mode) {
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  assert(width <= 32 && height <= 32);
+
+  const uint8x8_t f0 = vld1_u8(av1_filter_intra_taps_neon[mode][0]);
+  const uint8x8_t f1 = vld1_u8(av1_filter_intra_taps_neon[mode][1]);
+  const uint8x8_t f2 = vld1_u8(av1_filter_intra_taps_neon[mode][2]);
+  const uint8x8_t f3 = vld1_u8(av1_filter_intra_taps_neon[mode][3]);
+  const uint8x8_t f4 = vld1_u8(av1_filter_intra_taps_neon[mode][4]);
+  const uint8x8_t f5 = vld1_u8(av1_filter_intra_taps_neon[mode][5]);
+  const uint8x8_t f6 = vld1_u8(av1_filter_intra_taps_neon[mode][6]);
+
+  uint8_t buffer[33][33];
+  // Populate the top row in the scratch buffer with data from above.
+  memcpy(buffer[0], &above[-1], (width + 1) * sizeof(uint8_t));
+  // Populate the first column in the scratch buffer with data from the left.
+  int r = 0;
+  do {
+    buffer[r + 1][0] = left[r];
+  } while (++r < height);
+
+  // Computing 4 cols per iteration (instead of 8) for 8x blocks is faster.
+  if (width <= 8) {
+    r = 1;
+    do {
+      int c = 1;
+      uint8x8_t s0 = vld1_dup_u8(&buffer[r - 1][c - 1]);
+      uint8x8_t s5 = vld1_dup_u8(&buffer[r + 0][c - 1]);
+      uint8x8_t s6 = vld1_dup_u8(&buffer[r + 1][c - 1]);
+
+      do {
+        uint8x8_t s1234 = load_u8_4x1(&buffer[r - 1][c - 1] + 1);
+        uint8x8_t s1 = vdup_lane_u8(s1234, 0);
+        uint8x8_t s2 = vdup_lane_u8(s1234, 1);
+        uint8x8_t s3 = vdup_lane_u8(s1234, 2);
+        uint8x8_t s4 = vdup_lane_u8(s1234, 3);
+
+        uint16x8_t sum = vmull_u8(s1, f1);
+        // First row of each filter has all negative values so subtract.
+        sum = vmlsl_u8(sum, s0, f0);
+        sum = vmlal_u8(sum, s2, f2);
+        sum = vmlal_u8(sum, s3, f3);
+        sum = vmlal_u8(sum, s4, f4);
+        sum = vmlal_u8(sum, s5, f5);
+        sum = vmlal_u8(sum, s6, f6);
+
+        // The accumulator may have wrapped below zero, so reinterpret it as
+        // signed before the rounding shift with unsigned saturation.
+        uint8x8_t res =
+            vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_INTRA_SCALE_BITS);
+
+        // Store buffer[r + 0][c] and buffer[r + 1][c].
+        store_u8x4_strided_x2(&buffer[r][c], 33, res);
+
+        store_u8x4_strided_x2(dst + (r - 1) * stride + c - 1, stride, res);
+
+        // The next patch to the right reuses this patch's right-most column:
+        // lane 3 / lane 7 are the last pixels of the upper / lower row.
+        s0 = s4;
+        s5 = vdup_lane_u8(res, 3);
+        s6 = vdup_lane_u8(res, 7);
+        c += 4;
+      } while (c < width + 1);
+
+      r += 2;
+    } while (r < height + 1);
+  } else {
+    r = 1;
+    do {
+      int c = 1;
+      uint8x8_t s0_lo = vld1_dup_u8(&buffer[r - 1][c - 1]);
+      uint8x8_t s5_lo = vld1_dup_u8(&buffer[r + 0][c - 1]);
+      uint8x8_t s6_lo = vld1_dup_u8(&buffer[r + 1][c - 1]);
+
+      do {
+        // Eight pixels from the row above: lanes 0..3 feed the left ("lo")
+        // patch, lanes 4..7 the right ("hi") patch.
+        uint8x8_t s1234 = vld1_u8(&buffer[r - 1][c - 1] + 1);
+        uint8x8_t s1_lo = vdup_lane_u8(s1234, 0);
+        uint8x8_t s2_lo = vdup_lane_u8(s1234, 1);
+        uint8x8_t s3_lo = vdup_lane_u8(s1234, 2);
+        uint8x8_t s4_lo = vdup_lane_u8(s1234, 3);
+
+        uint16x8_t sum_lo = vmull_u8(s1_lo, f1);
+        // First row of each filter has all negative values so subtract.
+        sum_lo = vmlsl_u8(sum_lo, s0_lo, f0);
+        sum_lo = vmlal_u8(sum_lo, s2_lo, f2);
+        sum_lo = vmlal_u8(sum_lo, s3_lo, f3);
+        sum_lo = vmlal_u8(sum_lo, s4_lo, f4);
+        sum_lo = vmlal_u8(sum_lo, s5_lo, f5);
+        sum_lo = vmlal_u8(sum_lo, s6_lo, f6);
+
+        uint8x8_t res_lo = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_lo),
+                                          FILTER_INTRA_SCALE_BITS);
+
+        // The right patch depends on the left patch's right-most column.
+        uint8x8_t s0_hi = s4_lo;
+        uint8x8_t s1_hi = vdup_lane_u8(s1234, 4);
+        uint8x8_t s2_hi = vdup_lane_u8(s1234, 5);
+        uint8x8_t s3_hi = vdup_lane_u8(s1234, 6);
+        uint8x8_t s4_hi = vdup_lane_u8(s1234, 7);
+        uint8x8_t s5_hi = vdup_lane_u8(res_lo, 3);
+        uint8x8_t s6_hi = vdup_lane_u8(res_lo, 7);
+
+        uint16x8_t sum_hi = vmull_u8(s1_hi, f1);
+        // First row of each filter has all negative values so subtract.
+        sum_hi = vmlsl_u8(sum_hi, s0_hi, f0);
+        sum_hi = vmlal_u8(sum_hi, s2_hi, f2);
+        sum_hi = vmlal_u8(sum_hi, s3_hi, f3);
+        sum_hi = vmlal_u8(sum_hi, s4_hi, f4);
+        sum_hi = vmlal_u8(sum_hi, s5_hi, f5);
+        sum_hi = vmlal_u8(sum_hi, s6_hi, f6);
+
+        uint8x8_t res_hi = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_hi),
+                                          FILTER_INTRA_SCALE_BITS);
+
+        // Regroup the two 4x2 patches into two 8-pixel rows:
+        // val[0] = upper row (lo, hi), val[1] = lower row (lo, hi).
+        uint32x2x2_t res =
+            vzip_u32(vreinterpret_u32_u8(res_lo), vreinterpret_u32_u8(res_hi));
+
+        vst1_u8(&buffer[r + 0][c], vreinterpret_u8_u32(res.val[0]));
+        vst1_u8(&buffer[r + 1][c], vreinterpret_u8_u32(res.val[1]));
+
+        vst1_u8(dst + (r - 1) * stride + c - 1,
+                vreinterpret_u8_u32(res.val[0]));
+        vst1_u8(dst + (r + 0) * stride + c - 1,
+                vreinterpret_u8_u32(res.val[1]));
+
+        s0_lo = s4_hi;
+        s5_lo = vdup_lane_u8(res_hi, 3);
+        s6_lo = vdup_lane_u8(res_hi, 7);
+        c += 8;
+      } while (c < width + 1);
+
+      r += 2;
+    } while (r < height + 1);
+  }
+}
+
+// Smooths the intra edge p[0..sz-1] in place; p[0] is never modified.
+//
+// `strength` selects the kernel: 0 leaves the edge untouched, 1 applies
+// {4, 8, 4}, 2 applies {5, 6, 5} and any other value applies
+// {2, 4, 4, 4, 2}; all kernels are normalised by a rounding shift of 4 bits.
+// The input is first copied into a local array that replicates the first
+// pixel once and the last pixel twice, so taps that reach past either end
+// see the nearest edge pixel.
+//
+// The final partial vector is handled with a full 8-byte load and store of
+// the destination in which lanes at or beyond the remaining count are
+// rewritten with the value just loaded.
+// NOTE(review): that tail access still touches up to 8 bytes starting at the
+// tail position - presumably the caller's buffer is padded; confirm.
+void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) {
+  if (!strength) return;
+  assert(sz >= 0 && sz <= 129);
+
+  uint8_t edge[160];  // Max value of sz + enough padding for vector accesses.
+  memcpy(edge + 1, p, sz * sizeof(*p));
+
+  // Populate extra space appropriately.
+  edge[0] = edge[1];
+  edge[sz + 1] = edge[sz];
+  edge[sz + 2] = edge[sz];
+
+  // Don't overwrite first pixel.
+  uint8_t *dst = p + 1;
+  sz--;
+
+  if (strength == 1) {  // Filter: {4, 8, 4}.
+    const uint8_t *src = edge + 1;
+
+    while (sz >= 8) {
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+
+      // Make use of the identity:
+      // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
+      uint16x8_t t0 = vaddl_u8(s0, s2);
+      uint16x8_t t1 = vaddl_u8(s1, s1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint8x8_t res = vrshrn_n_u16(sum, 2);
+
+      vst1_u8(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+
+      uint16x8_t t0 = vaddl_u8(s0, s2);
+      uint16x8_t t1 = vaddl_u8(s1, s1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint8x8_t res = vrshrn_n_u16(sum, 2);
+
+      // Mask off out-of-bounds indices.
+      uint8x8_t current_dst = vld1_u8(dst);
+      uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));
+      res = vbsl_u8(mask, res, current_dst);
+
+      vst1_u8(dst, res);
+    }
+  } else if (strength == 2) {  // Filter: {5, 6, 5}.
+    const uint8_t *src = edge + 1;
+
+    const uint8x8x3_t filter = { { vdup_n_u8(5), vdup_n_u8(6), vdup_n_u8(5) } };
+
+    while (sz >= 8) {
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+
+      uint16x8_t accum = vmull_u8(s0, filter.val[0]);
+      accum = vmlal_u8(accum, s1, filter.val[1]);
+      accum = vmlal_u8(accum, s2, filter.val[2]);
+      uint8x8_t res = vrshrn_n_u16(accum, 4);
+
+      vst1_u8(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+
+      uint16x8_t accum = vmull_u8(s0, filter.val[0]);
+      accum = vmlal_u8(accum, s1, filter.val[1]);
+      accum = vmlal_u8(accum, s2, filter.val[2]);
+      uint8x8_t res = vrshrn_n_u16(accum, 4);
+
+      // Mask off out-of-bounds indices.
+      uint8x8_t current_dst = vld1_u8(dst);
+      uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));
+      res = vbsl_u8(mask, res, current_dst);
+
+      vst1_u8(dst, res);
+    }
+  } else {  // Filter {2, 4, 4, 4, 2}.
+    // The 5-tap kernel reaches two pixels to the left, so start one element
+    // earlier than the 3-tap kernels do.
+    const uint8_t *src = edge;
+
+    while (sz >= 8) {
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+      uint8x8_t s3 = vld1_u8(src + 3);
+      uint8x8_t s4 = vld1_u8(src + 4);
+
+      // Make use of the identity:
+      // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
+      uint16x8_t t0 = vaddl_u8(s0, s4);
+      uint16x8_t t1 = vaddl_u8(s1, s2);
+      t1 = vaddw_u8(t1, s3);
+      t1 = vaddq_u16(t1, t1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint8x8_t res = vrshrn_n_u16(sum, 3);
+
+      vst1_u8(dst, res);
+
+      src += 8;
+      dst += 8;
+      sz -= 8;
+    }
+
+    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
+      uint8x8_t s0 = vld1_u8(src);
+      uint8x8_t s1 = vld1_u8(src + 1);
+      uint8x8_t s2 = vld1_u8(src + 2);
+      uint8x8_t s3 = vld1_u8(src + 3);
+      uint8x8_t s4 = vld1_u8(src + 4);
+
+      uint16x8_t t0 = vaddl_u8(s0, s4);
+      uint16x8_t t1 = vaddl_u8(s1, s2);
+      t1 = vaddw_u8(t1, s3);
+      t1 = vaddq_u16(t1, t1);
+      uint16x8_t sum = vaddq_u16(t0, t1);
+      uint8x8_t res = vrshrn_n_u16(sum, 3);
+
+      // Mask off out-of-bounds indices.
+      uint8x8_t current_dst = vld1_u8(dst);
+      uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100));
+      res = vbsl_u8(mask, res, current_dst);
+
+      vst1_u8(dst, res);
+    }
+  }
+}
+
+// Doubles the resolution of the intra edge p[-1..sz-1] in place.
+//
+// Output is written from p[-1] onwards as interleaved pairs: an interpolated
+// sample followed by the original sample p[i]. The interpolated sample is
+// (9 * (b + c) - (a + d)) with a rounding shift of 4 bits and saturation to
+// 8 bits, where a..d are four consecutive samples of the padded edge
+// { p[-1], p[-1], p[0], ..., p[sz-1], p[sz-1] }. p[-2] is set to p[-1].
+//
+// Each loop iteration consumes 8 inputs and stores 16 bytes, also when fewer
+// than 8 inputs remain.
+// NOTE(review): presumably the caller's buffer has room for the full 16-byte
+// stores, and the unused lanes read from the uninitialised tail of `edge`
+// never matter - confirm.
+void av1_upsample_intra_edge_neon(uint8_t *p, int sz) {
+  if (!sz) return;
+
+  assert(sz <= MAX_UPSAMPLE_SZ);
+
+  uint8_t edge[MAX_UPSAMPLE_SZ + 3];
+  const uint8_t *src = edge;
+
+  // Copy p[-1..(sz-1)] and pad out both ends.
+  edge[0] = p[-1];
+  edge[1] = p[-1];
+  memcpy(edge + 2, p, sz);
+  edge[sz + 2] = p[sz - 1];
+  p[-2] = p[-1];
+
+  uint8_t *dst = p - 1;
+
+  do {
+    uint8x8_t s0 = vld1_u8(src);
+    uint8x8_t s1 = vld1_u8(src + 1);
+    uint8x8_t s2 = vld1_u8(src + 2);
+    uint8x8_t s3 = vld1_u8(src + 3);
+
+    int16x8_t t0 = vreinterpretq_s16_u16(vaddl_u8(s0, s3));
+    int16x8_t t1 = vreinterpretq_s16_u16(vaddl_u8(s1, s2));
+    t1 = vmulq_n_s16(t1, 9);
+    t1 = vsubq_s16(t1, t0);
+
+    // vst2 interleaves the two vectors: filtered, original, filtered, ...
+    uint8x8x2_t res = { { vqrshrun_n_s16(t1, 4), s2 } };
+
+    vst2_u8(dst, res);
+
+    src += 8;
+    dst += 16;
+    sz -= 8;
+  } while (sz > 0);
+}
diff --git a/third_party/aom/av1/common/arm/resize_neon.c b/third_party/aom/av1/common/arm/resize_neon.c
new file mode 100644
index 0000000000..b00ebd1fc2
--- /dev/null
+++ b/third_party/aom/av1/common/arm/resize_neon.c
@@ -0,0 +1,1178 @@
+/*
+ *
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ +#include +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/resize.h" +#include "config/av1_rtcd.h" +#include "config/aom_scale_rtcd.h" + +static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); + sum = vmla_lane_s16(sum, s1, filter_lo, 1); + sum = vmla_lane_s16(sum, s2, filter_lo, 2); + sum = vmla_lane_s16(sum, s5, filter_hi, 1); + sum = vmla_lane_s16(sum, s6, filter_hi, 2); + sum = vmla_lane_s16(sum, s7, filter_hi, 3); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); + sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); + return sum; +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); + return vqrshrun_n_s16(sum, 7); +} + +static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, + const int16x8_t filter) { + int16x8_t ss0 = vreinterpretq_s16_u16(vmovl_u8(s[0])); + int16x8_t ss1 = vreinterpretq_s16_u16(vmovl_u8(s[1])); + int16x8_t ss2 = 
vreinterpretq_s16_u16(vmovl_u8(s[2])); + int16x8_t ss3 = vreinterpretq_s16_u16(vmovl_u8(s[3])); + int16x8_t ss4 = vreinterpretq_s16_u16(vmovl_u8(s[4])); + int16x8_t ss5 = vreinterpretq_s16_u16(vmovl_u8(s[5])); + int16x8_t ss6 = vreinterpretq_s16_u16(vmovl_u8(s[6])); + int16x8_t ss7 = vreinterpretq_s16_u16(vmovl_u8(s[7])); + + return convolve8_8(ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7, filter); +} + +static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x2_t s = vld2q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x4_t s = vld4q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_bilinear_kernel( + const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2, + const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1, + uint8_t *const dst) { + const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0); + const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0); + const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0); + const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0); + const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1); + const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1); + const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), 
coef1); + const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1); + + const uint8x8_t hor0 = vrshrn_n_u16(h4, 7); // temp: 00 01 02 03 04 05 06 07 + const uint8x8_t hor1 = vrshrn_n_u16(h5, 7); // temp: 08 09 0A 0B 0C 0D 0E 0F + const uint8x8_t hor2 = vrshrn_n_u16(h6, 7); // temp: 10 11 12 13 14 15 16 17 + const uint8x8_t hor3 = vrshrn_n_u16(h7, 7); // temp: 18 19 1A 1B 1C 1D 1E 1F + const uint16x8_t v0 = vmull_u8(hor0, coef0); + const uint16x8_t v1 = vmull_u8(hor1, coef0); + const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1); + const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1); + // dst: 0 1 2 3 4 5 6 7 8 9 A B C D E F + const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7)); + vst1q_u8(dst, d); +} + +static INLINE void scale_plane_2_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // 000 002 004 006 008 00A 00C 00E 010 012 014 016 018 01A 01C 01E + // 001 003 005 007 009 00B 00D 00F 011 013 015 017 019 01B 01D 01F + const uint8x16x2_t s0 = vld2q_u8(src0); + // 100 102 104 106 108 10A 10C 10E 110 112 114 116 118 11A 11C 11E + // 101 103 105 107 109 10B 10D 10F 111 113 115 117 119 11B 11D 11F + const uint8x16x2_t s1 = vld2q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 32; + src1 += 32; + dst += 16; + x -= 16; + } while (x); + src0 += 2 * (src_stride - max_width); + src1 += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const 
int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // (*) -- useless + // 000 004 008 00C 010 014 018 01C 020 024 028 02C 030 034 038 03C + // 001 005 009 00D 011 015 019 01D 021 025 029 02D 031 035 039 03D + // 002 006 00A 00E 012 016 01A 01E 022 026 02A 02E 032 036 03A 03E (*) + // 003 007 00B 00F 013 017 01B 01F 023 027 02B 02F 033 037 03B 03F (*) + const uint8x16x4_t s0 = vld4q_u8(src0); + // 100 104 108 10C 110 114 118 11C 120 124 128 12C 130 134 138 13C + // 101 105 109 10D 111 115 119 11D 121 125 129 12D 131 135 139 13D + // 102 106 10A 10E 112 116 11A 11E 122 126 12A 12E 132 136 13A 13E (*) + // 103 107 10B 10F 113 117 11B 11F 123 127 12B 12F 133 137 13B 13F (*) + const uint8x16x4_t s1 = vld4q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 64; + src1 += 64; + dst += 16; + x -= 16; + } while (x); + src0 += 4 * (src_stride - max_width); + src1 += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[14], d[4]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + // Note: processing 4x8 is about 20% faster than processing row by row using + // 
vld4_u8(). + do { + load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + transpose_elems_inplace_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71 + d[2] = scale_filter_8(&s[4], filters); // 02 12 22 32 42 52 62 72 + d[3] = scale_filter_8(&s[6], filters); // 03 13 23 33 43 53 63 73 + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + transpose_elems_inplace_u8_8x4(&d[0], &d[1], &d[2], &d[3]); + vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]), + 0); + vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]), + 0); + vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]), + 0); + vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]), + 0); + vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]), + 1); + vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]), + 1); + vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]), + 1); + vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]), + 1); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + t += 4; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 6 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[6], 
&s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[2], filters); // 10 11 12 13 14 15 16 17 + d[2] = scale_filter_8(&s[4], filters); // 20 21 22 23 24 25 26 27 + d[3] = scale_filter_8(&s[6], filters); // 30 31 32 33 34 35 36 37 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[12], d[2]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + // Note: processing 2x8 is about 20% faster than processing row by row using + // vld4_u8(). 
+ do { + load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_elems_u8_4x8(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], + &s[0], &s[1], &s[2], &s[3]); + x = width_hor; + + do { + uint8x8x2_t dd; + src += 8; + load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + transpose_elems_inplace_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71 + // dd.val[0]: 00 01 20 21 40 41 60 61 + // dd.val[1]: 10 11 30 31 50 51 70 71 + dd = vtrn_u8(d[0], d[1]); + vst1_lane_u16((uint16_t *)(t + 0 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 0); + vst1_lane_u16((uint16_t *)(t + 1 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 0); + vst1_lane_u16((uint16_t *)(t + 2 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 1); + vst1_lane_u16((uint16_t *)(t + 3 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 1); + vst1_lane_u16((uint16_t *)(t + 4 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 2); + vst1_lane_u16((uint16_t *)(t + 5 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 2); + vst1_lane_u16((uint16_t *)(t + 6 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 3); + vst1_lane_u16((uint16_t *)(t + 7 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 3); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + t += 2; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]); + t += 4 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[4], filters); // 10 11 12 13 14 15 16 17 + 
vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, + const uint8x8_t *const coef) { + const uint16x8_t h0 = vmull_u8(s[0], coef[0]); + const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]); + + return vrshrn_n_u16(h1, 7); +} + +// Notes for 4 to 3 scaling: +// +// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be +// multiple of 6, and no less than w. +// +// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be +// multiple of 8, and no less than w. +// +// 3. 8 columns are calculated in each horizontal inner loop for further +// vertical scaling, so height_hor must be multiple of 8, and no less than +// 4 * h / 3. +// +// 4. 6 columns are calculated in each vertical inner loop, so height_ver must +// be multiple of 6, and no less than h. +// +// 5. The physical location of the last row of the 4 to 3 scaled frame is +// decided by phase_scaler, and are always less than 1 pixel below the last row +// of the original image. +static void scale_plane_4_to_3_bilinear(const uint8_t *src, + const int src_stride, uint8_t *dst, + const int dst_stride, const int w, + const int h, const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We only need 1 extra row below because there are only 2 bilinear + // coefficients. 
+ const int height_hor = (4 * h / 3 + 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[9], d[8], c[6]; + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[BILINEAR].filter_ptr; + assert(w && h); + + c[0] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][3]); + c[1] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][4]); + c[2] = vdup_n_u8( + (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][3]); + c[3] = vdup_n_u8( + (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][4]); + c[4] = vdup_n_u8( + (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][3]); + c[5] = vdup_n_u8( + (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][4]); + + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + src += 1; + transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + x = width_hor; + + do { + load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + src += 8; + transpose_elems_inplace_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + 
// 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], + &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3 - 1; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + t += 8 * stride_hor; + + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 1); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int 
dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + const int16x8_t filters0 = vld1q_s16( + (const int16_t *)&coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters1 = vld1q_s16( + (const int16_t *)&coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters2 = vld1q_s16( + (const int16_t *)&coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[15], d[8]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2; + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + transpose_elems_inplace_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13], &s[14]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + 
((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], + &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 7 * stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + t += 8 * stride_hor; + + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + s[1] = 
s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 7); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +// There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling in NEON. +static INLINE bool has_normative_scaler_neon(const int src_width, + const int src_height, + const int dst_width, + const int dst_height) { + const bool has_normative_scaler = + (2 * dst_width == src_width && 2 * dst_height == src_height) || + (4 * dst_width == src_width && 4 * dst_height == src_height) || + (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height); + + return has_normative_scaler; +} + +void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + const InterpFilter filter, + const int phase, const int num_planes) { + bool has_normative_scaler = + has_normative_scaler_neon(src->y_crop_width, src->y_crop_height, + dst->y_crop_width, dst->y_crop_height); + + if (num_planes > 1) { + has_normative_scaler = + has_normative_scaler && + has_normative_scaler_neon(src->uv_crop_width, src->uv_crop_height, + dst->uv_crop_width, dst->uv_crop_height); + } + + if (!has_normative_scaler) { + av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); + return; + } + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. 
+ int malloc_failed = 0; + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; + const int src_w = src->crop_widths[is_uv]; + const int src_h = src->crop_heights[is_uv]; + const int dst_w = dst->crop_widths[is_uv]; + const int dst_h = dst->crop_heights[is_uv]; + const int dst_y_w = (dst->crop_widths[0] + 1) & ~1; + const int dst_y_h = (dst->crop_heights[0] + 1) & ~1; + + if (2 * dst_w == src_w && 2 * dst_h == src_h) { + if (phase == 0) { + scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h); + } else if (filter == BILINEAR) { + const int16_t c0 = av1_bilinear_filters[phase][3]; + const int16_t c1 = av1_bilinear_filters[phase][4]; + scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, c0, c1); + } else { + const int buffer_stride = (dst_y_w + 3) & ~3; + const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel[phase], temp_buffer); + free(temp_buffer); + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + if (phase == 0) { + scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h); + } else if (filter == BILINEAR) { + const int16_t c0 = av1_bilinear_filters[phase][3]; + const int16_t c1 = av1_bilinear_filters[phase][4]; + scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, c0, c1); + } else { + const int buffer_stride = (dst_y_w + 1) & ~1; + const int 
buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel[phase], temp_buffer); + free(temp_buffer); + } + } else { + assert(4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h); + // 4 to 3 + const int buffer_stride = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2; + const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + if (filter == BILINEAR) { + scale_plane_4_to_3_bilinear(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, phase, temp_buffer); + } else { + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel, phase, temp_buffer); + } + free(temp_buffer); + } + } + + if (malloc_failed) { + av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); + } else { + aom_extend_frame_borders(dst, num_planes); + } +} + +static INLINE void scaledconvolve_horiz_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + + src -= SUBPEL_TAPS / 2 - 1; + + y = h; + do { + int x_q4 = x0_q4; + x = 0; + do { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 
>> SUBPEL_BITS]; + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + int16x8_t ss[4]; + int16x4_t t[8], tt; + + load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); + transpose_elems_inplace_u8_8x4(&s[0], &s[1], &s[2], &s[3]); + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + t[0] = vget_low_s16(ss[0]); + t[1] = vget_low_s16(ss[1]); + t[2] = vget_low_s16(ss[2]); + t[3] = vget_low_s16(ss[3]); + t[4] = vget_high_s16(ss[0]); + t[5] = vget_high_s16(ss[1]); + t[6] = vget_high_s16(ss[2]); + t[7] = vget_high_s16(ss[3]); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], + filters); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + store_u8_4x1(&temp[4 * z], d); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + { + const uint8x8x4_t d4 = vld4_u8(temp); + store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0]); + store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1]); + store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2]); + store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3]); + } + x += 4; + } while (x < w); + + src += src_stride * 4; + dst += dst_stride * 4; + y -= 4; + } while (y > 0); +} + +static INLINE void scaledconvolve_horiz_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. 
+ y = (h + 7) & ~7; + + do { + int x_q4 = x0_q4; + x = 0; + do { + uint8x8_t d[8]; + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8]; + load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], + &s[5], &s[6], &s[7]); + transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], + &s[5], &s[6], &s[7]); + d[0] = scale_filter_8(s, filters); + vst1_u8(&temp[8 * z], d[0]); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], + &d[6], &d[7]); + store_u8_8x8(dst + x, dst_stride, d[0], d[1], d[2], d[3], d[4], d[5], + d[6], d[7]); + x += 8; + } while (x < w); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static INLINE void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + int16x4_t t[8], tt; + + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); + t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); + t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); + t[3] = 
vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); + t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); + t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); + t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); + t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + store_u8_4x1(dst, d); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + d = scale_filter_8(s, filters); + vst1_u8(dst, d); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int x, y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + x = 0; + do { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x16_t ss[8]; + uint8x8_t s[8], d[2]; + load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], 
&ss[3], &ss[4], + &ss[5], &ss[6], &ss[7]); + s[0] = vget_low_u8(ss[0]); + s[1] = vget_low_u8(ss[1]); + s[2] = vget_low_u8(ss[2]); + s[3] = vget_low_u8(ss[3]); + s[4] = vget_low_u8(ss[4]); + s[5] = vget_low_u8(ss[5]); + s[6] = vget_low_u8(ss[6]); + s[7] = vget_low_u8(ss[7]); + d[0] = scale_filter_8(s, filters); + + s[0] = vget_high_u8(ss[0]); + s[1] = vget_high_u8(ss[1]); + s[2] = vget_high_u8(ss[2]); + s[3] = vget_high_u8(ss[3]); + s[4] = vget_high_u8(ss[4]); + s[5] = vget_high_u8(ss[5]); + s[6] = vget_high_u8(ss[6]); + s[7] = vget_high_u8(ss[7]); + d[1] = scale_filter_8(s, filters); + vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); + src_y += 16; + x += 16; + } while (x < w); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. 
Since w and h are at most 16, the temp buffer is still + // big enough. + DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } +} diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c new file mode 100644 index 0000000000..1d3a3cc038 --- /dev/null +++ b/third_party/aom/av1/common/arm/selfguided_neon.c @@ -0,0 +1,1595 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/common.h" +#include "av1/common/resize.h" +#include "av1/common/restoration.h" + +// Constants used for right shift in final_filter calculation. +#define NB_EVEN 5 +#define NB_ODD 4 + +static INLINE void calc_ab_fast_internal_common( + uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4, + uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5, + int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec, + uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec, + uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2, + const int buf_stride) { + uint32x4_t q0, q1, q2, q3; + uint32x4_t p0, p1, p2, p3; + uint16x4_t d0, d1, d2, d3; + + s0 = vmulq_u32(s0, const_n_val); + s1 = vmulq_u32(s1, const_n_val); + s2 = vmulq_u32(s2, const_n_val); + s3 = vmulq_u32(s3, const_n_val); + + q0 = vmulq_u32(s4, s4); + q1 = vmulq_u32(s5, s5); + q2 = vmulq_u32(s6, s6); + q3 = vmulq_u32(s7, s7); + + p0 = vcleq_u32(q0, s0); + p1 = vcleq_u32(q1, s1); + p2 = vcleq_u32(q2, s2); + p3 = vcleq_u32(q3, s3); + + q0 = vsubq_u32(s0, q0); + q1 = vsubq_u32(s1, q1); + q2 = vsubq_u32(s2, q2); + q3 = vsubq_u32(s3, q3); + + p0 = vandq_u32(p0, q0); + p1 = vandq_u32(p1, q1); + p2 = vandq_u32(p2, q2); + p3 = vandq_u32(p3, q3); + + p0 = vmulq_u32(p0, s_vec); + p1 = vmulq_u32(p1, s_vec); + p2 = vmulq_u32(p2, s_vec); + p3 = vmulq_u32(p3, s_vec); + + p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS); + p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS); + p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS); + p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS); + + p0 = vminq_u32(p0, const_val); + p1 = vminq_u32(p1, 
const_val); + p2 = vminq_u32(p2, const_val); + p3 = vminq_u32(p3, const_val); + + { + store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3); + + for (int x = 0; x < 4; x++) { + for (int y = 0; y < 4; y++) { + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; + } + } + load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3); + } + p0 = vsubl_u16(sgrproj_sgr, d0); + p1 = vsubl_u16(sgrproj_sgr, d1); + p2 = vsubl_u16(sgrproj_sgr, d2); + p3 = vsubl_u16(sgrproj_sgr, d3); + + s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec); + s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec); + s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec); + s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec); + + s4 = vmulq_u32(s4, p0); + s5 = vmulq_u32(s5, p1); + s6 = vmulq_u32(s6, p2); + s7 = vmulq_u32(s7, p3); + + p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS); + p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS); + p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS); + p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS); + + store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0), + vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2), + vreinterpretq_s32_u32(p3)); +} +static INLINE void calc_ab_internal_common( + uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4, + uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0, + uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4, + uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7, + uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val, + uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1, + uint16_t *dst_A16, int32_t *dst2, const int buf_stride) { + uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7; + uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7; + uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7; + + s0 = vmulq_u32(s0, const_n_val); + s1 = vmulq_u32(s1, const_n_val); + s2 = vmulq_u32(s2, const_n_val); + s3 = vmulq_u32(s3, 
const_n_val); + s4 = vmulq_u32(s4, const_n_val); + s5 = vmulq_u32(s5, const_n_val); + s6 = vmulq_u32(s6, const_n_val); + s7 = vmulq_u32(s7, const_n_val); + + d0 = vget_low_u16(s16_4); + d1 = vget_low_u16(s16_5); + d2 = vget_low_u16(s16_6); + d3 = vget_low_u16(s16_7); + d4 = vget_high_u16(s16_4); + d5 = vget_high_u16(s16_5); + d6 = vget_high_u16(s16_6); + d7 = vget_high_u16(s16_7); + + q0 = vmull_u16(d0, d0); + q1 = vmull_u16(d1, d1); + q2 = vmull_u16(d2, d2); + q3 = vmull_u16(d3, d3); + q4 = vmull_u16(d4, d4); + q5 = vmull_u16(d5, d5); + q6 = vmull_u16(d6, d6); + q7 = vmull_u16(d7, d7); + + p0 = vcleq_u32(q0, s0); + p1 = vcleq_u32(q1, s1); + p2 = vcleq_u32(q2, s2); + p3 = vcleq_u32(q3, s3); + p4 = vcleq_u32(q4, s4); + p5 = vcleq_u32(q5, s5); + p6 = vcleq_u32(q6, s6); + p7 = vcleq_u32(q7, s7); + + q0 = vsubq_u32(s0, q0); + q1 = vsubq_u32(s1, q1); + q2 = vsubq_u32(s2, q2); + q3 = vsubq_u32(s3, q3); + q4 = vsubq_u32(s4, q4); + q5 = vsubq_u32(s5, q5); + q6 = vsubq_u32(s6, q6); + q7 = vsubq_u32(s7, q7); + + p0 = vandq_u32(p0, q0); + p1 = vandq_u32(p1, q1); + p2 = vandq_u32(p2, q2); + p3 = vandq_u32(p3, q3); + p4 = vandq_u32(p4, q4); + p5 = vandq_u32(p5, q5); + p6 = vandq_u32(p6, q6); + p7 = vandq_u32(p7, q7); + + p0 = vmulq_u32(p0, s_vec); + p1 = vmulq_u32(p1, s_vec); + p2 = vmulq_u32(p2, s_vec); + p3 = vmulq_u32(p3, s_vec); + p4 = vmulq_u32(p4, s_vec); + p5 = vmulq_u32(p5, s_vec); + p6 = vmulq_u32(p6, s_vec); + p7 = vmulq_u32(p7, s_vec); + + p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS); + p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS); + p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS); + p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS); + p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS); + p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS); + p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS); + p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS); + + p0 = vminq_u32(p0, const_val); + p1 = vminq_u32(p1, const_val); + p2 = vminq_u32(p2, const_val); + p3 = vminq_u32(p3, const_val); + p4 = vminq_u32(p4, const_val); + p5 
= vminq_u32(p5, const_val); + p6 = vminq_u32(p6, const_val); + p7 = vminq_u32(p7, const_val); + + { + store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3); + store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7); + + for (int x = 0; x < 4; x++) { + for (int y = 0; y < 8; y++) { + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; + } + } + load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7); + } + + s16_4 = vsubq_u16(sgrproj_sgr, s16_4); + s16_5 = vsubq_u16(sgrproj_sgr, s16_5); + s16_6 = vsubq_u16(sgrproj_sgr, s16_6); + s16_7 = vsubq_u16(sgrproj_sgr, s16_7); + + s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec); + s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec); + s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec); + s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec); + s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec); + s5 = vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec); + s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec); + s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec); + + s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4))); + s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5))); + s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6))); + s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7))); + s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4))); + s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5))); + s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6))); + s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7))); + + p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS); + p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS); + p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS); + p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS); + p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS); + p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS); + p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS); + p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS); + + store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0), + 
vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2), + vreinterpretq_s32_u32(p3)); + store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4), + vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6), + vreinterpretq_s32_u32(p7)); +} + +static INLINE void boxsum2_square_sum_calc( + int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5, + int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10, + int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) { + int32x4_t d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; + int32x4_t r12, r34, r67, r89, r1011; + int32x4_t r345, r6789, r789; + + d1 = vmull_s16(t1, t1); + d2 = vmull_s16(t2, t2); + d3 = vmull_s16(t3, t3); + d4 = vmull_s16(t4, t4); + d5 = vmull_s16(t5, t5); + d6 = vmull_s16(t6, t6); + d7 = vmull_s16(t7, t7); + d8 = vmull_s16(t8, t8); + d9 = vmull_s16(t9, t9); + d10 = vmull_s16(t10, t10); + d11 = vmull_s16(t11, t11); + + r12 = vaddq_s32(d1, d2); + r34 = vaddq_s32(d3, d4); + r67 = vaddq_s32(d6, d7); + r89 = vaddq_s32(d8, d9); + r1011 = vaddq_s32(d10, d11); + r345 = vaddq_s32(r34, d5); + r6789 = vaddq_s32(r67, r89); + r789 = vsubq_s32(r6789, d6); + *r0 = vaddq_s32(r12, r345); + *r1 = vaddq_s32(r67, r345); + *r2 = vaddq_s32(d5, r6789); + *r3 = vaddq_s32(r789, r1011); +} + +static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, + int32_t *dst32, int32_t *dst2, const int dst_stride, + const int width, const int height) { + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + int16_t *dst1_16_ptr, *src_ptr; + int32_t *dst2_ptr; + int h, w, count = 0; + const int dst_stride_2 = (dst_stride << 1); + const int dst_stride_8 = (dst_stride << 3); + + dst1_16_ptr = dst16; + dst2_ptr = dst2; + src_ptr = src; + w = width; + { + int16x8_t t1, t2, t3, t4, t5, t6, t7; + int16x8_t t8, t9, t10, t11, t12; + + int16x8_t q12345, q56789, q34567, q7891011; + int16x8_t q12, q34, q67, q89, q1011; + int16x8_t q345, q6789, q789; + + 
int32x4_t r12345, r56789, r34567, r7891011; + + do { + h = height; + dst1_16_ptr = dst16 + (count << 3); + dst2_ptr = dst2 + (count << 3); + src_ptr = src + (count << 3); + + dst1_16_ptr += dst_stride_2; + dst2_ptr += dst_stride_2; + do { + load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); + src_ptr += 4 * src_stride; + load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8); + src_ptr += 4 * src_stride; + load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12); + + q12 = vaddq_s16(t1, t2); + q34 = vaddq_s16(t3, t4); + q67 = vaddq_s16(t6, t7); + q89 = vaddq_s16(t8, t9); + q1011 = vaddq_s16(t10, t11); + q345 = vaddq_s16(q34, t5); + q6789 = vaddq_s16(q67, q89); + q789 = vaddq_s16(q89, t7); + q12345 = vaddq_s16(q12, q345); + q34567 = vaddq_s16(q67, q345); + q56789 = vaddq_s16(t5, q6789); + q7891011 = vaddq_s16(q789, q1011); + + store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789, + q7891011); + dst1_16_ptr += dst_stride_8; + + boxsum2_square_sum_calc( + vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3), + vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6), + vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9), + vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789, + &r7891011); + + store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011); + + boxsum2_square_sum_calc( + vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3), + vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6), + vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9), + vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789, + &r7891011); + + store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789, + r7891011); + dst2_ptr += (dst_stride_8); + h -= 8; + } while (h > 0); + w -= 8; + count++; + } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst16, dst2 buffer which is not filled in first stage. 
+ for (int x = 0; x < 2; x++) { + memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst16, dst2 buffer which is not filled in first stage. + for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } + } + + { + int16x4_t s1, s2, s3, s4, s5, s6, s7, s8; + int32x4_t d1, d2, d3, d4, d5, d6, d7, d8; + int32x4_t q12345, q34567, q23456, q45678; + int32x4_t q23, q45, q67; + int32x4_t q2345, q4567; + + int32x4_t r12345, r34567, r23456, r45678; + int32x4_t r23, r45, r67; + int32x4_t r2345, r4567; + + int32_t *src2_ptr, *dst1_32_ptr; + int16_t *src1_ptr; + count = 0; + h = height; + do { + dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2); + dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); + src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2); + src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); + w = width; + + dst1_32_ptr += 2; + dst2_ptr += 2; + load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4); + transpose_elems_inplace_s16_4x4(&s1, &s2, &s3, &s4); + load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4); + transpose_elems_inplace_s32_4x4(&d1, &d2, &d3, &d4); + do { + src1_ptr += 4; + src2_ptr += 4; + load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8); + transpose_elems_inplace_s16_4x4(&s5, &s6, &s7, &s8); + load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8); + transpose_elems_inplace_s32_4x4(&d5, &d6, &d7, &d8); + q23 = vaddl_s16(s2, s3); + q45 = vaddl_s16(s4, s5); + q67 = vaddl_s16(s6, s7); + q2345 = vaddq_s32(q23, q45); + q4567 = vaddq_s32(q45, q67); + q12345 = vaddq_s32(vmovl_s16(s1), q2345); + q23456 = vaddq_s32(q2345, vmovl_s16(s6)); + q34567 = vaddq_s32(q4567, vmovl_s16(s3)); + q45678 = vaddq_s32(q4567, 
vmovl_s16(s8)); + + transpose_elems_inplace_s32_4x4(&q12345, &q23456, &q34567, &q45678); + store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567, + q45678); + dst1_32_ptr += 4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + + r23 = vaddq_s32(d2, d3); + r45 = vaddq_s32(d4, d5); + r67 = vaddq_s32(d6, d7); + r2345 = vaddq_s32(r23, r45); + r4567 = vaddq_s32(r45, r67); + r12345 = vaddq_s32(d1, r2345); + r23456 = vaddq_s32(r2345, d6); + r34567 = vaddq_s32(r4567, d3); + r45678 = vaddq_s32(r4567, d8); + + transpose_elems_inplace_s32_4x4(&r12345, &r23456, &r34567, &r45678); + store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678); + dst2_ptr += 4; + d1 = d5; + d2 = d6; + d3 = d7; + d4 = d8; + w -= 4; + } while (w > 0); + h -= 8; + count++; + } while (h > 0); + } +} + +static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, + uint16_t *B16, int32_t *B, + const int buf_stride, const int width, + const int height, const int r, + const int s, const int ht_inc) { + int32_t *src1, *dst2, count = 0; + uint16_t *dst_A16, *src2; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7; + + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + dst_A16 = A16 + (count << 2) * buf_stride; + src1 = A + (count << 2) * buf_stride; + src2 = B16 + (count << 2) * buf_stride; + dst2 = B + (count << 2) * buf_stride; + w = width; + do { + load_u32_4x4((uint32_t *)src1, buf_stride, &s0, &s1, &s2, &s3); + load_u32_4x4((uint32_t *)src1 + 4, buf_stride, &s4, &s5, &s6, &s7); + load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); + + s16_4 = s16_0; + s16_5 = s16_1; + s16_6 = s16_2; + s16_7 = s16_3; + + 
calc_ab_internal_common( + s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, + s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); + + w -= 8; + dst2 += 8; + src1 += 8; + src2 += 8; + dst_A16 += 8; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, + uint16_t *B16, int32_t *B, + const int buf_stride, const int width, + const int height, const int bit_depth, + const int r, const int s, + const int ht_inc) { + int32_t *src1, *dst2, count = 0; + uint16_t *dst_A16, *src2; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const int16x8_t bd_min_2_vec = vdupq_n_s16(-(bit_depth - 8)); + const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint16x8_t s16_0, s16_1, s16_2, s16_3; + uint16x8_t s16_4, s16_5, s16_6, s16_7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B16 + (count << 2) * buf_stride; + dst2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src1 + 4, buf_stride, &sr4, &sr5, &sr6, &sr7); + load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); + + s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec); + s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); + s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); + s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); + s4 = 
vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_1_vec); + s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_1_vec); + s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_1_vec); + s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_1_vec); + + s16_4 = vrshlq_u16(s16_0, bd_min_2_vec); + s16_5 = vrshlq_u16(s16_1, bd_min_2_vec); + s16_6 = vrshlq_u16(s16_2, bd_min_2_vec); + s16_7 = vrshlq_u16(s16_3, bd_min_2_vec); + + calc_ab_internal_common( + s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, + s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); + + w -= 8; + dst2 += 8; + src1 += 8; + src2 += 8; + dst_A16 += 8; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, + int32_t *B, const int buf_stride, + const int width, const int height, + const int r, const int s, + const int ht_inc) { + int32_t *src1, *src2, count = 0; + uint16_t *dst_A16; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); + + s0 = vreinterpretq_u32_s32(sr0); + s1 = vreinterpretq_u32_s32(sr1); + s2 = vreinterpretq_u32_s32(sr2); + s3 = vreinterpretq_u32_s32(sr3); + s4 = vreinterpretq_u32_s32(sr4); + s5 = vreinterpretq_u32_s32(sr5); + s6 = 
vreinterpretq_u32_s32(sr6); + s7 = vreinterpretq_u32_s32(sr7); + + calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, + sr6, sr7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, + dst_A16, src2, buf_stride); + + w -= 4; + src1 += 4; + src2 += 4; + dst_A16 += 4; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, + int32_t *B, const int buf_stride, + const int width, const int height, + const int bit_depth, const int r, + const int s, const int ht_inc) { + int32_t *src1, *src2, count = 0; + uint16_t *dst_A16; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const int32x4_t bd_min_2_vec = vdupq_n_s32(-(bit_depth - 8)); + const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); + + s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec); + s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); + s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); + s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); + s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_2_vec); + s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_2_vec); + s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_2_vec); + s7 = 
vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_2_vec); + + calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, + sr6, sr7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, + dst_A16, src2, buf_stride); + + w -= 4; + src1 += 4; + src2 += 4; + dst_A16 += 4; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, + int32_t *dst2, const int dst_stride, const int width, + const int height) { + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + int16_t *src_ptr; + int32_t *dst2_ptr; + uint16_t *dst1_ptr; + int h, w, count = 0; + + w = width; + { + int16x8_t s1, s2, s3, s4, s5, s6, s7, s8; + int16x8_t q23, q34, q56, q234, q345, q456, q567; + int32x4_t r23, r56, r345, r456, r567, r78, r678; + int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high; + int32x4_t r2, r3, r5, r6, r7, r8; + int16x8_t q678, q78; + + do { + dst1_ptr = dst1 + (count << 3); + dst2_ptr = dst2 + (count << 3); + src_ptr = src + (count << 3); + h = height; + + load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); + src_ptr += 4 * src_stride; + + q23 = vaddq_s16(s2, s3); + q234 = vaddq_s16(q23, s4); + q34 = vaddq_s16(s3, s4); + dst1_ptr += (dst_stride << 1); + + r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2)); + r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3)); + r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4)); + r23 = vaddq_s32(r2, r3); + r234_low = vaddq_s32(r23, r4_low); + r34_low = vaddq_s32(r3, r4_low); + + r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2)); + r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3)); + r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4)); + r23 = vaddq_s32(r2, r3); + r234_high = vaddq_s32(r23, r4_high); + r34_high = vaddq_s32(r3, r4_high); + + dst2_ptr += (dst_stride << 1); + + do { + load_s16_8x4(src_ptr, src_stride, &s5, &s6, 
&s7, &s8); + src_ptr += 4 * src_stride; + + q345 = vaddq_s16(s5, q34); + q56 = vaddq_s16(s5, s6); + q456 = vaddq_s16(s4, q56); + q567 = vaddq_s16(s7, q56); + q78 = vaddq_s16(s7, s8); + q678 = vaddq_s16(s6, q78); + + store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); + dst1_ptr += (dst_stride << 2); + + s4 = s8; + q34 = q78; + q234 = q678; + + r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5)); + r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6)); + r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7)); + r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8)); + + r345 = vaddq_s32(r5, r34_low); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4_low, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567); + + r4_low = r8; + r34_low = r78; + r234_low = r678; + + r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5)); + r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6)); + r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7)); + r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8)); + + r345 = vaddq_s32(r5, r34_high); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4_high, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567); + dst2_ptr += (dst_stride << 2); + + r4_high = r8; + r34_high = r78; + r234_high = r678; + + h -= 4; + } while (h > 0); + w -= 8; + count++; + } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst1, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst1, dst2 buffer which is not filled in first stage. 
+ for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } + } + + { + int16x4_t d1, d2, d3, d4, d5, d6, d7, d8; + int16x4_t q23, q34, q56, q234, q345, q456, q567; + int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678; + int32x4_t r1, r2, r3, r4, r5, r6, r7, r8; + int16x4_t q678, q78; + + int32_t *src2_ptr; + uint16_t *src1_ptr; + count = 0; + h = height; + w = width; + do { + dst1_ptr = dst1 + (count << 2) * dst_stride; + dst2_ptr = dst2 + (count << 2) * dst_stride; + src1_ptr = dst1 + (count << 2) * dst_stride; + src2_ptr = dst2 + (count << 2) * dst_stride; + w = width; + + load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4); + transpose_elems_inplace_s16_4x4(&d1, &d2, &d3, &d4); + load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4); + transpose_elems_inplace_s32_4x4(&r1, &r2, &r3, &r4); + src1_ptr += 4; + src2_ptr += 4; + + q23 = vadd_s16(d2, d3); + q234 = vadd_s16(q23, d4); + q34 = vadd_s16(d3, d4); + dst1_ptr += 2; + r23 = vaddq_s32(r2, r3); + r234 = vaddq_s32(r23, r4); + r34 = vaddq_s32(r3, r4); + dst2_ptr += 2; + + do { + load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8); + transpose_elems_inplace_s16_4x4(&d5, &d6, &d7, &d8); + load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8); + transpose_elems_inplace_s32_4x4(&r5, &r6, &r7, &r8); + src1_ptr += 4; + src2_ptr += 4; + + q345 = vadd_s16(d5, q34); + q56 = vadd_s16(d5, d6); + q456 = vadd_s16(d4, q56); + q567 = vadd_s16(d7, q56); + q78 = vadd_s16(d7, d8); + q678 = vadd_s16(d6, q78); + transpose_elems_inplace_s16_4x4(&q234, &q345, &q456, &q567); + store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); + dst1_ptr += 4; + + d4 = d8; + q34 = q78; + q234 = q678; + + r345 = vaddq_s32(r5, r34); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, 
r78); + transpose_elems_inplace_s32_4x4(&r234, &r345, &r456, &r567); + store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567); + dst2_ptr += 4; + + r4 = r8; + r34 = r78; + r234 = r678; + w -= 4; + } while (w > 0); + h -= 4; + count++; + } while (h > 0); + } +} + +static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) { + int32x4_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; + int32x4_t fours, threes, res; + + xtl = vld1q_s32(buf - buf_stride - 1); + xt = vld1q_s32(buf - buf_stride); + xtr = vld1q_s32(buf - buf_stride + 1); + xl = vld1q_s32(buf - 1); + x = vld1q_s32(buf); + xr = vld1q_s32(buf + 1); + xbl = vld1q_s32(buf + buf_stride - 1); + xb = vld1q_s32(buf + buf_stride); + xbr = vld1q_s32(buf + buf_stride + 1); + + fours = vaddq_s32(xl, vaddq_s32(xt, vaddq_s32(xr, vaddq_s32(xb, x)))); + threes = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); + res = vsubq_s32(vshlq_n_s32(vaddq_s32(fours, threes), 2), threes); + return res; +} + +static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride, + int32x4_t *a0, int32x4_t *a1) { + uint16x8_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; + uint16x8_t r0, r1; + + xtl = vld1q_u16(buf - buf_stride - 1); + xt = vld1q_u16(buf - buf_stride); + xtr = vld1q_u16(buf - buf_stride + 1); + xl = vld1q_u16(buf - 1); + x = vld1q_u16(buf); + xr = vld1q_u16(buf + 1); + xbl = vld1q_u16(buf + buf_stride - 1); + xb = vld1q_u16(buf + buf_stride); + xbr = vld1q_u16(buf + buf_stride + 1); + + xb = vaddq_u16(xb, x); + xt = vaddq_u16(xt, xr); + xl = vaddq_u16(xl, xb); + xl = vaddq_u16(xl, xt); + + r0 = vshlq_n_u16(xl, 2); + + xbl = vaddq_u16(xbl, xbr); + xtl = vaddq_u16(xtl, xtr); + xtl = vaddq_u16(xtl, xbl); + + r1 = vshlq_n_u16(xtl, 2); + r1 = vsubq_u16(r1, xtl); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(r0)), vmovl_u16(vget_low_u16(r1)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(r0)), vmovl_u16(vget_high_u16(r1)))); +} + +static INLINE int32x4_t 
cross_sum_fast_even_row(int32_t *buf, int buf_stride) { + int32x4_t xtr, xt, xtl, xbr, xb, xbl; + int32x4_t fives, sixes, fives_plus_sixes; + + xtl = vld1q_s32(buf - buf_stride - 1); + xt = vld1q_s32(buf - buf_stride); + xtr = vld1q_s32(buf - buf_stride + 1); + xbl = vld1q_s32(buf + buf_stride - 1); + xb = vld1q_s32(buf + buf_stride); + xbr = vld1q_s32(buf + buf_stride + 1); + + fives = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); + sixes = vaddq_s32(xt, xb); + fives_plus_sixes = vaddq_s32(fives, sixes); + + return vaddq_s32( + vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); +} + +static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride, + int32x4_t *a0, int32x4_t *a1) { + uint16x8_t xtr, xt, xtl, xbr, xb, xbl, xb0; + + xtl = vld1q_u16(buf - buf_stride - 1); + xt = vld1q_u16(buf - buf_stride); + xtr = vld1q_u16(buf - buf_stride + 1); + xbl = vld1q_u16(buf + buf_stride - 1); + xb = vld1q_u16(buf + buf_stride); + xbr = vld1q_u16(buf + buf_stride + 1); + + xbr = vaddq_u16(xbr, xbl); + xtr = vaddq_u16(xtr, xtl); + xbr = vaddq_u16(xbr, xtr); + xtl = vshlq_n_u16(xbr, 2); + xbr = vaddq_u16(xtl, xbr); + + xb = vaddq_u16(xb, xt); + xb0 = vshlq_n_u16(xb, 1); + xb = vshlq_n_u16(xb, 2); + xb = vaddq_u16(xb, xb0); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(xbr)), vmovl_u16(vget_low_u16(xb)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(xbr)), vmovl_u16(vget_high_u16(xb)))); +} + +static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) { + int32x4_t xl, x, xr; + int32x4_t fives, sixes, fives_plus_sixes; + + xl = vld1q_s32(buf - 1); + x = vld1q_s32(buf); + xr = vld1q_s32(buf + 1); + fives = vaddq_s32(xl, xr); + sixes = x; + fives_plus_sixes = vaddq_s32(fives, sixes); + + return vaddq_s32( + vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); +} + +static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0, + int32x4_t *a1) { + uint16x8_t 
xl, x, xr; + uint16x8_t x0; + + xl = vld1q_u16(buf - 1); + x = vld1q_u16(buf); + xr = vld1q_u16(buf + 1); + xl = vaddq_u16(xl, xr); + x0 = vshlq_n_u16(xl, 2); + xl = vaddq_u16(xl, x0); + + x0 = vshlq_n_u16(x, 1); + x = vshlq_n_u16(x, 2); + x = vaddq_u16(x, x0); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(xl)), vmovl_u16(vget_low_u16(x)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x)))); +} + +static void final_filter_fast_internal(uint16_t *A, int32_t *B, + const int buf_stride, int16_t *src, + const int src_stride, int32_t *dst, + const int dst_stride, const int width, + const int height) { + int16x8_t s0; + int32_t *B_tmp, *dst_ptr; + uint16_t *A_tmp; + int16_t *src_ptr; + int32x4_t a_res0, a_res1, b_res0, b_res1; + int w, h, count = 0; + assert(SGRPROJ_SGR_BITS == 8); + assert(SGRPROJ_RST_BITS == 4); + + A_tmp = A; + B_tmp = B; + src_ptr = src; + dst_ptr = dst; + h = height; + do { + A_tmp = (A + count * buf_stride); + B_tmp = (B + count * buf_stride); + src_ptr = (src + count * src_stride); + dst_ptr = (dst + count * dst_stride); + w = width; + if (!(count & 1)) { + do { + s0 = vld1q_s16(src_ptr); + cross_sum_fast_even_row_inp16(A_tmp, buf_stride, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_fast_even_row(B_tmp, buf_stride); + b_res1 = cross_sum_fast_even_row(B_tmp + 4, buf_stride); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } else { + do { + s0 = vld1q_s16(src_ptr); + cross_sum_fast_odd_row_inp16(A_tmp, &a_res0, 
&a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_fast_odd_row(B_tmp); + b_res1 = cross_sum_fast_odd_row(B_tmp + 4); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); + + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } + count++; + h -= 1; + } while (h > 0); +} + +void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, + int16_t *src, const int src_stride, int32_t *dst, + const int dst_stride, const int width, + const int height) { + int16x8_t s0; + int32_t *B_tmp, *dst_ptr; + uint16_t *A_tmp; + int16_t *src_ptr; + int32x4_t a_res0, a_res1, b_res0, b_res1; + int w, h, count = 0; + + assert(SGRPROJ_SGR_BITS == 8); + assert(SGRPROJ_RST_BITS == 4); + h = height; + + do { + A_tmp = (A + count * buf_stride); + B_tmp = (B + count * buf_stride); + src_ptr = (src + count * src_stride); + dst_ptr = (dst + count * dst_stride); + w = width; + do { + s0 = vld1q_s16(src_ptr); + cross_sum_inp_u16(A_tmp, buf_stride, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_inp_s32(B_tmp, buf_stride); + b_res1 = cross_sum_inp_s32(B_tmp + 4, buf_stride); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + count++; + h -= 1; + } 
while (h > 0); +} + +static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, + int height, int dgd_stride, + int32_t *dst, int dst_stride, + int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + const int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *square_sum_buf = A_; + int32_t *sum_buf = B_; + uint16_t *tmp16_buf = A16_; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + assert(radius_idx == 0); + assert(r == 2); + + // input(dgd16) is 16bit. + // sum of pixels 1st stage output will be in 16bit(tmp16_buf). End output is + // kept in 32bit [sum_buf]. sum of squares output is kept in 32bit + // buffer(square_sum_buf). + boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - + SGRPROJ_BORDER_HORZ), + dgd_stride, (int16_t *)tmp16_buf, sum_buf, square_sum_buf, buf_stride, + width_ext, height_ext); + + square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + tmp16_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Calculation of a, b. a output is in 16bit tmp_buf which is in range of + // [1, 256] for all bit depths. b output is kept in 32bit buffer. 
+ +#if CONFIG_AV1_HIGHBITDEPTH + if (bit_depth > 8) { + calc_ab_fast_internal_hbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, + bit_depth, r, params->s[radius_idx], 2); + } else { + calc_ab_fast_internal_lbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, + params->s[radius_idx], 2); + } +#else + (void)bit_depth; + calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1), + (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, + width + 2, height + 2, r, params->s[radius_idx], 2); +#endif + final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16, + dgd_stride, dst, dst_stride, width, height); +} + +static INLINE void restoration_internal(uint16_t *dgd16, int width, int height, + int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; + uint16_t B16_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *square_sum_buf = A_; + uint16_t *sum_buf = B16_; + uint16_t *A16 = A16_; + int32_t *B = B_; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + assert(radius_idx == 1); + assert(r == 1); + + // input(dgd16) is 16bit. + // sum of pixels output will be in 16bit(sum_buf). + // sum of squares output is kept in 32bit buffer(square_sum_buf). 
+ boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - + SGRPROJ_BORDER_HORZ), + dgd_stride, sum_buf, square_sum_buf, buf_stride, width_ext, + height_ext); + + square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + +#if CONFIG_AV1_HIGHBITDEPTH + // Calculation of a, b. a output is in 16bit tmp_buf which is in range of + // [1, 256] for all bit depths. b output is kept in 32bit buffer. + if (bit_depth > 8) { + calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, + height + 2, bit_depth, r, params->s[radius_idx], 1); + } else { + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, + height + 2, r, params->s[radius_idx], 1); + } +#else + (void)bit_depth; + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, height + 2, + r, params->s[radius_idx], 1); +#endif + final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst, + dst_stride, width, height); +} + +static INLINE void src_convert_u8_to_u16(const uint8_t *src, + const int src_stride, uint16_t *dst, + const int dst_stride, const int width, + const int height) { + const uint8_t *src_ptr; + uint16_t *dst_ptr; + int h, w, count = 0; + + uint8x8_t t1, t2, t3, t4; + uint16x8_t s1, s2, s3, s4; + h = height; + do { + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + w = width; + if (w >= 7) { + do { + load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); + s1 = vmovl_u8(t1); + s2 = vmovl_u8(t2); + s3 = vmovl_u8(t3); + s4 = 
vmovl_u8(t4); + store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); + + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 7); + } + + for (int y = 0; y < w; y++) { + dst_ptr[y] = src_ptr[y]; + dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; + dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; + dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; + } + count++; + h -= 4; + } while (h > 3); + + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + for (int x = 0; x < h; x++) { + for (int y = 0; y < width; y++) { + dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride]; + } + } + + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. + for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, + uint16_t *dst, const int dst_stride, + int width, int height) { + const uint16_t *src_ptr; + uint16_t *dst_ptr; + int h, w, count = 0; + uint16x8_t s1, s2, s3, s4; + + h = height; + do { + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + w = width; + do { + load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); + store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 7); + + for (int y = 0; y < w; y++) { + dst_ptr[y] = src_ptr[y]; + dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; + dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; + dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; + } + count++; + h -= 4; + } while (h > 3); + + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + + for (int x = 0; x < h; x++) { + memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride), + sizeof(uint16_t) * width); + } + // memset uninitialized rows of src 
buffer as they are needed for the + // boxsum filter calculation. + for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, + int stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + assert(!(params->r[0] == 0 && params->r[1] == 0)); + + uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; + const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; + uint16_t *dgd16 = + dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + const int dgd_stride = stride; + +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); + src_convert_hbd_copy( + dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } else { + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); +#endif + + if (params->r[0] > 0) + restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, + flt_stride, bit_depth, sgr_params_idx, 0); + if (params->r[1] > 0) + restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride, + bit_depth, sgr_params_idx, 1); + return 0; +} + +int 
av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; + const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; + uint16_t *dgd16 = + dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + const int dgd_stride = stride; + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + + assert(!(params->r[0] == 0 && params->r[1] == 0)); + +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); + src_convert_hbd_copy( + dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } else { + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); +#endif + if (params->r[0] > 0) + restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width, + bit_depth, eps, 0); + if (params->r[1] > 0) + restoration_internal(dgd16, width, height, dgd16_stride, flt1, width, + bit_depth, eps, 1); + + av1_decode_xq(xqd, xq, params); + + { + int16_t *src_ptr; + uint8_t *dst_ptr; +#if CONFIG_AV1_HIGHBITDEPTH + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8); + 
uint16_t *dst16_ptr; +#endif + int16x4_t d0, d4; + int16x8_t r0, s0; + uint16x8_t r4; + int32x4_t u0, u4, v0, v4, f00, f10; + uint8x8_t t0; + int count = 0, w = width, h = height, rc = 0; + + const int32x4_t xq0_vec = vdupq_n_s32(xq[0]); + const int32x4_t xq1_vec = vdupq_n_s32(xq[1]); + const int16x8_t zero = vdupq_n_s16(0); + const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1); + src_ptr = (int16_t *)dgd16; + do { + w = width; + count = 0; + dst_ptr = dst8 + rc * dst_stride; +#if CONFIG_AV1_HIGHBITDEPTH + dst16_ptr = dst16 + rc * dst_stride; +#endif + do { + s0 = vld1q_s16(src_ptr + count); + + u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS); + u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS); + + v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS); + v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS); + + if (params->r[0] > 0) { + f00 = vld1q_s32(flt0 + count); + f10 = vld1q_s32(flt0 + count + 4); + + f00 = vsubq_s32(f00, u0); + f10 = vsubq_s32(f10, u4); + + v0 = vmlaq_s32(v0, xq0_vec, f00); + v4 = vmlaq_s32(v4, xq0_vec, f10); + } + + if (params->r[1] > 0) { + f00 = vld1q_s32(flt1 + count); + f10 = vld1q_s32(flt1 + count + 4); + + f00 = vsubq_s32(f00, u0); + f10 = vsubq_s32(f10, u4); + + v0 = vmlaq_s32(v0, xq1_vec, f00); + v4 = vmlaq_s32(v4, xq1_vec, f10); + } + + d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + r0 = vcombine_s16(d0, d4); + + r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero)); + +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + r4 = vminq_u16(r4, max); + vst1q_u16(dst16_ptr, r4); + dst16_ptr += 8; + } else { + t0 = vqmovn_u16(r4); + vst1_u8(dst_ptr, t0); + dst_ptr += 8; + } +#else + (void)max; + t0 = vqmovn_u16(r4); + vst1_u8(dst_ptr, t0); + dst_ptr += 8; +#endif + w -= 8; + count += 8; + } while (w > 0); + + src_ptr += dgd16_stride; + flt1 += width; + flt0 += width; + rc++; + h--; + } while (h > 0); + } + return 0; +} diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c 
b/third_party/aom/av1/common/arm/warp_plane_neon.c new file mode 100644 index 0000000000..4723154398 --- /dev/null +++ b/third_party/aom/av1/common/arm/warp_plane_neon.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "warp_plane_neon.h" + +static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[4]; + load_filters_4(f, sx, alpha); + + int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); + int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); + + int16x8_t m0 = vmulq_s16(f[0], in16_lo); + int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1)); + int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2)); + int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3)); + + int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), + vpaddlq_s16(m3) }; + + int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[8]; + load_filters_8(f, sx, 
alpha); + + int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); + int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); + + int16x8_t m0 = vmulq_s16(f[0], in16_lo); + int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1)); + int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2)); + int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3)); + int16x8_t m4 = vmulq_s16(f[4], vextq_s16(in16_lo, in16_hi, 4)); + int16x8_t m5 = vmulq_s16(f[5], vextq_s16(in16_lo, in16_hi, 5)); + int16x8_t m6 = vmulq_s16(f[6], vextq_s16(in16_lo, in16_hi, 6)); + int16x8_t m7 = vmulq_s16(f[7], vextq_s16(in16_lo, in16_hi, 7)); + + int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), + vpaddlq_s16(m3) }; + int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6), + vpaddlq_s16(m7) }; + + int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); + int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); + int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); + + int16x8_t m0 = vmulq_s16(f_s16, in16_lo); + int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1)); + int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2)); + int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3)); + + int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), + 
vpaddlq_s16(m3) }; + + int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); + int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); + + int16x8_t m0 = vmulq_s16(f_s16, in16_lo); + int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1)); + int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2)); + int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3)); + int16x8_t m4 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 4)); + int16x8_t m5 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 5)); + int16x8_t m6 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 6)); + int16x8_t m7 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 7)); + + int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), + vpaddlq_s16(m3) }; + int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6), + vpaddlq_s16(m7) }; + + int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); + int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, + int sy) { + int16x4_t s0 = vget_low_s16(src[0]); + int16x4_t s1 = vget_low_s16(src[1]); + int16x4_t s2 = 
vget_low_s16(src[2]); + int16x4_t s3 = vget_low_s16(src[3]); + int16x4_t s4 = vget_low_s16(src[4]); + int16x4_t s5 = vget_low_s16(src[5]); + int16x4_t s6 = vget_low_s16(src[6]); + int16x4_t s7 = vget_low_s16(src[7]); + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); + + *res = m0123; +} + +static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, + int sy, int gamma) { + int16x8_t s0, s1, s2, s3; + transpose_elems_s16_4x8( + vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), + vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]), + vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3); + + int16x8_t f[4]; + load_filters_4(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + + int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; + + *res = horizontal_add_4d_s32x4(m0123_pairs); +} + +static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy) { + int16x8_t s0 = src[0]; + int16x8_t s1 = 
src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); + + int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); + + *res_low = m0123; + *res_high = m4567; +} + +static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { + int16x8_t s0 = src[0]; + int16x8_t s1 = src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t f[8]; + load_filters_8(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = 
vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4])); + m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4])); + int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5])); + m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5])); + int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6])); + m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6])); + int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7])); + m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7])); + + int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; + int32x4_t m4567_pairs[] = { m4, m5, m6, m7 }; + + *res_low = horizontal_add_4d_s32x4(m0123_pairs); + *res_high = horizontal_add_4d_s32x4(m4567_pairs); +} + +void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, conv_params, alpha, beta, gamma, delta); +} diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.h b/third_party/aom/av1/common/arm/warp_plane_neon.h new file mode 100644 index 0000000000..5afd72f4ab --- /dev/null +++ b/third_party/aom/av1/common/arm/warp_plane_neon.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_ +#define AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_ + +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "config/av1_rtcd.h" +#include "av1/common/warped_motion.h" +#include "av1/common/scale.h" + +static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, + int alpha); + +static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, + int alpha); + +static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx); + +static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx); + +static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, + int sy); + +static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, + int sy, int gamma); + +static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy); + +static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma); + +static INLINE void load_filters_4(int16x8_t out[], int offset, int stride) { + out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[2] = vld1q_s16((int16_t 
*)(av1_warped_filter + ((offset + 2 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >> + WARPEDDIFF_PREC_BITS))); +} + +static INLINE void load_filters_8(int16x8_t out[], int offset, int stride) { + out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[4] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 4 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[5] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 5 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[6] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 6 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[7] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 7 * stride) >> + WARPEDDIFF_PREC_BITS))); +} + +static INLINE int clamp_iy(int iy, int height) { + return clamp(iy, 0, height - 1); +} + +static INLINE void warp_affine_horizontal(const uint8_t *ref, int width, + int height, int stride, int p_width, + int p_height, int16_t alpha, + int16_t beta, const int64_t x4, + const int64_t y4, const int i, + int16x8_t tmp[]) { + const int bd = 8; + const int reduce_bits_horiz = ROUND0_BITS; + const int height_limit = AOMMIN(8, p_height - i) + 7; + + int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + if (ix4 <= -7) { + for (int k = 0; k < height_limit; ++k) { + int iy = clamp_iy(iy4 + k 
- 7, height); + int16_t dup_val = + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } else if (ix4 >= width + 6) { + for (int k = 0; k < height_limit; ++k) { + int iy = clamp_iy(iy4 + k - 7, height); + int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } + + static const uint8_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; + const uint8x16_t indx = vld1q_u8(kIotaArr); + + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + +#define APPLY_HORIZONTAL_SHIFT(fn, ...) \ + do { \ + if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ + for (int k = 0; k < height_limit; ++k) { \ + const int iy = clamp_iy(iy4 + k - 7, height); \ + const uint8_t *src = ref + iy * stride + ix4 - 7; \ + uint8x16_t src_1 = vld1q_u8(src); \ + \ + if (out_of_boundary_left >= 0) { \ + int limit = out_of_boundary_left + 1; \ + uint8x16_t cmp_vec = vdupq_n_u8(out_of_boundary_left); \ + uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \ + uint8x16_t mask_val = vcleq_u8(indx, cmp_vec); \ + src_1 = vbslq_u8(mask_val, vec_dup, src_1); \ + } \ + if (out_of_boundary_right >= 0) { \ + int limit = 15 - (out_of_boundary_right + 1); \ + uint8x16_t cmp_vec = vdupq_n_u8(15 - out_of_boundary_right); \ + uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \ + uint8x16_t mask_val = vcgeq_u8(indx, cmp_vec); \ + src_1 = vbslq_u8(mask_val, vec_dup, src_1); \ + } \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } else { \ + for (int k = 0; k < height_limit; ++k) { \ + const int iy = clamp_iy(iy4 + k - 7, height); \ + const uint8_t *src = ref + iy * stride + ix4 - 7; \ + uint8x16_t src_1 = vld1q_u8(src); \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } \ + } while (0) + + 
if (p_width == 4) { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, sx4); + } else { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)), + alpha); + } + } + } else { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, sx4); + } else { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)), + alpha); + } + } + } +} + +static INLINE void warp_affine_vertical( + uint8_t *pred, int p_width, int p_height, int p_stride, int is_compound, + uint16_t *dst, int dst_stride, int do_average, int use_dist_wtd_comp_avg, + int16_t gamma, int16_t delta, const int64_t y4, const int i, const int j, + int16x8_t tmp[], const int fwd, const int bwd) { + const int bd = 8; + const int reduce_bits_horiz = ROUND0_BITS; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + int add_const_vert; + if (is_compound) { + add_const_vert = + (1 << offset_bits_vert) + (1 << (COMPOUND_ROUND1_BITS - 1)); + } else { + add_const_vert = + (1 << offset_bits_vert) + (1 << (2 * FILTER_BITS - ROUND0_BITS - 1)); + } + const int sub_constant = (1 << (bd - 1)) + (1 << bd); + + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int res_sub_const = + (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << 
WARPEDDIFF_PREC_BITS); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + if (p_width > 4) { + for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + const int16x8_t *v_src = tmp + (k + 4); + + int32x4_t res_lo, res_hi; + if (gamma == 0) { + vertical_filter_8x1_f1(v_src, &res_lo, &res_hi, sy); + } else { + vertical_filter_8x1_f8(v_src, &res_lo, &res_hi, sy, gamma); + } + + res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert)); + res_hi = vaddq_s32(res_hi, vdupq_n_s32(add_const_vert)); + + if (is_compound) { + uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j]; + int16x8_t res_s16 = + vcombine_s16(vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS), + vshrn_n_s32(res_hi, COMPOUND_ROUND1_BITS)); + if (do_average) { + int16x8_t tmp16 = vreinterpretq_s16_u16(vld1q_u16(p)); + if (use_dist_wtd_comp_avg) { + int32x4_t tmp32_lo = vmull_n_s16(vget_low_s16(tmp16), fwd); + int32x4_t tmp32_hi = vmull_n_s16(vget_high_s16(tmp16), fwd); + tmp32_lo = vmlal_n_s16(tmp32_lo, vget_low_s16(res_s16), bwd); + tmp32_hi = vmlal_n_s16(tmp32_hi, vget_high_s16(res_s16), bwd); + tmp16 = vcombine_s16(vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS), + vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS)); + } else { + tmp16 = vhaddq_s16(tmp16, res_s16); + } + int16x8_t res = vaddq_s16(tmp16, vdupq_n_s16(res_sub_const)); + uint8x8_t res8 = vqshrun_n_s16( + res, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); + vst1_u8(&pred[(i + k + 4) * p_stride + j], res8); + } else { + vst1q_u16(p, vreinterpretq_u16_s16(res_s16)); + } + } else { + int16x8_t res16 = + vcombine_s16(vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS), + vshrn_n_s32(res_hi, 2 * FILTER_BITS - ROUND0_BITS)); + res16 = vsubq_s16(res16, vdupq_n_s16(sub_constant)); + + uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j]; + vst1_u8(p, vqmovun_s16(res16)); + } + } + } else { + // p_width == 4 + for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + 
const int16x8_t *v_src = tmp + (k + 4); + + int32x4_t res_lo; + if (gamma == 0) { + vertical_filter_4x1_f1(v_src, &res_lo, sy); + } else { + vertical_filter_4x1_f4(v_src, &res_lo, sy, gamma); + } + + res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert)); + + if (is_compound) { + uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j]; + + int16x4_t res_lo_s16 = vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS); + if (do_average) { + uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j]; + int16x4_t tmp16_lo = vreinterpret_s16_u16(vld1_u16(p)); + if (use_dist_wtd_comp_avg) { + int32x4_t tmp32_lo = vmull_n_s16(tmp16_lo, fwd); + tmp32_lo = vmlal_n_s16(tmp32_lo, res_lo_s16, bwd); + tmp16_lo = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS); + } else { + tmp16_lo = vhadd_s16(tmp16_lo, res_lo_s16); + } + int16x4_t res = vadd_s16(tmp16_lo, vdup_n_s16(res_sub_const)); + uint8x8_t res8 = vqshrun_n_s16( + vcombine_s16(res, vdup_n_s16(0)), + 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); + vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res8), 0); + } else { + uint16x4_t res_u16_low = vreinterpret_u16_s16(res_lo_s16); + vst1_u16(p, res_u16_low); + } + } else { + int16x4_t res16 = vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS); + res16 = vsub_s16(res16, vdup_n_s16(sub_constant)); + + uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j]; + uint8x8_t val = vqmovun_s16(vcombine_s16(res16, vdup_n_s16(0))); + vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0); + } + } + } +} + +static INLINE void av1_warp_affine_common( + const int32_t *mat, const uint8_t *ref, int width, int height, int stride, + uint8_t *pred, int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const int is_compound = conv_params->is_compound; + 
uint16_t *const dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + assert(IMPLIES(is_compound, dst != NULL)); + assert(IMPLIES(do_average, is_compound)); + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + int16x8_t tmp[15]; + warp_affine_horizontal(ref, width, height, stride, p_width, p_height, + alpha, beta, x4, y4, i, tmp); + warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst, + dst_stride, do_average, use_dist_wtd_comp_avg, gamma, + delta, y4, i, j, tmp, w0, w1); + } + } +} + +#endif // AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c b/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c new file mode 100644 index 0000000000..39e3ad99f4 --- /dev/null +++ b/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "warp_plane_neon.h" + +DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[4]; + load_filters_4(f, sx, alpha); + + int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); + int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); + + uint8x8_t in0 = vget_low_u8(in); + uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); + uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); + uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3)); + + int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8); + int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8); + + int32x4_t tmp_res_low = vpaddq_s32(m01, m23); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[8]; + load_filters_8(f, sx, alpha); + + int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); + int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); + int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5])); + int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7])); + + uint8x8_t in0 = vget_low_u8(in); + uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); + uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); + uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3)); + uint8x8_t in4 = 
vget_low_u8(vextq_u8(in, in, 4)); + uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5)); + uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6)); + uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7)); + + int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8); + int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8); + int32x4_t m45 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in4, in5), f45_u8); + int32x4_t m67 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in6, in7), f67_u8); + + int32x4_t tmp_res_low = vpaddq_s32(m01, m23); + int32x4_t tmp_res_high = vpaddq_s32(m45, m67); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); + + uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); + uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); + uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); + + int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0); + m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); + + int32x4_t tmp_res_low = m0123; + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); + + uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); + uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); + uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); + uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); + uint8x16_t in_89ab = vqtbl1q_u8(in, perm2); + + int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0); + m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); + + int32x4_t m4567 = vusdotq_laneq_s32(vdupq_n_s32(0), in_4567, f_s8, 0); + m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1); + + int32x4_t tmp_res_low = m0123; + int32x4_t tmp_res_high = m4567; + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, + int sy) { + int16x4_t s0 = vget_low_s16(src[0]); + int16x4_t s1 = vget_low_s16(src[1]); + int16x4_t s2 = vget_low_s16(src[2]); + int16x4_t s3 = vget_low_s16(src[3]); + int16x4_t s4 = vget_low_s16(src[4]); + int16x4_t s5 = vget_low_s16(src[5]); + int16x4_t s6 = vget_low_s16(src[6]); + int16x4_t s7 = vget_low_s16(src[7]); + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); + + *res = m0123; +} + +static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, + int sy, int 
gamma) { + int16x8_t s0, s1, s2, s3; + transpose_elems_s16_4x8( + vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), + vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]), + vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3); + + int16x8_t f[4]; + load_filters_4(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + + int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; + + *res = horizontal_add_4d_s32x4(m0123_pairs); +} + +static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy) { + int16x8_t s0 = src[0]; + int16x8_t s1 = src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); + + int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), 
vget_low_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); + + *res_low = m0123; + *res_high = m4567; +} + +static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { + int16x8_t s0 = src[0]; + int16x8_t s1 = src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t f[8]; + load_filters_8(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4])); + m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4])); + int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5])); + m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5])); + int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6])); + m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6])); + int32x4_t m7 = vmull_s16(vget_low_s16(s7), 
vget_low_s16(f[7])); + m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7])); + + int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; + int32x4_t m4567_pairs[] = { m4, m5, m6, m7 }; + + *res_low = horizontal_add_4d_s32x4(m0123_pairs); + *res_high = horizontal_add_4d_s32x4(m4567_pairs); +} + +void av1_warp_affine_neon_i8mm(const int32_t *mat, const uint8_t *ref, + int width, int height, int stride, uint8_t *pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params, + int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, conv_params, alpha, beta, gamma, delta); +} diff --git a/third_party/aom/av1/common/arm/warp_plane_sve.c b/third_party/aom/av1/common/arm/warp_plane_sve.c new file mode 100644 index 0000000000..8a4bf5747b --- /dev/null +++ b/third_party/aom/av1/common/arm/warp_plane_sve.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "aom_dsp/arm/dot_sve.h" +#include "warp_plane_neon.h" + +DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[4]; + load_filters_4(f, sx, alpha); + + int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); + int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); + + uint8x8_t in0 = vget_low_u8(in); + uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); + uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); + uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3)); + + int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8); + int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8); + + int32x4_t tmp_res_low = vpaddq_s32(m01, m23); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[8]; + load_filters_8(f, sx, alpha); + + int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); + int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); + int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5])); + int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7])); + + uint8x8_t in0 = vget_low_u8(in); + uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); + uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); + uint8x8_t in3 = 
vget_low_u8(vextq_u8(in, in, 3)); + uint8x8_t in4 = vget_low_u8(vextq_u8(in, in, 4)); + uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5)); + uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6)); + uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7)); + + int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8); + int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8); + int32x4_t m45 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in4, in5), f45_u8); + int32x4_t m67 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in6, in7), f67_u8); + + int32x4_t tmp_res_low = vpaddq_s32(m01, m23); + int32x4_t tmp_res_high = vpaddq_s32(m45, m67); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); + + uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); + uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); + uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); + + int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0); + m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); + + int32x4_t tmp_res_low = m0123; + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); + + uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); + uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); + uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); + uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); + uint8x16_t in_89ab = vqtbl1q_u8(in, perm2); + + int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0); + m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); + + int32x4_t m4567 = vusdotq_laneq_s32(vdupq_n_s32(0), in_4567, f_s8, 0); + m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1); + + int32x4_t tmp_res_low = m0123; + int32x4_t tmp_res_high = m4567; + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, + int sy) { + int16x4_t s0 = vget_low_s16(src[0]); + int16x4_t s1 = vget_low_s16(src[1]); + int16x4_t s2 = vget_low_s16(src[2]); + int16x4_t s3 = vget_low_s16(src[3]); + int16x4_t s4 = vget_low_s16(src[4]); + int16x4_t s5 = vget_low_s16(src[5]); + int16x4_t s6 = vget_low_s16(src[6]); + int16x4_t s7 = vget_low_s16(src[7]); + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); + + *res = m0123; +} + +static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, + int sy, int 
gamma) { + int16x8_t s0, s1, s2, s3; + transpose_elems_s16_4x8( + vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), + vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]), + vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3); + + int16x8_t f[4]; + load_filters_4(f, sy, gamma); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + + *res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); +} + +static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy) { + int16x8_t s0 = src[0]; + int16x8_t s1 = src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); + + int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); + m4567 = 
vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); + + *res_low = m0123; + *res_high = m4567; +} + +static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { + int16x8_t s0 = src[0]; + int16x8_t s1 = src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t f[8]; + load_filters_8(f, sy, gamma); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]); + int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]); + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]); + int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + int64x2_t m45 = vpaddq_s64(m4, m5); + int64x2_t m67 = vpaddq_s64(m6, m7); + + *res_low = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + *res_high = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); +} + +void av1_warp_affine_sve(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, 
conv_params, alpha, beta, gamma, delta); +} diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c new file mode 100644 index 0000000000..6440c16adb --- /dev/null +++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" +#include "av1/common/common.h" +#include "av1/common/restoration.h" + +static INLINE uint16x8_t wiener_convolve5_8_2d_h( + const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2, + const uint8x8_t t3, const uint8x8_t t4, const int16x4_t x_filter, + const int32x4_t round_vec, const uint16x8_t im_max_val) { + // Since the Wiener filter is symmetric about the middle tap (tap 2) add + // mirrored source elements before multiplying filter coefficients. + int16x8_t s04 = vreinterpretq_s16_u16(vaddl_u8(t0, t4)); + int16x8_t s13 = vreinterpretq_s16_u16(vaddl_u8(t1, t3)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + + // x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) 
+ int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3); + + int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS), + vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS)); + + return vminq_u16(res, im_max_val); +} + +static INLINE void convolve_add_src_horiz_5tap_neon( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, + const int32x4_t round_vec, const uint16x8_t im_max_val) { + do { + const uint8_t *s = src_ptr; + uint16_t *d = dst_ptr; + int width = w; + + do { + uint8x8_t s0, s1, s2, s3, s4; + load_u8_8x5(s, 1, &s0, &s1, &s2, &s3, &s4); + + uint16x8_t d0 = wiener_convolve5_8_2d_h(s0, s1, s2, s3, s4, x_filter, + round_vec, im_max_val); + + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); +} + +static INLINE uint16x8_t wiener_convolve7_8_2d_h( + const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2, + const uint8x8_t t3, const uint8x8_t t4, const uint8x8_t t5, + const uint8x8_t t6, const int16x4_t x_filter, const int32x4_t round_vec, + const uint16x8_t im_max_val) { + // Since the Wiener filter is symmetric about the middle tap (tap 3) add + // mirrored source elements before multiplying by filter coefficients. 
+ int16x8_t s06 = vreinterpretq_s16_u16(vaddl_u8(t0, t6)); + int16x8_t s15 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + int16x8_t s24 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); + + int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS), + vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS)); + + return vminq_u16(res, im_max_val); +} + +static INLINE void convolve_add_src_horiz_7tap_neon( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, + const int32x4_t round_vec, const uint16x8_t im_max_val) { + do { + const uint8_t *s = src_ptr; + uint16_t *d = dst_ptr; + int width = w; + + do { + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + uint16x8_t d0 = wiener_convolve7_8_2d_h(s0, s1, s2, s3, s4, s5, s6, + x_filter, round_vec, im_max_val); + + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); +} + +static INLINE uint8x8_t wiener_convolve5_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter, + const int32x4_t round_vec) { + // Since the Wiener filter is symmetric about the middle tap (tap 2) add + // mirrored source elements before multiplying 
by filter coefficients. + int16x8_t s04 = vaddq_s16(s0, s4); + int16x8_t s13 = vaddq_s16(s1, s3); + + int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), y_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), y_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), y_filter, 3); + + int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), y_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), y_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), y_filter, 3); + + int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS); + int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res_lo, res_hi)); +} + +static INLINE void convolve_add_src_vert_5tap_neon( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, + const int32x4_t round_vec) { + do { + const int16_t *s = (int16_t *)src; + uint8_t *d = dst; + int height = h; + + while (height > 3) { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d0 = + wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec); + uint8x8_t d1 = + wiener_convolve5_8_2d_v(s1, s2, s3, s4, s5, y_filter, round_vec); + uint8x8_t d2 = + wiener_convolve5_8_2d_v(s2, s3, s4, s5, s6, y_filter, round_vec); + uint8x8_t d3 = + wiener_convolve5_8_2d_v(s3, s4, s5, s6, s7, y_filter, round_vec); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } + + while (height-- != 0) { + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + + uint8x8_t d0 = + wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec); + + vst1_u8(d, d0); + + d += dst_stride; + s += src_stride; + } + + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +static INLINE uint8x8_t 
wiener_convolve7_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec) { + // Since the Wiener filter is symmetric about the middle tap (tap 3) add + // mirrored source elements before multiplying by filter coefficients. + int16x8_t s06 = vaddq_s16(s0, s6); + int16x8_t s15 = vaddq_s16(s1, s5); + int16x8_t s24 = vaddq_s16(s2, s4); + + int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), y_filter, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), y_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), y_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), y_filter, 3); + + int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), y_filter, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), y_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), y_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), y_filter, 3); + + int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS); + int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res_lo, res_hi)); +} + +static INLINE void convolve_add_src_vert_7tap_neon( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, + const int32x4_t round_vec) { + do { + const int16_t *s = (int16_t *)src; + uint8_t *d = dst; + int height = h; + + while (height > 3) { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; + load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9); + + uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6, + y_filter, round_vec); + uint8x8_t d1 = wiener_convolve7_8_2d_v(s1, s2, s3, s4, s5, s6, s7, + y_filter, round_vec); + uint8x8_t d2 = wiener_convolve7_8_2d_v(s2, s3, s4, s5, s6, s7, s8, + y_filter, 
round_vec); + uint8x8_t d3 = wiener_convolve7_8_2d_v(s3, s4, s5, s6, s7, s8, s9, + y_filter, round_vec); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } + + while (height-- != 0) { + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6, + y_filter, round_vec); + + vst1_u8(d, d0); + + d += dst_stride; + s += src_stride; + } + + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) { + assert(filter[7] == 0); + if (filter[0] == 0 && filter[6] == 0) { + return WIENER_WIN_REDUCED; + } + return WIENER_WIN; +} + +// Wiener filter 2D +// Apply horizontal filter and store in a temporary buffer. When applying +// vertical filter, overwrite the original pixel values. +void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *x_filter, int x_step_q4, + const int16_t *y_filter, int y_step_q4, + int w, int h, + const WienerConvolveParams *conv_params) { + (void)x_step_q4; + (void)y_step_q4; + (void)conv_params; + + assert(w % 8 == 0); + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(x_filter[7] == 0 && y_filter[7] == 0); + // For bd == 8, assert horizontal filtering output will not exceed 15-bit: + assert(8 + 1 + FILTER_BITS - conv_params->round_0 <= 15); + + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]); + + const int x_filter_taps = get_wiener_filter_taps(x_filter); + const int y_filter_taps = get_wiener_filter_taps(y_filter); + int16x4_t x_filter_s16 = vld1_s16(x_filter); + int16x4_t y_filter_s16 = vld1_s16(y_filter); + // Add 128 to tap 3. (Needed for rounding.) 
+ x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48)); + y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48)); + + const int im_stride = MAX_SB_SIZE; + const int im_h = h + y_filter_taps - 1; + const int horiz_offset = x_filter_taps / 2; + const int vert_offset = (y_filter_taps / 2) * (int)src_stride; + + const int bd = 8; + const uint16x8_t im_max_val = + vdupq_n_u16((1 << (bd + 1 + FILTER_BITS - WIENER_ROUND0_BITS)) - 1); + const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1)); + + const int32x4_t vert_round_vec = + vdupq_n_s32((1 << (2 * FILTER_BITS - WIENER_ROUND0_BITS - 1)) - + (1 << (bd + (2 * FILTER_BITS - WIENER_ROUND0_BITS) - 1))); + + if (x_filter_taps == WIENER_WIN_REDUCED) { + convolve_add_src_horiz_5tap_neon(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, w, im_h, + x_filter_s16, horiz_round_vec, im_max_val); + } else { + convolve_add_src_horiz_7tap_neon(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, w, im_h, + x_filter_s16, horiz_round_vec, im_max_val); + } + + if (y_filter_taps == WIENER_WIN_REDUCED) { + convolve_add_src_vert_5tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_s16, vert_round_vec); + } else { + convolve_add_src_vert_7tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_s16, vert_round_vec); + } +} diff --git a/third_party/aom/av1/common/av1_common_int.h b/third_party/aom/av1/common/av1_common_int.h new file mode 100644 index 0000000000..4c0cb99d2b --- /dev/null +++ b/third_party/aom/av1/common/av1_common_int.h @@ -0,0 +1,1882 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_AV1_COMMON_INT_H_ +#define AOM_AV1_COMMON_AV1_COMMON_INT_H_ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/internal/aom_codec_internal.h" +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_util/aom_thread.h" +#include "av1/common/alloccommon.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/enums.h" +#include "av1/common/frame_buffers.h" +#include "av1/common/mv.h" +#include "av1/common/quant_common.h" +#include "av1/common/restoration.h" +#include "av1/common/tile_common.h" +#include "av1/common/timing.h" +#include "aom_dsp/grain_params.h" +#include "aom_dsp/grain_table.h" +#include "aom_dsp/odintrin.h" +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__clang__) && defined(__has_warning) +#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") +#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT +#endif +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT +#endif + +#ifndef AOM_FALLTHROUGH_INTENDED +#define AOM_FALLTHROUGH_INTENDED \ + do { \ + } while (0) +#endif + +#define CDEF_MAX_STRENGTHS 16 + +/* Constant values while waiting for the sequence header */ +#define FRAME_ID_LENGTH 15 +#define DELTA_FRAME_ID_LENGTH 14 + +#define FRAME_CONTEXTS (FRAME_BUFFERS + 1) +// Extra frame context which is always kept at default values +#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1) +#define PRIMARY_REF_BITS 3 +#define PRIMARY_REF_NONE 7 + +#define NUM_PING_PONG_BUFFERS 2 + +#define MAX_NUM_TEMPORAL_LAYERS 8 +#define MAX_NUM_SPATIAL_LAYERS 4 +/* clang-format off */ +// clang-format seems to think 
this is a pointer dereference and not a +// multiplication. +#define MAX_NUM_OPERATING_POINTS \ + (MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS) +/* clang-format on */ + +// TODO(jingning): Turning this on to set up transform coefficient +// processing timer. +#define TXCOEFF_TIMER 0 +#define TXCOEFF_COST_TIMER 0 + +/*!\cond */ + +enum { + SINGLE_REFERENCE = 0, + COMPOUND_REFERENCE = 1, + REFERENCE_MODE_SELECT = 2, + REFERENCE_MODES = 3, +} UENUM1BYTE(REFERENCE_MODE); + +enum { + /** + * Frame context updates are disabled + */ + REFRESH_FRAME_CONTEXT_DISABLED, + /** + * Update frame context to values resulting from backward probability + * updates based on entropy/counts in the decoded frame + */ + REFRESH_FRAME_CONTEXT_BACKWARD, +} UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE); + +#define MFMV_STACK_SIZE 3 +typedef struct { + int_mv mfmv0; + uint8_t ref_frame_offset; +} TPL_MV_REF; + +typedef struct { + int_mv mv; + MV_REFERENCE_FRAME ref_frame; +} MV_REF; + +typedef struct RefCntBuffer { + // For a RefCntBuffer, the following are reference-holding variables: + // - cm->ref_frame_map[] + // - cm->cur_frame + // - cm->scaled_ref_buf[] (encoder only) + // - pbi->output_frame_index[] (decoder only) + // With that definition, 'ref_count' is the number of reference-holding + // variables that are currently referencing this buffer. + // For example: + // - suppose this buffer is at index 'k' in the buffer pool, and + // - Total 'n' of the variables / array elements above have value 'k' (that + // is, they are pointing to buffer at index 'k'). + // Then, pool->frame_bufs[k].ref_count = n. + int ref_count; + + unsigned int order_hint; + unsigned int ref_order_hints[INTER_REFS_PER_FRAME]; + + // These variables are used only in encoder and compare the absolute + // display order hint to compute the relative distance and overcome + // the limitation of get_relative_dist() which returns incorrect + // distance when a very old frame is used as a reference. 
+ unsigned int display_order_hint; + unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME]; + // Frame's level within the hierarchical structure. + unsigned int pyramid_level; + MV_REF *mvs; + uint8_t *seg_map; + struct segmentation seg; + int mi_rows; + int mi_cols; + // Width and height give the size of the buffer (before any upscaling, unlike + // the sizes that can be derived from the buf structure) + int width; + int height; + WarpedMotionParams global_motion[REF_FRAMES]; + int showable_frame; // frame can be used as show existing frame in future + uint8_t film_grain_params_present; + aom_film_grain_t film_grain_params; + aom_codec_frame_buffer_t raw_frame_buffer; + YV12_BUFFER_CONFIG buf; + int temporal_id; // Temporal layer ID of the frame + int spatial_id; // Spatial layer ID of the frame + FRAME_TYPE frame_type; + + // This is only used in the encoder but needs to be indexed per ref frame + // so it's extremely convenient to keep it here. + int interp_filter_selected[SWITCHABLE]; + + // Inter frame reference frame delta for loop filter + int8_t ref_deltas[REF_FRAMES]; + + // 0 = ZERO_MV, MV + int8_t mode_deltas[MAX_MODE_LF_DELTAS]; + + FRAME_CONTEXT frame_context; +} RefCntBuffer; + +typedef struct BufferPool { +// Protect BufferPool from being accessed by several FrameWorkers at +// the same time during frame parallel decode. +// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. +// TODO(wtc): Remove this. See +// https://chromium-review.googlesource.com/c/webm/libvpx/+/560630. +#if CONFIG_MULTITHREAD + pthread_mutex_t pool_mutex; +#endif + + // Private data associated with the frame buffer callbacks. + void *cb_priv; + + aom_get_frame_buffer_cb_fn_t get_fb_cb; + aom_release_frame_buffer_cb_fn_t release_fb_cb; + + RefCntBuffer *frame_bufs; + uint8_t num_frame_bufs; + + // Frame buffers allocated internally by the codec. 
+ InternalFrameBufferList int_frame_buffers; +} BufferPool; + +/*!\endcond */ + +/*!\brief Parameters related to CDEF */ +typedef struct { + //! CDEF column line buffer + uint16_t *colbuf[MAX_MB_PLANE]; + //! CDEF top & bottom line buffer + uint16_t *linebuf[MAX_MB_PLANE]; + //! CDEF intermediate buffer + uint16_t *srcbuf; + //! CDEF column line buffer sizes + size_t allocated_colbuf_size[MAX_MB_PLANE]; + //! CDEF top and bottom line buffer sizes + size_t allocated_linebuf_size[MAX_MB_PLANE]; + //! CDEF intermediate buffer size + size_t allocated_srcbuf_size; + //! CDEF damping factor + int cdef_damping; + //! Number of CDEF strength values + int nb_cdef_strengths; + //! CDEF strength values for luma + int cdef_strengths[CDEF_MAX_STRENGTHS]; + //! CDEF strength values for chroma + int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; + //! Number of CDEF strength values in bits + int cdef_bits; + //! Number of rows in the frame in 4 pixel + int allocated_mi_rows; + //! Number of CDEF workers + int allocated_num_workers; +} CdefInfo; + +/*!\cond */ + +typedef struct { + int delta_q_present_flag; + // Resolution of delta quant + int delta_q_res; + int delta_lf_present_flag; + // Resolution of delta lf level + int delta_lf_res; + // This is a flag for number of deltas of loop filter level + // 0: use 1 delta, for y_vertical, y_horizontal, u, and v + // 1: use separate deltas for each filter level + int delta_lf_multi; +} DeltaQInfo; + +typedef struct { + int enable_order_hint; // 0 - disable order hint, and related tools + int order_hint_bits_minus_1; // dist_wtd_comp, ref_frame_mvs, + // frame_sign_bias + // if 0, enable_dist_wtd_comp and + // enable_ref_frame_mvs must be set as 0. + int enable_dist_wtd_comp; // 0 - disable dist-wtd compound modes + // 1 - enable it + int enable_ref_frame_mvs; // 0 - disable ref frame mvs + // 1 - enable it +} OrderHintInfo; + +// Sequence header structure. 
+// Note: All syntax elements of sequence_header_obu that need to be +// bit-identical across multiple sequence headers must be part of this struct, +// so that consistency is checked by are_seq_headers_consistent() function. +// One exception is the last member 'op_params' that is ignored by +// are_seq_headers_consistent() function. +typedef struct SequenceHeader { + int num_bits_width; + int num_bits_height; + int max_frame_width; + int max_frame_height; + // Whether current and reference frame IDs are signaled in the bitstream. + // Frame id numbers are additional information that do not affect the + // decoding process, but provide decoders with a way of detecting missing + // reference frames so that appropriate action can be taken. + uint8_t frame_id_numbers_present_flag; + int frame_id_length; + int delta_frame_id_length; + BLOCK_SIZE sb_size; // Size of the superblock used for this frame + int mib_size; // Size of the superblock in units of MI blocks + int mib_size_log2; // Log 2 of above. + + OrderHintInfo order_hint_info; + + uint8_t force_screen_content_tools; // 0 - force off + // 1 - force on + // 2 - adaptive + uint8_t still_picture; // Video is a single frame still picture + uint8_t reduced_still_picture_hdr; // Use reduced header for still picture + uint8_t force_integer_mv; // 0 - Don't force. 
MV can use subpel + // 1 - force to integer + // 2 - adaptive + uint8_t enable_filter_intra; // enables/disables filterintra + uint8_t enable_intra_edge_filter; // enables/disables edge upsampling + uint8_t enable_interintra_compound; // enables/disables interintra_compound + uint8_t enable_masked_compound; // enables/disables masked compound + uint8_t enable_dual_filter; // 0 - disable dual interpolation filter + // 1 - enable vert/horz filter selection + uint8_t enable_warped_motion; // 0 - disable warp for the sequence + // 1 - enable warp for the sequence + uint8_t enable_superres; // 0 - Disable superres for the sequence + // and no frame level superres flag + // 1 - Enable superres for the sequence + // enable per-frame superres flag + uint8_t enable_cdef; // To turn on/off CDEF + uint8_t enable_restoration; // To turn on/off loop restoration + BITSTREAM_PROFILE profile; + + // Color config. + aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1, + // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3. + uint8_t use_highbitdepth; // If true, we need to use 16bit frame buffers. + uint8_t monochrome; // Monochrome video + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + int color_range; + int subsampling_x; // Chroma subsampling for x + int subsampling_y; // Chroma subsampling for y + aom_chroma_sample_position_t chroma_sample_position; + uint8_t separate_uv_delta_q; + uint8_t film_grain_params_present; + + // Operating point info. + int operating_points_cnt_minus_1; + int operating_point_idc[MAX_NUM_OPERATING_POINTS]; + int timing_info_present; + aom_timing_info_t timing_info; + uint8_t decoder_model_info_present_flag; + aom_dec_model_info_t decoder_model_info; + uint8_t display_model_info_present_flag; + AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS]; + uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in spec. One bit: 0 or 1. 
+ + // IMPORTANT: the op_params member must be at the end of the struct so that + // are_seq_headers_consistent() can be implemented with a memcmp() call. + // TODO(urvang): We probably don't need the +1 here. + aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; +} SequenceHeader; + +typedef struct { + int skip_mode_allowed; + int skip_mode_flag; + int ref_frame_idx_0; + int ref_frame_idx_1; +} SkipModeInfo; + +typedef struct { + FRAME_TYPE frame_type; + REFERENCE_MODE reference_mode; + + unsigned int order_hint; + unsigned int display_order_hint; + // Frame's level within the hierarchical structure. + unsigned int pyramid_level; + unsigned int frame_number; + SkipModeInfo skip_mode_info; + int refresh_frame_flags; // Which ref frames are overwritten by this frame + int frame_refs_short_signaling; +} CurrentFrame; + +/*!\endcond */ + +/*! + * \brief Frame level features. + */ +typedef struct { + /*! + * If true, CDF update in the symbol encoding/decoding process is disabled. + */ + bool disable_cdf_update; + /*! + * If true, motion vectors are specified to eighth pel precision; and + * if false, motion vectors are specified to quarter pel precision. + */ + bool allow_high_precision_mv; + /*! + * If true, force integer motion vectors; if false, use the default. + */ + bool cur_frame_force_integer_mv; + /*! + * If true, palette tool and/or intra block copy tools may be used. + */ + bool allow_screen_content_tools; + bool allow_intrabc; /*!< If true, intra block copy tool may be used. */ + bool allow_warped_motion; /*!< If true, frame may use warped motion mode. */ + /*! + * If true, using previous frames' motion vectors for prediction is allowed. + */ + bool allow_ref_frame_mvs; + /*! + * If true, frame is fully lossless at coded resolution. + * */ + bool coded_lossless; + /*! + * If true, frame is fully lossless at upscaled resolution. + */ + bool all_lossless; + /*! 
+ * If true, the frame is restricted to a reduced subset of the full set of + * transform types. + */ + bool reduced_tx_set_used; + /*! + * If true, error resilient mode is enabled. + * Note: Error resilient mode allows the syntax of a frame to be parsed + * independently of previously decoded frames. + */ + bool error_resilient_mode; + /*! + * If false, only MOTION_MODE that may be used is SIMPLE_TRANSLATION; + * if true, all MOTION_MODES may be used. + */ + bool switchable_motion_mode; + TX_MODE tx_mode; /*!< Transform mode at frame level. */ + InterpFilter interp_filter; /*!< Interpolation filter at frame level. */ + /*! + * The reference frame that contains the CDF values and other state that + * should be loaded at the start of the frame. + */ + int primary_ref_frame; + /*! + * Byte alignment of the planes in the reference buffers. + */ + int byte_alignment; + /*! + * Flag signaling how frame contexts should be updated at the end of + * a frame decode. + */ + REFRESH_FRAME_CONTEXT_MODE refresh_frame_context; +} FeatureFlags; + +/*! + * \brief Params related to tiles. + */ +typedef struct CommonTileParams { + int cols; /*!< number of tile columns that frame is divided into */ + int rows; /*!< number of tile rows that frame is divided into */ + int max_width_sb; /*!< maximum tile width in superblock units. */ + int max_height_sb; /*!< maximum tile height in superblock units. */ + + /*! + * Min width of non-rightmost tile in MI units. Only valid if cols > 1. + */ + int min_inner_width; + + /*! + * If true, tiles are uniformly spaced with power-of-two number of rows and + * columns. + * If false, tiles have explicitly configured widths and heights. + */ + int uniform_spacing; + + /** + * \name Members only valid when uniform_spacing == 1 + */ + /**@{*/ + int log2_cols; /*!< log2 of 'cols'. */ + int log2_rows; /*!< log2 of 'rows'. */ + int width; /*!< tile width in MI units */ + int height; /*!< tile height in MI units */ + /**@}*/ + + /*! 
+ * Min num of tile columns possible based on 'max_width_sb' and frame width. + */ + int min_log2_cols; + /*! + * Min num of tile rows possible based on 'max_height_sb' and frame height. + */ + int min_log2_rows; + /*! + * Max num of tile columns possible based on frame width. + */ + int max_log2_cols; + /*! + * Max num of tile rows possible based on frame height. + */ + int max_log2_rows; + /*! + * log2 of min number of tiles (same as min_log2_cols + min_log2_rows). + */ + int min_log2; + /*! + * col_start_sb[i] is the start position of tile column i in superblock units. + * valid for 0 <= i <= cols + */ + int col_start_sb[MAX_TILE_COLS + 1]; + /*! + * row_start_sb[i] is the start position of tile row i in superblock units. + * valid for 0 <= i <= rows + */ + int row_start_sb[MAX_TILE_ROWS + 1]; + /*! + * If true, we are using large scale tile mode. + */ + unsigned int large_scale; + /*! + * Only relevant when large_scale == 1. + * If true, the independent decoding of a single tile or a section of a frame + * is allowed. + */ + unsigned int single_tile_decoding; +} CommonTileParams; + +typedef struct CommonModeInfoParams CommonModeInfoParams; +/*! + * \brief Params related to MB_MODE_INFO arrays and related info. + */ +struct CommonModeInfoParams { + /*! + * Number of rows in the frame in 16 pixel units. + * This is computed from frame height aligned to a multiple of 8. + */ + int mb_rows; + /*! + * Number of cols in the frame in 16 pixel units. + * This is computed from frame width aligned to a multiple of 8. + */ + int mb_cols; + + /*! + * Total MBs = mb_rows * mb_cols. + */ + int MBs; + + /*! + * Number of rows in the frame in 4 pixel (MB_MODE_INFO) units. + * This is computed from frame height aligned to a multiple of 8. + */ + int mi_rows; + /*! + * Number of cols in the frame in 4 pixel (MB_MODE_INFO) units. + * This is computed from frame width aligned to a multiple of 8. + */ + int mi_cols; + + /*! 
+ * An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block + * in the frame. + * Note: This array should be treated like a scratch memory, and should NOT be + * accessed directly, in most cases. Please use 'mi_grid_base' array instead. + */ + MB_MODE_INFO *mi_alloc; + /*! + * Number of allocated elements in 'mi_alloc'. + */ + int mi_alloc_size; + /*! + * Stride for 'mi_alloc' array. + */ + int mi_alloc_stride; + /*! + * The minimum block size that each element in 'mi_alloc' can correspond to. + * For decoder, this is always BLOCK_4X4. + * For encoder, this is BLOCK_8X8 for resolution >= 4k case or REALTIME mode + * case. Otherwise, this is BLOCK_4X4. + */ + BLOCK_SIZE mi_alloc_bsize; + + /*! + * Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'. + * It's possible that: + * - Multiple pointers in the grid point to the same element in 'mi_alloc' + * (for example, for all 4x4 blocks that belong to the same partition block). + * - Some pointers can be NULL (for example, for blocks outside visible area). + */ + MB_MODE_INFO **mi_grid_base; + /*! + * Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also). + */ + int mi_grid_size; + /*! + * Stride for 'mi_grid_base' (and 'tx_type_map' also). + */ + int mi_stride; + + /*! + * An array of tx types for each 4x4 block in the frame. + * Number of allocated elements is same as 'mi_grid_size', and stride is + * same as 'mi_stride'. So, indexing into 'tx_type_map' is same as that of + * 'mi_grid_base'. + */ + TX_TYPE *tx_type_map; + + /** + * \name Function pointers to allow separate logic for encoder and decoder. + */ + /**@{*/ + /*! + * Free the memory allocated to arrays in 'mi_params'. + * \param[in,out] mi_params object containing common mode info parameters + */ + void (*free_mi)(struct CommonModeInfoParams *mi_params); + /*! + * Initialize / reset appropriate arrays in 'mi_params'.
+ * \param[in,out] mi_params object containing common mode info parameters + */ + void (*setup_mi)(struct CommonModeInfoParams *mi_params); + /*! + * Allocate required memory for arrays in 'mi_params'. + * \param[in,out] mi_params object containing common mode info + * parameters + * \param width frame width + * \param height frame height + * \param min_partition_size minimum partition size allowed while + * encoding + */ + void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width, + int height, BLOCK_SIZE min_partition_size); + /**@}*/ +}; + +typedef struct CommonQuantParams CommonQuantParams; +/*! + * \brief Parameters related to quantization at the frame level. + */ +struct CommonQuantParams { + /*! + * Base qindex of the frame in the range 0 to 255. + */ + int base_qindex; + + /*! + * Delta of qindex (from base_qindex) for Y plane DC coefficient. + * Note: y_ac_delta_q is implicitly 0. + */ + int y_dc_delta_q; + + /*! + * Delta of qindex (from base_qindex) for U plane DC coefficients. + */ + int u_dc_delta_q; + /*! + * Delta of qindex (from base_qindex) for V plane DC coefficients. + * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0. + */ + int v_dc_delta_q; + + /*! + * Delta of qindex (from base_qindex) for U plane AC coefficients. + */ + int u_ac_delta_q; + /*! + * Delta of qindex (from base_qindex) for V plane AC coefficients. + * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0. + */ + int v_ac_delta_q; + + /* + * Note: The qindex per superblock may have a delta from the qindex obtained + * at frame level from parameters above, based on 'cm->delta_q_info'. + */ + + /** + * \name True dequantizers. + * The dequantizers below are true dequantizers used only in the + * dequantization process. They have the same coefficient + * shift/scale as TX.
+ */ + /**@{*/ + int16_t y_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for Y plane */ + int16_t u_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for U plane */ + int16_t v_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for V plane */ + /**@}*/ + + /** + * \name Global quantization matrix tables. + */ + /**@{*/ + /*! + * Global dequantization matrix table. + */ + const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; + /*! + * Global quantization matrix table. + */ + const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; + /**@}*/ + + /** + * \name Local dequantization matrix tables for each frame. + */ + /**@{*/ + /*! + * Local dequant matrix for Y plane. + */ + const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + /*! + * Local dequant matrix for U plane. + */ + const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + /*! + * Local dequant matrix for V plane. + */ + const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + /**@}*/ + + /*! + * Flag indicating whether quantization matrices are being used: + * - If true, qm_level_y, qm_level_u and qm_level_v indicate the level + * indices to be used to access appropriate global quant matrix tables. + * - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'. + */ + bool using_qmatrix; + /** + * \name Valid only when using_qmatrix == true + * Indicate the level indices to be used to access appropriate global quant + * matrix tables. + */ + /**@{*/ + int qmatrix_level_y; /*!< Level index for Y plane */ + int qmatrix_level_u; /*!< Level index for U plane */ + int qmatrix_level_v; /*!< Level index for V plane */ + /**@}*/ +}; + +typedef struct CommonContexts CommonContexts; +/*! + * \brief Contexts used for transmitting various symbols in the bitstream. + */ +struct CommonContexts { + /*! + * Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type. + * partition[i][j] is the context for ith tile row, jth mi_col. + */ + PARTITION_CONTEXT **partition; + + /*! 
+ * Context used to derive context for multiple symbols: + * - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit + * skip_txfm flag. + * - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit + * sign. + * entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col. + */ + ENTROPY_CONTEXT **entropy[MAX_MB_PLANE]; + + /*! + * Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to + * transmit 'is_split' flag to indicate if this transform block should be + * split into smaller sub-blocks. + * txfm[i][j] is the context for ith tile row, jth mi_col. + */ + TXFM_CONTEXT **txfm; + + /*! + * Dimensions that were used to allocate the arrays above. + * If these dimensions change, the arrays may have to be re-allocated. + */ + int num_planes; /*!< Corresponds to av1_num_planes(cm) */ + int num_tile_rows; /*!< Corresponds to cm->tiles.row */ + int num_mi_cols; /*!< Corresponds to cm->mi_params.mi_cols */ +}; + +/*! + * \brief Top level common structure used by both encoder and decoder. + */ +typedef struct AV1Common { + /*! + * Information about the current frame that is being coded. + */ + CurrentFrame current_frame; + /*! + * Code and details about current error status. + */ + struct aom_internal_error_info *error; + + /*! + * AV1 allows two types of frame scaling operations: + * 1. Frame super-resolution: that allows coding a frame at lower resolution + * and after decoding the frame, normatively scales and restores the frame -- + * inside the coding loop. + * 2. Frame resize: that allows coding frame at lower/higher resolution, and + * then non-normatively upscale the frame at the time of rendering -- outside + * the coding loop. + * Hence, the need for 3 types of dimensions. + */ + + /** + * \name Coded frame dimensions. + */ + /**@{*/ + int width; /*!< Coded frame width */ + int height; /*!< Coded frame height */ + /**@}*/ + + /** + * \name Rendered frame dimensions.
+ * Dimensions after applying both super-resolution and resize to the coded + * frame. Different from coded dimensions if super-resolution and/or resize + * are being used for this frame. + */ + /**@{*/ + int render_width; /*!< Rendered frame width */ + int render_height; /*!< Rendered frame height */ + /**@}*/ + + /** + * \name Super-resolved frame dimensions. + * Frame dimensions after applying super-resolution to the coded frame (if + * present), but before applying resize. + * Larger than the coded dimensions if super-resolution is being used for + * this frame. + * Different from rendered dimensions if resize is being used for this frame. + */ + /**@{*/ + int superres_upscaled_width; /*!< Super-resolved frame width */ + int superres_upscaled_height; /*!< Super-resolved frame height */ + /**@}*/ + + /*! + * The denominator of the superres scale used by this frame. + * Note: The numerator is fixed to be SCALE_NUMERATOR. + */ + uint8_t superres_scale_denominator; + + /*! + * buffer_removal_times[op_num] specifies the frame removal time in units of + * DecCT clock ticks counted from the removal time of the last random access + * point for operating point op_num. + * TODO(urvang): We probably don't need the +1 here. + */ + uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1]; + /*! + * Presentation time of the frame in clock ticks DispCT counted from the + * removal time of the last random access point for the operating point that + * is being decoded. + */ + uint32_t frame_presentation_time; + + /*! + * Buffer where previous frame is stored. + */ + RefCntBuffer *prev_frame; + + /*! + * Buffer into which the current frame will be stored and other related info. + * TODO(hkuang): Combine this with cur_buf in macroblockd. + */ + RefCntBuffer *cur_frame; + + /*! + * For encoder, we have a two-level mapping from reference frame type to the + * corresponding buffer in the buffer pool: + * * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ... 
+ * EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1) + * * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to + * the reference counted buffer structure RefCntBuffer, taken from the buffer + * pool cm->buffer_pool->frame_bufs. + * + * LAST_FRAME, ..., EXTREF_FRAME + * | | + * v v + * remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1] + * | | + * v v + * ref_frame_map[], ..., ref_frame_map[] + * + * Note: INTRA_FRAME always refers to the current frame, so there's no need to + * have a remapped index for the same. + */ + int remapped_ref_idx[REF_FRAMES]; + + /*! + * Scale of the current frame with respect to itself. + * This is currently used for intra block copy, which behaves like an inter + * prediction mode, where the reference frame is the current frame itself. + */ + struct scale_factors sf_identity; + + /*! + * Scale factors of the reference frame with respect to the current frame. + * This is required for generating inter prediction and will be non-identity + * for a reference frame, if it has different dimensions than the coded + * dimensions of the current frame. + */ + struct scale_factors ref_scale_factors[REF_FRAMES]; + + /*! + * For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to + * the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + * For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps + * remapped reference index 'j' (that is, original reference type 'i') to + * a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + */ + RefCntBuffer *ref_frame_map[REF_FRAMES]; + + /*! + * If true, this frame is actually shown after decoding. + * If false, this frame is coded in the bitstream, but not shown. It is only + * used as a reference for other frames coded later. + */ + int show_frame; + + /*! + * If true, this frame can be used as a show-existing frame for other frames + * coded later. 
+ * When 'show_frame' is true, this is always true for all non-keyframes. + * When 'show_frame' is false, this value is transmitted in the bitstream. + */ + int showable_frame; + + /*! + * If true, show an existing frame coded before, instead of actually coding a + * frame. The existing frame comes from one of the existing reference buffers, + * as signaled in the bitstream. + */ + int show_existing_frame; + + /*! + * Whether some features are allowed or not. + */ + FeatureFlags features; + + /*! + * Params related to MB_MODE_INFO arrays and related info. + */ + CommonModeInfoParams mi_params; + +#if CONFIG_ENTROPY_STATS + /*! + * Context type used by token CDFs, in the range 0 .. (TOKEN_CDF_Q_CTXS - 1). + */ + int coef_cdf_category; +#endif // CONFIG_ENTROPY_STATS + + /*! + * Quantization params. + */ + CommonQuantParams quant_params; + + /*! + * Segmentation info for current frame. + */ + struct segmentation seg; + + /*! + * Segmentation map for previous frame. + */ + uint8_t *last_frame_seg_map; + + /** + * \name Deblocking filter parameters. + */ + /**@{*/ + loop_filter_info_n lf_info; /*!< Loop filter info */ + struct loopfilter lf; /*!< Loop filter parameters */ + /**@}*/ + + /** + * \name Loop Restoration filter parameters. + */ + /**@{*/ + RestorationInfo rst_info[MAX_MB_PLANE]; /*!< Loop Restoration filter info */ + int32_t *rst_tmpbuf; /*!< Scratch buffer for self-guided restoration */ + RestorationLineBuffers *rlbs; /*!< Line buffers needed by loop restoration */ + YV12_BUFFER_CONFIG rst_frame; /*!< Stores the output of loop restoration */ + /**@}*/ + + /*! + * CDEF (Constrained Directional Enhancement Filter) parameters. + */ + CdefInfo cdef_info; + + /*! + * Parameters for film grain synthesis. + */ + aom_film_grain_t film_grain_params; + + /*! + * Parameters for delta quantization and delta loop filter level. + */ + DeltaQInfo delta_q_info; + + /*! + * Global motion parameters for each reference frame. 
+ */ + WarpedMotionParams global_motion[REF_FRAMES]; + + /*! + * Elements part of the sequence header, that are applicable for all the + * frames in the video. + */ + SequenceHeader *seq_params; + + /*! + * Current CDFs of all the symbols for the current frame. + */ + FRAME_CONTEXT *fc; + /*! + * Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE + * (e.g. for a keyframe). These default CDFs are defined by the bitstream and + * copied from default CDF tables for each symbol. + */ + FRAME_CONTEXT *default_frame_context; + + /*! + * Parameters related to tiling. + */ + CommonTileParams tiles; + + /*! + * External BufferPool passed from outside. + */ + BufferPool *buffer_pool; + + /*! + * Above context buffers and their sizes. + * Note: above contexts are allocated in this struct, as their size is + * dependent on frame width, while left contexts are declared and allocated in + * MACROBLOCKD struct, as they have a fixed size. + */ + CommonContexts above_contexts; + + /** + * \name Signaled when cm->seq_params->frame_id_numbers_present_flag == 1 + */ + /**@{*/ + int current_frame_id; /*!< frame ID for the current frame. */ + int ref_frame_id[REF_FRAMES]; /*!< frame IDs for the reference frames. */ + /**@}*/ + + /*! + * Motion vectors provided by motion field estimation. + * tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where: + * mi_row = 2 * row, + * mi_col = 2 * col, and + * stride = cm->mi_params.mi_stride / 2 + */ + TPL_MV_REF *tpl_mvs; + /*! + * Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function. + */ + int tpl_mvs_mem_size; + /*! + * ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and + * current frame is positive; and 0 otherwise. + */ + int ref_frame_sign_bias[REF_FRAMES]; + /*! + * ref_frame_side[k] is 1 if relative distance between reference 'k' and + * current frame is positive, -1 if relative distance is 0; and 0 otherwise. 
+ * TODO(jingning): This can be combined with sign_bias later. + */ + int8_t ref_frame_side[REF_FRAMES]; + + /*! + * Temporal layer ID of this frame + * (in the range 0 ... (number_temporal_layers - 1)). + */ + int temporal_layer_id; + + /*! + * Spatial layer ID of this frame + * (in the range 0 ... (number_spatial_layers - 1)). + */ + int spatial_layer_id; + +#if TXCOEFF_TIMER + int64_t cum_txcoeff_timer; + int64_t txcoeff_timer; + int txb_count; +#endif // TXCOEFF_TIMER + +#if TXCOEFF_COST_TIMER + int64_t cum_txcoeff_cost_timer; + int64_t txcoeff_cost_timer; + int64_t txcoeff_cost_count; +#endif // TXCOEFF_COST_TIMER +} AV1_COMMON; + +/*!\cond */ + +// TODO(hkuang): Don't need to lock the whole pool after implementing atomic +// frame reference count. +static void lock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +static void unlock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) { + if (index < 0 || index >= REF_FRAMES) return NULL; + if (cm->ref_frame_map[index] == NULL) return NULL; + return &cm->ref_frame_map[index]->buf; +} + +static INLINE int get_free_fb(AV1_COMMON *cm) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + lock_buffer_pool(cm->buffer_pool); + const int num_frame_bufs = cm->buffer_pool->num_frame_bufs; + for (i = 0; i < num_frame_bufs; ++i) + if (frame_bufs[i].ref_count == 0) break; + + if (i != num_frame_bufs) { + if (frame_bufs[i].buf.use_external_reference_buffers) { + // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the + // external reference buffers. Restore the buffer pointers to point to the + // internally allocated memory. 
+ YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf; + ybf->y_buffer = ybf->store_buf_adr[0]; + ybf->u_buffer = ybf->store_buf_adr[1]; + ybf->v_buffer = ybf->store_buf_adr[2]; + ybf->use_external_reference_buffers = 0; + } + + frame_bufs[i].ref_count = 1; + } else { + // We should never run out of free buffers. If this assertion fails, there + // is a reference leak. + assert(0 && "Ran out of free frame buffers. Likely a reference leak."); + // Reset i to be INVALID_IDX to indicate no free buffer found. + i = INVALID_IDX; + } + + unlock_buffer_pool(cm->buffer_pool); + return i; +} + +static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) { + // Release the previously-used frame-buffer + if (cm->cur_frame != NULL) { + --cm->cur_frame->ref_count; + cm->cur_frame = NULL; + } + + // Assign a new framebuffer + const int new_fb_idx = get_free_fb(cm); + if (new_fb_idx == INVALID_IDX) return NULL; + + cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx]; +#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + aom_invalidate_pyramid(cm->cur_frame->buf.y_pyramid); + av1_invalidate_corner_list(cm->cur_frame->buf.corners); +#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + av1_zero(cm->cur_frame->interp_filter_selected); + return cm->cur_frame; +} + +// Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref +// counts accordingly. +static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr, + RefCntBuffer *rhs_ptr) { + RefCntBuffer *const old_ptr = *lhs_ptr; + if (old_ptr != NULL) { + assert(old_ptr->ref_count > 0); + // One less reference to the buffer at 'old_ptr', so decrease ref count. + --old_ptr->ref_count; + } + + *lhs_ptr = rhs_ptr; + // One more reference to the buffer at 'rhs_ptr', so increase ref count. 
+ ++rhs_ptr->ref_count; +} + +static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) { + return cm->current_frame.frame_type == KEY_FRAME || + cm->current_frame.frame_type == INTRA_ONLY_FRAME; +} + +static INLINE int frame_is_sframe(const AV1_COMMON *cm) { + return cm->current_frame.frame_type == S_FRAME; +} + +// These functions take a reference frame label between LAST_FRAME and +// EXTREF_FRAME inclusive. Note that this is different to the indexing +// previously used by the frame_refs[] array. +static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm, + const MV_REFERENCE_FRAME ref_frame) { + return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME) + ? cm->remapped_ref_idx[ref_frame - LAST_FRAME] + : INVALID_IDX; +} + +static INLINE RefCntBuffer *get_ref_frame_buf( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Both const and non-const versions of this function are provided so that it +// can be used with a const AV1_COMMON if needed. +static INLINE const struct scale_factors *get_ref_scale_factors_const( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE struct scale_factors *get_ref_scale_factors( + AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? 
&cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE RefCntBuffer *get_primary_ref_frame_buf( + const AV1_COMMON *const cm) { + const int primary_ref_frame = cm->features.primary_ref_frame; + if (primary_ref_frame == PRIMARY_REF_NONE) return NULL; + const int map_idx = get_ref_frame_map_idx(cm, primary_ref_frame + 1); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Returns 1 if this frame might allow mvs from some reference frame. +static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && + cm->seq_params->order_hint_info.enable_ref_frame_mvs && + cm->seq_params->order_hint_info.enable_order_hint && + !frame_is_intra_only(cm); +} + +// Returns 1 if this frame might use warped_motion +static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) && + cm->seq_params->enable_warped_motion; +} + +static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { + const int buf_rows = buf->mi_rows; + const int buf_cols = buf->mi_cols; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + if (buf->mvs == NULL || buf_rows != mi_params->mi_rows || + buf_cols != mi_params->mi_cols) { + aom_free(buf->mvs); + buf->mi_rows = mi_params->mi_rows; + buf->mi_cols = mi_params->mi_cols; + CHECK_MEM_ERROR(cm, buf->mvs, + (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) * + ((mi_params->mi_cols + 1) >> 1), + sizeof(*buf->mvs))); + aom_free(buf->seg_map); + CHECK_MEM_ERROR( + cm, buf->seg_map, + (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols, + sizeof(*buf->seg_map))); + } + + const int mem_size = + ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1); + + if (cm->tpl_mvs == NULL || cm->tpl_mvs_mem_size < mem_size) { + aom_free(cm->tpl_mvs); + CHECK_MEM_ERROR(cm, cm->tpl_mvs, + (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs))); + 
cm->tpl_mvs_mem_size = mem_size; + } +} + +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); + +static INLINE int av1_num_planes(const AV1_COMMON *cm) { + return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE; +} + +static INLINE void av1_init_above_context(CommonContexts *above_contexts, + int num_planes, int tile_row, + MACROBLOCKD *xd) { + for (int i = 0; i < num_planes; ++i) { + xd->above_entropy_context[i] = above_contexts->entropy[i][tile_row]; + } + xd->above_partition_context = above_contexts->partition[tile_row]; + xd->above_txfm_context = above_contexts->txfm[tile_row]; +} + +static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) { + const int num_planes = av1_num_planes(cm); + const CommonQuantParams *const quant_params = &cm->quant_params; + + for (int i = 0; i < num_planes; ++i) { + if (xd->plane[i].plane_type == PLANE_TYPE_Y) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX, + sizeof(quant_params->y_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->y_iqmatrix, + sizeof(quant_params->y_iqmatrix)); + + } else { + if (i == AOM_PLANE_U) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->u_dequant_QTX, + sizeof(quant_params->u_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->u_iqmatrix, + sizeof(quant_params->u_iqmatrix)); + } else { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->v_dequant_QTX, + sizeof(quant_params->v_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->v_iqmatrix, + sizeof(quant_params->v_iqmatrix)); + } + } + } + xd->mi_stride = cm->mi_params.mi_stride; + xd->error_info = cm->error; + cfl_init(&xd->cfl, cm->seq_params); +} + +static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, + const int num_planes) { + int i; + int row_offset = mi_row; + int col_offset = mi_col; + for (i = 0; i < num_planes; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + // Offset the buffer pointer + const BLOCK_SIZE bsize 
= xd->mi[0]->bsize; + if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) + row_offset = mi_row - 1; + if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) + col_offset = mi_col - 1; + int above_idx = col_offset; + int left_idx = row_offset & MAX_MIB_MASK; + pd->above_entropy_context = + &xd->above_entropy_context[i][above_idx >> pd->subsampling_x]; + pd->left_entropy_context = + &xd->left_entropy_context[i][left_idx >> pd->subsampling_y]; + } +} + +static INLINE int calc_mi_size(int len) { + // len is in mi units. Align to a multiple of SBs. + return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2); +} + +static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, + const int num_planes) { + int i; + for (i = 0; i < num_planes; i++) { + xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x; + xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y; + + xd->plane[i].width = AOMMAX(xd->plane[i].width, 4); + xd->plane[i].height = AOMMAX(xd->plane[i].height, 4); + } +} + +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, int mi_col, int bw, + int mi_rows, int mi_cols) { + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE); + xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE)); + xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE); + + xd->mi_row = mi_row; + xd->mi_col = mi_col; + + // Are edges available for intra prediction? 
+ xd->up_available = (mi_row > tile->mi_row_start); + + const int ss_x = xd->plane[1].subsampling_x; + const int ss_y = xd->plane[1].subsampling_y; + + xd->left_available = (mi_col > tile->mi_col_start); + xd->chroma_up_available = xd->up_available; + xd->chroma_left_available = xd->left_available; + if (ss_x && bw < mi_size_wide[BLOCK_8X8]) + xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start; + if (ss_y && bh < mi_size_high[BLOCK_8X8]) + xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start; + if (xd->up_available) { + xd->above_mbmi = xd->mi[-xd->mi_stride]; + } else { + xd->above_mbmi = NULL; + } + + if (xd->left_available) { + xd->left_mbmi = xd->mi[-1]; + } else { + xd->left_mbmi = NULL; + } + + const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) && + ((mi_col & 0x01) || !(bw & 0x01) || !ss_x); + xd->is_chroma_ref = chroma_ref; + if (chroma_ref) { + // To help calculate the "above" and "left" chroma blocks, note that the + // current block may cover multiple luma blocks (e.g., if partitioned into + // 4x4 luma blocks). + // First, find the top-left-most luma block covered by this chroma block + MB_MODE_INFO **base_mi = + &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)]; + + // Then, we consider the luma region covered by the left or above 4x4 chroma + // prediction. We want to point to the chroma reference block in that + // region, which is the bottom-right-most mi unit. + // This leads to the following offsets: + MB_MODE_INFO *chroma_above_mi = + xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL; + xd->chroma_above_mbmi = chroma_above_mi; + + MB_MODE_INFO *chroma_left_mi = + xd->chroma_left_available ? 
base_mi[ss_y * xd->mi_stride - 1] : NULL; + xd->chroma_left_mbmi = chroma_left_mi; + } + + xd->height = bh; + xd->width = bw; + + xd->is_last_vertical_rect = 0; + if (xd->width < xd->height) { + if (!((mi_col + xd->width) & (xd->height - 1))) { + xd->is_last_vertical_rect = 1; + } + } + + xd->is_first_horizontal_rect = 0; + if (xd->width > xd->height) + if (!(mi_row & (xd->width - 1))) xd->is_first_horizontal_rect = 1; +} + +static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi) { + const PREDICTION_MODE above = av1_above_block_mode(above_mi); + const PREDICTION_MODE left = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[above]; + const int left_ctx = intra_mode_context[left]; + return tile_ctx->kf_y_cdf[above_ctx][left_ctx]; +} + +static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE subsize, + BLOCK_SIZE bsize) { + PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col; + PARTITION_CONTEXT *const left_ctx = + xd->left_partition_context + (mi_row & MAX_MIB_MASK); + + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + memset(above_ctx, partition_context_lookup[subsize].above, bw); + memset(left_ctx, partition_context_lookup[subsize].left, bh); +} + +static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize, + int subsampling_x, int subsampling_y) { + assert(bsize < BLOCK_SIZES_ALL); + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) && + ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x); + return ref_pos; +} + +static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf, + size_t element) { + assert(cdf != NULL); + return (element > 0 ? 
cdf[element - 1] : CDF_PROB_TOP) - cdf[element]; +} + +static INLINE void partition_gather_horz_alike(aom_cdf_prob *out, + const aom_cdf_prob *const in, + BLOCK_SIZE bsize) { + (void)bsize; + out[0] = CDF_PROB_TOP; + out[0] -= cdf_element_prob(in, PARTITION_HORZ); + out[0] -= cdf_element_prob(in, PARTITION_SPLIT); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_B); + out[0] -= cdf_element_prob(in, PARTITION_VERT_A); + if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4); + out[0] = AOM_ICDF(out[0]); + out[1] = AOM_ICDF(CDF_PROB_TOP); +} + +static INLINE void partition_gather_vert_alike(aom_cdf_prob *out, + const aom_cdf_prob *const in, + BLOCK_SIZE bsize) { + (void)bsize; + out[0] = CDF_PROB_TOP; + out[0] -= cdf_element_prob(in, PARTITION_VERT); + out[0] -= cdf_element_prob(in, PARTITION_SPLIT); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); + out[0] -= cdf_element_prob(in, PARTITION_VERT_A); + out[0] -= cdf_element_prob(in, PARTITION_VERT_B); + if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4); + out[0] = AOM_ICDF(out[0]); + out[1] = AOM_ICDF(CDF_PROB_TOP); +} + +static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE subsize, + BLOCK_SIZE bsize, + PARTITION_TYPE partition) { + if (bsize >= BLOCK_8X8) { + const int hbs = mi_size_wide[bsize] / 2; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + switch (partition) { + case PARTITION_SPLIT: + if (bsize != BLOCK_8X8) break; + AOM_FALLTHROUGH_INTENDED; + case PARTITION_NONE: + case PARTITION_HORZ: + case PARTITION_VERT: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: + update_partition_context(xd, mi_row, mi_col, subsize, bsize); + break; + case PARTITION_HORZ_A: + update_partition_context(xd, mi_row, mi_col, bsize2, subsize); + update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize); + break; + case PARTITION_HORZ_B: + 
update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize); + break; + case PARTITION_VERT_A: + update_partition_context(xd, mi_row, mi_col, bsize2, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize); + break; + case PARTITION_VERT_B: + update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize); + break; + default: assert(0 && "Invalid partition type"); + } + } +} + +static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col; + const PARTITION_CONTEXT *left_ctx = + xd->left_partition_context + (mi_row & MAX_MIB_MASK); + // Minimum partition point is 8x8. Offset the bsl accordingly. + const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8]; + int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; + + assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]); + assert(bsl >= 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + +// Return the number of elements in the partition CDF when +// partitioning the (square) block with luma block size of bsize. +static INLINE int partition_cdf_length(BLOCK_SIZE bsize) { + if (bsize <= BLOCK_8X8) + return PARTITION_TYPES; + else if (bsize == BLOCK_128X128) + return EXT_PARTITION_TYPES - 2; + else + return EXT_PARTITION_TYPES; +} + +static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + assert(bsize < BLOCK_SIZES_ALL); + int max_blocks_wide = block_size_wide[bsize]; + + if (xd->mb_to_right_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); + } + + // Scale the width in the transform block unit. 
+ return max_blocks_wide >> MI_SIZE_LOG2; +} + +static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + int max_blocks_high = block_size_high[bsize]; + + if (xd->mb_to_bottom_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); + } + + // Scale the height in the transform block unit. + return max_blocks_high >> MI_SIZE_LOG2; +} + +static INLINE void av1_zero_above_context(AV1_COMMON *const cm, + const MACROBLOCKD *xd, + int mi_col_start, int mi_col_end, + const int tile_row) { + const SequenceHeader *const seq_params = cm->seq_params; + const int num_planes = av1_num_planes(cm); + const int width = mi_col_end - mi_col_start; + const int aligned_width = + ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2); + const int offset_y = mi_col_start; + const int width_y = aligned_width; + const int offset_uv = offset_y >> seq_params->subsampling_x; + const int width_uv = width_y >> seq_params->subsampling_x; + CommonContexts *const above_contexts = &cm->above_contexts; + + av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y); + if (num_planes > 1) { + if (above_contexts->entropy[1][tile_row] && + above_contexts->entropy[2][tile_row]) { + av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv, + width_uv); + av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv, + width_uv); + } else { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid value of planes"); + } + } + + av1_zero_array(above_contexts->partition[tile_row] + mi_col_start, + aligned_width); + + memset(above_contexts->txfm[tile_row] + mi_col_start, + tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT)); +} + +static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) { + av1_zero(xd->left_entropy_context); + av1_zero(xd->left_partition_context); + + memset(xd->left_txfm_context_buffer, 
tx_size_high[TX_SIZES_LARGEST], + sizeof(xd->left_txfm_context_buffer)); +} + +static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { + int i; + for (i = 0; i < len; ++i) txfm_ctx[i] = txs; +} + +static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip, + const MACROBLOCKD *xd) { + uint8_t bw = tx_size_wide[tx_size]; + uint8_t bh = tx_size_high[tx_size]; + + if (skip) { + bw = n4_w * MI_SIZE; + bh = n4_h * MI_SIZE; + } + + set_txfm_ctx(xd->above_txfm_context, bw, n4_w); + set_txfm_ctx(xd->left_txfm_context, bh, n4_h); +} + +static INLINE int get_mi_grid_idx(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col) { + return mi_row * mi_params->mi_stride + mi_col; +} + +static INLINE int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col) { + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int mi_alloc_row = mi_row / mi_alloc_size_1d; + const int mi_alloc_col = mi_col / mi_alloc_size_1d; + + return mi_alloc_row * mi_params->mi_alloc_stride + mi_alloc_col; +} + +// For this partition block, set pointers in mi_params->mi_grid_base and xd->mi. +static INLINE void set_mi_offsets(const CommonModeInfoParams *const mi_params, + MACROBLOCKD *const xd, int mi_row, + int mi_col) { + // 'mi_grid_base' should point to appropriate memory in 'mi'. + const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); + const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col); + mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx]; + // 'xd->mi' should point to an offset in 'mi_grid_base'; + xd->mi = mi_params->mi_grid_base + mi_grid_idx; + // 'xd->tx_type_map' should point to an offset in 'mi_params->tx_type_map'. 
+ xd->tx_type_map = mi_params->tx_type_map + mi_grid_idx; + xd->tx_type_map_stride = mi_params->mi_stride; +} + +static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx, + TXFM_CONTEXT *left_ctx, + TX_SIZE tx_size, TX_SIZE txb_size) { + BLOCK_SIZE bsize = txsize_to_bsize[txb_size]; + int bh = mi_size_high[bsize]; + int bw = mi_size_wide[bsize]; + uint8_t txw = tx_size_wide[tx_size]; + uint8_t txh = tx_size_high[tx_size]; + int i; + for (i = 0; i < bh; ++i) left_ctx[i] = txh; + for (i = 0; i < bw; ++i) above_ctx[i] = txw; +} + +static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) { + switch (tx_dim) { + case 128: + case 64: return TX_64X64; break; + case 32: return TX_32X32; break; + case 16: return TX_16X16; break; + case 8: return TX_8X8; break; + default: return TX_4X4; + } +} + +static INLINE TX_SIZE get_tx_size(int width, int height) { + if (width == height) { + return get_sqr_tx_size(width); + } + if (width < height) { + if (width + width == height) { + switch (width) { + case 4: return TX_4X8; break; + case 8: return TX_8X16; break; + case 16: return TX_16X32; break; + case 32: return TX_32X64; break; + } + } else { + switch (width) { + case 4: return TX_4X16; break; + case 8: return TX_8X32; break; + case 16: return TX_16X64; break; + } + } + } else { + if (height + height == width) { + switch (height) { + case 4: return TX_8X4; break; + case 8: return TX_16X8; break; + case 16: return TX_32X16; break; + case 32: return TX_64X32; break; + } + } else { + switch (height) { + case 4: return TX_16X4; break; + case 8: return TX_32X8; break; + case 16: return TX_64X16; break; + } + } + } + assert(0); + return TX_4X4; +} + +static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx, + const TXFM_CONTEXT *const left_ctx, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + const uint8_t txw = tx_size_wide[tx_size]; + const uint8_t txh = tx_size_high[tx_size]; + const int above = *above_ctx < txw; + const int left = *left_ctx < txh; + int category = 
TXFM_PARTITION_CONTEXTS; + + // dummy return, not used by others. + if (tx_size <= TX_4X4) return 0; + + TX_SIZE max_tx_size = + get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize])); + + if (max_tx_size >= TX_8X8) { + category = + (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) + + (TX_SIZES - 1 - max_tx_size) * 2; + } + assert(category != TXFM_PARTITION_CONTEXTS); + return category * 3 + above + left; +} + +// Compute the next partition in the direction of the sb_type stored in the mi +// array, starting with bsize. +static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) + return PARTITION_INVALID; + + const int offset = mi_row * mi_params->mi_stride + mi_col; + MB_MODE_INFO **mi = mi_params->mi_grid_base + offset; + const BLOCK_SIZE subsize = mi[0]->bsize; + + assert(bsize < BLOCK_SIZES_ALL); + + if (subsize == bsize) return PARTITION_NONE; + + const int bhigh = mi_size_high[bsize]; + const int bwide = mi_size_wide[bsize]; + const int sshigh = mi_size_high[subsize]; + const int sswide = mi_size_wide[subsize]; + + if (bsize > BLOCK_8X8 && mi_row + bhigh / 2 < mi_params->mi_rows && + mi_col + bwide / 2 < mi_params->mi_cols) { + // In this case, the block might be using an extended partition + // type. + const MB_MODE_INFO *const mbmi_right = mi[bwide / 2]; + const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride]; + + if (sswide == bwide) { + // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or + // PARTITION_HORZ_B. To distinguish the latter two, check if the lower + // half was split.
+ if (sshigh * 4 == bhigh) return PARTITION_HORZ_4; + assert(sshigh * 2 == bhigh); + + if (mbmi_below->bsize == subsize) + return PARTITION_HORZ; + else + return PARTITION_HORZ_B; + } else if (sshigh == bhigh) { + // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or + // PARTITION_VERT_B. To distinguish the latter two, check if the right + // half was split. + if (sswide * 4 == bwide) return PARTITION_VERT_4; + assert(sswide * 2 == bwide); + + if (mbmi_right->bsize == subsize) + return PARTITION_VERT; + else + return PARTITION_VERT_B; + } else { + // Smaller width and smaller height. Might be PARTITION_SPLIT or could be + // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both + // dimensions, we immediately know this is a split (which will recurse to + // get to subsize). Otherwise look down and to the right. With + // PARTITION_VERT_A, the right block will have height bhigh; with + // PARTITION_HORZ_A, the lower block will have width bwide. Otherwise + // it's PARTITION_SPLIT. + if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT; + + if (mi_size_wide[mbmi_below->bsize] == bwide) return PARTITION_HORZ_A; + if (mi_size_high[mbmi_right->bsize] == bhigh) return PARTITION_VERT_A; + + return PARTITION_SPLIT; + } + } + const int vert_split = sswide < bwide; + const int horz_split = sshigh < bhigh; + const int split_idx = (vert_split << 1) | horz_split; + assert(split_idx != 0); + + static const PARTITION_TYPE base_partitions[4] = { + PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT + }; + + return base_partitions[split_idx]; +} + +static INLINE void set_sb_size(SequenceHeader *const seq_params, + BLOCK_SIZE sb_size) { + seq_params->sb_size = sb_size; + seq_params->mib_size = mi_size_wide[seq_params->sb_size]; + seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size]; +} + +// Returns true if the frame is fully lossless at the coded resolution.
+// Note: If super-resolution is used, such a frame will still NOT be lossless at +// the upscaled resolution. +static INLINE int is_coded_lossless(const AV1_COMMON *cm, + const MACROBLOCKD *xd) { + int coded_lossless = 1; + if (cm->seg.enabled) { + for (int i = 0; i < MAX_SEGMENTS; ++i) { + if (!xd->lossless[i]) { + coded_lossless = 0; + break; + } + } + } else { + coded_lossless = xd->lossless[0]; + } + return coded_lossless; +} + +static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) { + return seq_level_idx == SEQ_LEVEL_MAX || + (seq_level_idx < SEQ_LEVELS && + // The following levels are currently undefined. + seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 && + seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 && + seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3 +#if !CONFIG_CWG_C013 + && seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 && + seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3 && + seq_level_idx != SEQ_LEVEL_8_0 && seq_level_idx != SEQ_LEVEL_8_1 && + seq_level_idx != SEQ_LEVEL_8_2 && seq_level_idx != SEQ_LEVEL_8_3 +#endif + ); +} + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_AV1_COMMON_INT_H_ diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c new file mode 100644 index 0000000000..8d69efcd2d --- /dev/null +++ b/third_party/aom/av1/common/av1_inv_txfm1d.c @@ -0,0 +1,1841 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_txfm.h" + +void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 4; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[4]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[2]; + bf1[2] = input[1]; + bf1[3] = input[3]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); +} + +void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 8; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[4]; + bf1[2] = input[2]; + bf1[3] = input[6]; + bf1[4] = input[1]; + bf1[5] = input[5]; + bf1[6] = input[3]; + bf1[7] = input[7]; + av1_range_check_buf(stage, input, bf1, size, 
stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = 
clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); +} + +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 16; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[8]; + bf1[2] = input[4]; + bf1[3] = input[12]; + bf1[4] = input[2]; + bf1[5] = input[10]; + bf1[6] = input[6]; + bf1[7] = input[14]; + bf1[8] = input[1]; + bf1[9] = input[9]; + bf1[10] = input[5]; + bf1[11] = input[13]; + bf1[12] = input[3]; + bf1[13] = input[11]; + bf1[14] = input[7]; + bf1[15] = input[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], 
-cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + 
bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + 
bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); +} + +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 32; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[32]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[16]; + bf1[2] = input[8]; + bf1[3] = input[24]; + bf1[4] = input[4]; + bf1[5] = input[20]; + bf1[6] = input[12]; + bf1[7] = input[28]; + bf1[8] = input[2]; + bf1[9] = input[18]; + bf1[10] = input[10]; + bf1[11] = input[26]; + bf1[12] = input[6]; + bf1[13] = input[22]; + bf1[14] = input[14]; + bf1[15] = input[30]; + bf1[16] = input[1]; + bf1[17] = input[17]; + bf1[18] = input[9]; + bf1[19] = input[25]; + bf1[20] = input[5]; + bf1[21] = input[21]; + bf1[22] = input[13]; + bf1[23] = input[29]; + bf1[24] = input[3]; + bf1[25] = input[19]; + bf1[26] = input[11]; + bf1[27] = input[27]; + bf1[28] = input[7]; + bf1[29] = input[23]; + bf1[30] = input[15]; + bf1[31] = input[31]; + av1_range_check_buf(stage, input, bf1, size, 
stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); + bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], 
-cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]); + bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); + bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); + bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); + bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); + bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]); + bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]); + bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); + bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], 
stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + 
bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); + bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); + bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]); + bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]); + bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); + bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); + bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = 
clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); + bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] 
+ bf0[23], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]); + bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]); + bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); + bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); + bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = 
clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); + bf1[16] = 
clamp_value(bf0[15] - bf0[16], stage_range[stage]);
  bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
  bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
  bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
  bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
  bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
  bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
  bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
  bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
  bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
  bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
  bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
  bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
  bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
  bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
  bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
}

// Four-point transform over input[0..3]; the name suggests the inverse ADST.
//
// input       - four coefficients, read once into x0..x3.
// output      - receives four results; set to all zeros when every input is 0.
// cos_bit     - selects the sinpi_arr() table and is the shift applied by the
//               final round_shift().
// stage_range - entries [1]..[6] give the bit budget handed to
//               range_check_value() for the corresponding stage.
//
// Products with the sinpi table are kept unshifted through all stages and are
// only brought back down by round_shift(x, bit) when written to output.
void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  int bit = cos_bit;
  const int32_t *sinpi = sinpi_arr(bit);
  int32_t s0, s1, s2, s3, s4, s5, s6, s7;

  int32_t x0 = input[0];
  int32_t x1 = input[1];
  int32_t x2 = input[2];
  int32_t x3 = input[3];

  // Shortcut: an all-zero input produces an all-zero output.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  // Table identity checked in debug builds.
  // NOTE(review): presumably the stages below rely on it - confirm.
  assert(sinpi[1] + sinpi[2] == sinpi[4]);

  // stage 1
  s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit);
  s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit);
  s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit);
  s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit);
  s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit);
  s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit);
  s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit);

  // stage 2
  // NOTICE: (x0 - x2) here may use one extra bit compared to the
  // opt_range_row/col specified in av1_gen_inv_stage_range()
  // s7 has not been multiplied by a sinpi entry yet, so its budget is
  // stage_range[2] without the extra `bit`.
  s7 = range_check_value((x0 - x2) + x3, stage_range[2]);

  // stage 3
  s0 = range_check_value(s0 + s3, stage_range[3] + bit);
  s1 = range_check_value(s1 - s4, stage_range[3] + bit);
  // s3 takes over the old s2 before s2 is overwritten on the next line.
  s3 = range_check_value(s2, stage_range[3] + bit);
  s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit);

  // stage 4
  s0 = range_check_value(s0 + s5, stage_range[4] + bit);
  s1 = range_check_value(s1 - s6, stage_range[4] + bit);

  // stage 5
  x0 = range_check_value(s0 + s3, stage_range[5] + bit);
  x1 = range_check_value(s1 + s3, stage_range[5] + bit);
  x2 = range_check_value(s2, stage_range[5] + bit);
  x3 = range_check_value(s0 + s1, stage_range[5] + bit);

  // stage 6
  x3 = range_check_value(x3 - s3, stage_range[6] + bit);

  output[0] = round_shift(x0, bit);
  output[1] = round_shift(x1, bit);
  output[2] = round_shift(x2, bit);
  output[3] = round_shift(x3, bit);
}

// Eight-point transform over input[0..7]; the name suggests the inverse ADST.
//
// input       - eight coefficients, read only in stage 1.
// output      - receives the eight results; also used as a working buffer.
// cos_bit     - selects the cospi_arr() table and is forwarded to half_btf().
// stage_range - entries [1]..[6] are passed to clamp_value() and
//               av1_range_check_buf() for the matching stage.
//
// The stages ping-pong between `output` and the local `step` array: odd stages
// write `output`, even stages write `step`. Stage 1 writes `output` while it
// is still reading `input`, so the two must not be the same buffer (asserted).
void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 8;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[8];

  // stage 0;

  // stage 1;
  // Input permutation.
  stage++;
  bf1 = output;
  bf1[0] = input[7];
  bf1[1] = input[0];
  bf1[2] = input[5];
  bf1[3] = input[2];
  bf1[4] = input[3];
  bf1[5] = input[4];
  bf1[6] = input[1];
  bf1[7] = input[6];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2
  // half_btf() on adjacent pairs (0,1), (2,3), (4,5), (6,7).
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3
  // Clamped sums and differences of elements i and i + 4.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
  bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4
  // Elements 0..3 pass through; 4..7 go through half_btf().
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5
  // Clamped sums and differences of elements i and i + 2 within each half.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
  bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
  bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6
  // cospi[32] half_btf() on pairs (2,3) and (6,7); the rest pass through.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7
  // Final reorder into `output`, negating every odd-indexed result. `stage`
  // is not advanced and no range check is run for this stage.
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = -bf0[4];
  bf1[2] = bf0[6];
  bf1[3] = -bf0[2];
  bf1[4] = bf0[3];
  bf1[5] = -bf0[7];
  bf1[6] = bf0[5];
  bf1[7] = -bf0[1];
}

// Sixteen-point transform over input[0..15]; the name suggests the inverse
// ADST.
//
// input       - sixteen coefficients, read only in stage 1.
// output      - receives the sixteen results; also used as a working buffer.
// cos_bit     - selects the cospi_arr() table and is forwarded to half_btf().
// stage_range - entries [1]..[8] are passed to clamp_value() and
//               av1_range_check_buf() for the matching stage.
//
// The stages ping-pong between `output` and the local `step` array: odd stages
// write `output`, even stages write `step`. Stage 1 writes `output` while it
// is still reading `input`, so the two must not be the same buffer (asserted).
void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                 const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 16;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[16];

  // stage 0;

  // stage 1;
  // Input permutation.
  stage++;
  bf1 = output;
  bf1[0] = input[15];
  bf1[1] = input[0];
  bf1[2] = input[13];
  bf1[3] = input[2];
  bf1[4] = input[11];
  bf1[5] = input[4];
  bf1[6] = input[9];
  bf1[7] = input[6];
  bf1[8] = input[7];
  bf1[9] = input[8];
  bf1[10] = input[5];
  bf1[11] = input[10];
  bf1[12] = input[3];
  bf1[13] = input[12];
  bf1[14] = input[1];
  bf1[15] = input[14];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2
  // half_btf() on each adjacent pair (2k, 2k + 1).
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
  bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
  bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
  bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3
  // Clamped sums and differences of elements i and i + 8.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]);
  bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]);
  bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]);
  bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]);
  bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]);
  bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]);
  bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4
  // Elements 0..7 pass through; 8..15 go through half_btf().
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5
  // Clamped sums and differences of elements i and i + 4 within each half.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
  bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
  bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]);
  bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]);
  bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]);
  bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]);
  bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6
  // Elements 0..3 and 8..11 pass through; 4..7 and 12..15 use half_btf().
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = bf0[10];
  bf1[11] = bf0[11];
  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7
  // Clamped sums and differences of elements i and i + 2 within each quarter.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
  bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
  bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
  bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]);
  bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]);
  bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
  bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 8
  // cospi[32] half_btf() on pairs (2,3), (6,7), (10,11), (14,15); the rest
  // pass through.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
  bf1[12] = bf0[12];
  bf1[13] = bf0[13];
  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 9
  // Final reorder into `output`, negating every odd-indexed result. `stage`
  // is not advanced and no range check is run for this stage.
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = -bf0[8];
  bf1[2] = bf0[12];
  bf1[3] = -bf0[4];
  bf1[4] = bf0[6];
  bf1[5] = -bf0[14];
  bf1[6] = bf0[10];
  bf1[7] = -bf0[2];
  bf1[8] = bf0[3];
  bf1[9] = -bf0[11];
  bf1[10] = bf0[15];
  bf1[11] = -bf0[7];
  bf1[12] = bf0[5];
  bf1[13] = -bf0[13];
  bf1[14] = bf0[9];
  bf1[15] = -bf0[1];
}

// Four-point identity transform: each output is
// round_shift(NewSqrt2 * input[i], NewSqrt2Bits), with the product formed in
// 64 bits. NOTE(review): NewSqrt2 is presumably a fixed-point sqrt(2) with
// NewSqrt2Bits fractional bits - confirm against its definition.
// cos_bit is unused.
void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  (void)cos_bit;
  // stage_range is only read by the assert below; the cast presumably keeps
  // builds with assert() compiled out free of unused-parameter warnings.
  (void)stage_range;
  for (int i = 0; i < 4; ++i) {
    output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits);
  }
  assert(stage_range[0] + NewSqrt2Bits <= 32);
}

// Eight-point identity transform: each output is input[i] * 2, computed in
// 64 bits and converted back to int32_t. cos_bit and stage_range are unused.
void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;
  for (int i = 0; i < 8; ++i) output[i] = (int32_t)((int64_t)input[i] * 2);
}

// Sixteen-point identity transform: each output is
// round_shift(NewSqrt2 * 2 * input[i], NewSqrt2Bits), with the product formed
// in 64 bits. cos_bit is unused.
void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
  (void)cos_bit;
  // stage_range is only read by the assert below; the cast presumably keeps
  // builds with assert() compiled out free of unused-parameter warnings.
  (void)stage_range;
  for (int i = 0; i < 16; ++i)
    output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits);
  assert(stage_range[0] + NewSqrt2Bits <= 32);
}

// Thirty-two-point identity transform: each output is input[i] * 4, computed
// in 64 bits and converted back to int32_t. cos_bit and stage_range are
// unused.
void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;
  for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4);
}

void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  assert(output !=
input); + const int32_t size = 64; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[64]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[32]; + bf1[2] = input[16]; + bf1[3] = input[48]; + bf1[4] = input[8]; + bf1[5] = input[40]; + bf1[6] = input[24]; + bf1[7] = input[56]; + bf1[8] = input[4]; + bf1[9] = input[36]; + bf1[10] = input[20]; + bf1[11] = input[52]; + bf1[12] = input[12]; + bf1[13] = input[44]; + bf1[14] = input[28]; + bf1[15] = input[60]; + bf1[16] = input[2]; + bf1[17] = input[34]; + bf1[18] = input[18]; + bf1[19] = input[50]; + bf1[20] = input[10]; + bf1[21] = input[42]; + bf1[22] = input[26]; + bf1[23] = input[58]; + bf1[24] = input[6]; + bf1[25] = input[38]; + bf1[26] = input[22]; + bf1[27] = input[54]; + bf1[28] = input[14]; + bf1[29] = input[46]; + bf1[30] = input[30]; + bf1[31] = input[62]; + bf1[32] = input[1]; + bf1[33] = input[33]; + bf1[34] = input[17]; + bf1[35] = input[49]; + bf1[36] = input[9]; + bf1[37] = input[41]; + bf1[38] = input[25]; + bf1[39] = input[57]; + bf1[40] = input[5]; + bf1[41] = input[37]; + bf1[42] = input[21]; + bf1[43] = input[53]; + bf1[44] = input[13]; + bf1[45] = input[45]; + bf1[46] = input[29]; + bf1[47] = input[61]; + bf1[48] = input[3]; + bf1[49] = input[35]; + bf1[50] = input[19]; + bf1[51] = input[51]; + bf1[52] = input[11]; + bf1[53] = input[43]; + bf1[54] = input[27]; + bf1[55] = input[59]; + bf1[56] = input[7]; + bf1[57] = input[39]; + bf1[58] = input[23]; + bf1[59] = input[55]; + bf1[60] = input[15]; + bf1[61] = input[47]; + bf1[62] = input[31]; + bf1[63] = input[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + 
bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = bf0[21]; + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = bf0[26]; + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit); + bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit); + bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit); + bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit); + bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit); + bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit); + bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit); + bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit); + bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit); + bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit); + bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit); + bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit); + bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit); + bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit); + bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit); + bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit); + bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit); + bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], 
bf0[53], cos_bit); + bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit); + bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit); + bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit); + bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit); + bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit); + bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit); + bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit); + bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit); + bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[10], bf0[20], 
cospi[54], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); + bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); + bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]); + bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]); + bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]); + bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]); + bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]); + bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]); + bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]); + bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]); + bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]); + bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]); + bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]); + bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]); + bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]); + bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]); + bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]); + bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]); + bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]); + bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]); + bf1[58] = 
clamp_value(-bf0[58] + bf0[59], stage_range[stage]); + bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]); + bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]); + bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]); + bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]); + bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); + bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); + bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); + bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); + bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]); + bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); + bf1[28] = 
clamp_value(bf0[28] + bf0[29], stage_range[stage]); + bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); + bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); + bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); + bf1[43] = bf0[43]; + bf1[44] = bf0[44]; + bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit); + bf1[51] = bf0[51]; + bf1[52] = bf0[52]; + bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit); + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit); + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit); + bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit); + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + 
bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]); + bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]); + bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]); + bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]); + bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]); + bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[40] + 
bf0[43], stage_range[stage]); + bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]); + bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]); + bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]); + bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]); + bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]); + bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]); + bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]); + bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]); + bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]); + bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]); + bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]); + bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]); + bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]); + bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]); + bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - 
bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); + bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); + bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]); + bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]); + bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); + bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); + bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); + bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); + bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = bf0[41]; + bf1[42] = half_btf(-cospi[40], 
bf0[42], cospi[24], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit); + bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit); + bf1[54] = bf0[54]; + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit); + bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit); + bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit); + bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit); + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + 
bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); + bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]); + bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]); + bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]); + bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]); + bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]); + bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]); + bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]); + bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]); + bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]); + bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]); + bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]); + 
bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]); + bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]); + bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]); + bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]); + bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]); + bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]); + bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]); + bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]); + bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[22], 
stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]); + bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]); + bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); + bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); + bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); + bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); + bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); + bf1[44] = bf0[44]; + bf1[45] = bf0[45]; + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = bf0[50]; + bf1[51] = bf0[51]; + bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit); + bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit); + bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit); + bf1[55] = 
half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit); + bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit); + bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit); + bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit); + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[25] 
= half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]); + bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]); + bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]); + bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]); + bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]); + bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]); + bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]); + bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]); + bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]); + bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]); + bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]); + bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]); + bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]); + bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]); + bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]); + bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]); + bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]); + bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]); + bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]); + bf1[58] = clamp_value(bf0[53] + 
bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 10 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); + bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]); + bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]); + bf1[24] 
= clamp_value(bf0[7] - bf0[24], stage_range[stage]); + bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = bf0[37]; + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = bf0[58]; + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + 
av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 11 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]); + bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]); + bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]); + bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]); + bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]); + bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]); + bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]); + bf1[29] = clamp_value(bf0[29] + 
bf0[34], stage_range[stage]); + bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]); + bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]); + bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]); + bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]); + bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]); + bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]); + bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]); + bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]); + bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]); + bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]); + bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]); + bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]); + bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]); + bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]); + bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]); + bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[3] - bf0[60], 
stage_range[stage]); + bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]); +} diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h new file mode 100644 index 0000000000..e1d5d98d10 --- /dev/null +++ b/third_party/aom/av1/common/av1_inv_txfm1d.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ + +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Saturates value to the range of a signed bit-wide integer, i.e. [-(2^(bit-1)), 2^(bit-1) - 1], via clamp64(). A non-positive bit is treated as invalid and value is returned unchanged. */ static INLINE int32_t clamp_value(int32_t value, int8_t bit) { + if (bit <= 0) return value; // Do nothing for invalid clamp bit. 
+ /* Limits of a signed bit-wide integer; the shift is done on a long long (1LL) and the limits are held in int64_t before clamp64() is applied. */ const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + return (int32_t)clamp64(value, min_value, max_value); +} + +/* Applies clamp_value() in place to the first size entries of buf, using the same bit for every entry. */ static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) { + for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit); +} + +/* Prototypes of the 1-D inverse transforms. All share one signature: read from input, write to output, and take a cos_bit and a stage_range array (in av1_idct64 each stage clamps with stage_range[stage]). */ void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h new file mode 100644 index 0000000000..b4f7801295 --- /dev/null +++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ +#include "av1/common/av1_inv_txfm1d.h" + +/* Table indexed by TX_SIZE (TX_SIZES_ALL entries); av1_gen_inv_stage_range() reads inv_start_range[tx_size] as its fwd_shift term when computing the real stage range. */ // sum of fwd_shift_## +static const int8_t inv_start_range[TX_SIZES_ALL] = { + 5, // 4x4 transform + 6, // 8x8 transform + 7, // 16x16 transform + 7, // 32x32 transform + 7, // 64x64 transform + 5, // 4x8 transform + 5, // 8x4 transform + 6, // 8x16 transform + 6, // 16x8 transform + 6, // 16x32 transform + 6, // 32x16 transform + 6, // 32x64 transform + 6, // 64x32 transform + 6, // 4x16 transform + 6, // 16x4 transform + 7, // 8x32 transform + 7, // 32x8 transform + 7, // 16x64 transform + 7, // 64x16 transform +}; + +/* One pointer per TX_SIZE to a two-entry shift table; the array itself is defined in av1_inv_txfm2d.c. */ extern const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL]; + +// Values in both av1_inv_cos_bit_col and av1_inv_cos_bit_row are always 12 +// for each valid row and col combination +#define INV_COS_BIT 12 + +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ diff --git a/third_party/aom/av1/common/av1_inv_txfm2d.c b/third_party/aom/av1/common/av1_inv_txfm2d.c new file mode 100644 index 0000000000..ee67dffe23 --- /dev/null +++ b/third_party/aom/av1/common/av1_inv_txfm2d.c @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" + +/* Inverse 4x4 Walsh-Hadamard transform of the 16 coefficients in input, added into the destination block. dest8 is converted with CONVERT_TO_SHORTPTR() and accessed as uint16_t; stride is the element distance between the four outputs written per pass-2 iteration; bd is forwarded to range_check_value() (as bd + 1) and to highbd_clip_pixel_add(). The same butterfly is run twice, with a 16-entry local scratch array between the two passes. */ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + int i; + tran_low_t output[16]; + tran_low_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + /* Pass 1: each iteration takes four inputs spaced 4 apart, shifted right by UNIT_QUANT_SHIFT, and stores the results at the same spacing in the scratch array; ip and op advance by one. */ for (i = 0; i < 4; i++) { + a1 = ip[4 * 0] >> UNIT_QUANT_SHIFT; + c1 = ip[4 * 1] >> UNIT_QUANT_SHIFT; + d1 = ip[4 * 2] >> UNIT_QUANT_SHIFT; + b1 = ip[4 * 3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + + op[4 * 0] = a1; + op[4 * 1] = b1; + op[4 * 2] = c1; + op[4 * 3] = d1; + ip++; + op++; + } + + ip = output; + /* Pass 2: each iteration takes four consecutive scratch values (ip advances by 4), range-checks the results and adds them, clipped, to four destination samples stride apart; dest advances by one. */ for (i = 0; i < 4; i++) { + a1 = ip[0]; + c1 = ip[1]; + d1 = ip[2]; + b1 = ip[3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + + range_check_value(a1, bd + 1); + range_check_value(b1, bd + 1); + range_check_value(c1, bd + 1); + range_check_value(d1, bd + 1); + + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd); + + ip += 4; + dest++; + } +} + +/* Variant that reads only in[0] (shifted right by UNIT_QUANT_SHIFT), derives a four-entry temporary from it, and adds values computed from that temporary, clipped, into a 4x4 area of the destination. dest8 is converted with CONVERT_TO_SHORTPTR(); dest_stride is the element distance between the four outputs of one iteration. */ void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_low_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = 
tmp; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + (void)bd; /* NOTE(review): bd is still passed to highbd_clip_pixel_add() below, so this cast looks redundant - confirm before removing. */ + + /* Split the shifted first coefficient into a1 - (a1 >> 1) and a1 >> 1; the former goes to tmp[0], the latter to tmp[1..3]. */ a1 = ip[0 * 4] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = a1; + op[1] = op[2] = op[3] = e1; + + ip = tmp; + /* For each of the four temporaries, split it the same way and add the two parts to four destination samples dest_stride apart: the remainder part to the first, the halved part to the other three. */ for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = + highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = + highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = + highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = + highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); + ip++; + dest++; + } +} + +/* Maps a TXFM_TYPE to the matching 1-D inverse transform function (DCT 4..64, ADST 4..16, identity 4..32). Any other value hits assert(0) and, if execution continues, returns NULL. */ static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT4: return av1_idct4; + case TXFM_TYPE_DCT8: return av1_idct8; + case TXFM_TYPE_DCT16: return av1_idct16; + case TXFM_TYPE_DCT32: return av1_idct32; + case TXFM_TYPE_DCT64: return av1_idct64; + case TXFM_TYPE_ADST4: return av1_iadst4; + case TXFM_TYPE_ADST8: return av1_iadst8; + case TXFM_TYPE_ADST16: return av1_iadst16; + case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c; + case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c; + case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c; + case TXFM_TYPE_IDENTITY32: return av1_iidentity32_c; + default: assert(0); return NULL; + } +} + +/* Two-entry shift tables, one per transform size; av1_inv_txfm_shift_ls collects pointers to them. inv_txfm2d_add_c passes -shift[0] to av1_round_shift_array() after each row transform, and av1_gen_inv_stage_range() adds shift[0] into the column range. */ static const int8_t inv_shift_4x4[2] = { 0, -4 }; +static const int8_t inv_shift_8x8[2] = { -1, -4 }; +static const int8_t inv_shift_16x16[2] = { -2, -4 }; +static const int8_t inv_shift_32x32[2] = { -2, -4 }; +static const int8_t inv_shift_64x64[2] = { -2, -4 }; +static const int8_t inv_shift_4x8[2] = { 0, -4 }; +static const int8_t inv_shift_8x4[2] = { 0, -4 }; +static const int8_t inv_shift_8x16[2] = { -1, -4 }; +static const int8_t inv_shift_16x8[2] = { -1, -4 }; +static const int8_t inv_shift_16x32[2] = { -1, -4 }; +static const int8_t inv_shift_32x16[2] = { -1, -4 }; +static const int8_t inv_shift_32x64[2] = { -1, -4 }; +static const int8_t inv_shift_64x32[2] = { -1, -4 }; 
+static const int8_t inv_shift_4x16[2] = { -1, -4 }; +static const int8_t inv_shift_16x4[2] = { -1, -4 }; +static const int8_t inv_shift_8x32[2] = { -2, -4 }; +static const int8_t inv_shift_32x8[2] = { -2, -4 }; +static const int8_t inv_shift_16x64[2] = { -2, -4 }; +static const int8_t inv_shift_64x16[2] = { -2, -4 }; + +/* Shift table lookup indexed by TX_SIZE; av1_get_inv_txfm_cfg() stores the selected entry in cfg->shift. */ const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = { + inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32, + inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16, + inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64, + inv_shift_64x32, inv_shift_4x16, inv_shift_16x4, inv_shift_8x32, + inv_shift_32x8, inv_shift_16x64, inv_shift_64x16, +}; + +/* Stage-range template copied over the zeroed cfg->stage_range_col or cfg->stage_range_row when the corresponding 1-D type is TXFM_TYPE_ADST4; only the stage 1 entry is non-zero. */ static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 }; + +/* Fills cfg for the inverse 2-D transform selected by tx_type and tx_size: records tx_size, zeroes both stage-range arrays, calls set_flip_cfg(), selects the shift table, sets both cos bits to INV_COS_BIT, and looks up the 1-D column and row transform types and their stage counts. cfg must not be NULL (asserted). */ void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg) { + assert(cfg != NULL); + cfg->tx_size = tx_size; + av1_zero(cfg->stage_range_col); + av1_zero(cfg->stage_range_row); + set_flip_cfg(tx_type, cfg); + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + cfg->shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + cfg->cos_bit_col = INV_COS_BIT; + cfg->cos_bit_row = INV_COS_BIT; + /* The column type is looked up with the height index (txh_idx) and the row type below with the width index (txw_idx). */ cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; + if (cfg->txfm_type_col == TXFM_TYPE_ADST4) { + memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range)); + } + cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; + if (cfg->txfm_type_row == TXFM_TYPE_ADST4) { + memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range)); + } + cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; + cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; +} + +/* Writes the per-stage ranges for the row and column passes into stage_range_row and stage_range_col. Every written entry is a fixed value chosen from bd (row/col: 16/16 for bd 8, 18/16 for bd 10, 20/18 for bd 12; other bd values trip an assert). At most cfg->stage_num_row / cfg->stage_num_col entries are written, capped at MAX_TXFM_STAGE_NUM. The range computed from cfg->stage_range_*, inv_start_range[tx_size], cfg->shift[0] (columns only) and bd is used only in asserts. */ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, + int bd) { + const int fwd_shift = 
inv_start_range[tx_size]; + const int8_t *shift = cfg->shift; + int8_t opt_range_row, opt_range_col; + if (bd == 8) { + opt_range_row = 16; + opt_range_col = 16; + } else if (bd == 10) { + opt_range_row = 18; + opt_range_col = 16; + } else { + assert(bd == 12); + opt_range_row = 20; + opt_range_col = 18; + } + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1; + (void)real_range_row; + if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) { + // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 + // so opt_range_row >= real_range_row will not hold + stage_range_row[i] = opt_range_row; + } else { + assert(opt_range_row >= real_range_row); + stage_range_row[i] = opt_range_row; + } + } + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { + int real_range_col = + cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1; + (void)real_range_col; + if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) { + // the adst4 may use 1 extra bit on top of opt_range_col at stage 1 + // so opt_range_col >= real_range_col will not hold + stage_range_col[i] = opt_range_col; + } else { + assert(opt_range_col >= real_range_col); + stage_range_col[i] = opt_range_col; + } + } +} + +static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, + int stride, TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf, TX_SIZE tx_size, + int bd) { + // Note when assigning txfm_size_col, we use the txfm_size from the + // row configuration and vice versa. This is intentionally done to + // accurately perform rectangular transforms. When the transform is + // rectangular, the number of columns will be the same as the + // txfm_size stored in the row cfg struct. It will make no difference + // for square transforms. 
+ const int txfm_size_col = tx_size_wide[cfg->tx_size]; + const int txfm_size_row = tx_size_high[cfg->tx_size]; + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); + assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); + av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd); + + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row); + + // txfm_buf's length is txfm_size_row * txfm_size_col + 2 * + // AOMMAX(txfm_size_row, txfm_size_col) + // it is used for intermediate data buffering + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_in = txfm_buf; + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + int c, r; + + // Rows + for (r = 0; r < txfm_size_row; ++r) { + if (abs(rect_type) == 1) { + for (c = 0; c < txfm_size_col; ++c) { + temp_in[c] = round_shift( + (int64_t)input[c * txfm_size_row + r] * NewInvSqrt2, NewSqrt2Bits); + } + clamp_buf(temp_in, txfm_size_col, bd + 8); + txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); + } else { + for (c = 0; c < txfm_size_col; ++c) { + temp_in[c] = input[c * txfm_size_row + r]; + } + clamp_buf(temp_in, txfm_size_col, bd + 8); + txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); + } + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + buf_ptr += txfm_size_col; + } + + // Columns + for (c = 0; c < txfm_size_col; ++c) { + if (cfg->lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else 
{ + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, AOMMAX(bd + 6, 16)); + txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + if (cfg->ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output, + int stride, int32_t *txfm_buf, + TX_TYPE tx_type, TX_SIZE tx_size, + int bd) { + TXFM_2D_FLIP_CFG cfg; + av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg); + // Forward shift sum uses larger square size, to be consistent with what + // av1_gen_inv_stage_range() does for inverse shifts. 
+ inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, tx_size, bd); +} + +void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd); +} + +void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd); +} + +void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd); +} + +void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd); +} + +void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd); +} + +void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd); +} + +void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd); +} + +void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 
+ 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd); +} + +void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd); +} + +void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd); +} + +void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // TODO(urvang): Can the same array be reused, instead of using a new array? + // Remap 32x32 input into a modified 64x64 by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[64 * 64]; + for (int col = 0; col < 32; ++col) { + memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); + memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64, + bd); +} + +void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x32 input into a modified 64x32 by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. 
+ int32_t mod_input[32 * 64]; + memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input)); + memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32, + bd); +} + +void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x32 input into a modified 32x64 input by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[64 * 32]; + for (int col = 0; col < 32; ++col) { + memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); + memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64, + bd); +} + +void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 16x32 input into a modified 16x64 input by: + // - Copying over these values in top-left 16x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[64 * 16]; + for (int col = 0; col < 16; ++col) { + memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); + memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64, + bd); +} + +void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x16 input into a modified 64x16 by: + // - Copying over these values in top-left 32x16 locations. + // - Setting the rest of the locations to 0. 
+ int32_t mod_input[16 * 64]; + memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input)); + memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16, + bd); +} + +void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd); +} + +void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd); +} + +void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd); +} + +void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd); +} diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c new file mode 100644 index 0000000000..5af025c654 --- /dev/null +++ b/third_party/aom/av1/common/av1_loopfilter.c @@ -0,0 +1,2099 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/reconinter.h" +#include "av1/common/seg_common.h" + +enum { + USE_SINGLE, + USE_DUAL, + USE_QUAD, +} UENUM1BYTE(USE_FILTER_TYPE); + +static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = { + { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H }, + { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U }, + { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V } +}; + +static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 }, + { 2, 2 }, + { 3, 3 } }; + +static const int mode_lf_lut[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1, // INTER_MODES (GLOBALMV == 0) + 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0) +}; + +static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { + int lvl; + + // For each possible value for the loop filter fill out limits + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { + // Set loop filter parameters that control sharpness. 
+ int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); + + if (sharpness_lvl > 0) { + if (block_inside_limit > (9 - sharpness_lvl)) + block_inside_limit = (9 - sharpness_lvl); + } + + if (block_inside_limit < 1) block_inside_limit = 1; + + memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH); + memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), + SIMD_WIDTH); + } +} + +uint8_t av1_get_filter_level(const AV1_COMMON *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi) { + const int segment_id = mbmi->segment_id; + if (cm->delta_q_info.delta_lf_present_flag) { + int8_t delta_lf; + if (cm->delta_q_info.delta_lf_multi) { + const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx]; + delta_lf = mbmi->delta_lf[delta_lf_idx]; + } else { + delta_lf = mbmi->delta_lf_from_base; + } + int base_level; + if (plane == 0) + base_level = cm->lf.filter_level[dir_idx]; + else if (plane == 1) + base_level = cm->lf.filter_level_u; + else + base_level = cm->lf.filter_level_v; + int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER); + assert(plane >= 0 && plane <= 2); + const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx]; + if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) { + const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id); + lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); + } + + if (cm->lf.mode_ref_delta_enabled) { + const int scale = 1 << (lvl_seg >> 5); + lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale; + if (mbmi->ref_frame[0] > INTRA_FRAME) + lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale; + lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER); + } + return lvl_seg; + } else { + return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]] + [mode_lf_lut[mbmi->mode]]; + } +} + +void av1_loop_filter_init(AV1_COMMON *cm) { + assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut)); + loop_filter_info_n *lfi = &cm->lf_info; 
+ struct loopfilter *lf = &cm->lf; + int lvl; + + // init limits for given sharpness + update_sharpness(lfi, lf->sharpness_level); + + // init hev threshold const vectors + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) + memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); +} + +// Update the loop filter for the current frame. +// This should be called before loop_filter_rows(), +// av1_loop_filter_frame() calls this function directly. +void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, + int plane_end) { + int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE]; + int plane; + int seg_id; + // n_shift is the multiplier for lf_deltas + // the multiplier is 1 for when filter_lvl is between 0 and 31; + // 2 when filter_lvl is between 32 and 63 + loop_filter_info_n *const lfi = &cm->lf_info; + struct loopfilter *const lf = &cm->lf; + const struct segmentation *const seg = &cm->seg; + + // update sharpness limits + update_sharpness(lfi, lf->sharpness_level); + + filt_lvl[0] = cm->lf.filter_level[0]; + filt_lvl[1] = cm->lf.filter_level_u; + filt_lvl[2] = cm->lf.filter_level_v; + + filt_lvl_r[0] = cm->lf.filter_level[1]; + filt_lvl_r[1] = cm->lf.filter_level_u; + filt_lvl_r[2] = cm->lf.filter_level_v; + + assert(plane_start >= AOM_PLANE_Y); + assert(plane_end <= MAX_MB_PLANE); + + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0]) + break; + else if (plane == 1 && !filt_lvl[1]) + continue; + else if (plane == 2 && !filt_lvl[2]) + continue; + + for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { + for (int dir = 0; dir < 2; ++dir) { + int lvl_seg = (dir == 0) ? 
filt_lvl[plane] : filt_lvl_r[plane]; + const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir]; + if (segfeature_active(seg, seg_id, seg_lf_feature_id)) { + const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id); + lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); + } + + if (!lf->mode_ref_delta_enabled) { + // we could get rid of this if we assume that deltas are set to + // zero when not in use; encoder always uses deltas + memset(lfi->lvl[plane][seg_id][dir], lvl_seg, + sizeof(lfi->lvl[plane][seg_id][dir])); + } else { + int ref, mode; + const int scale = 1 << (lvl_seg >> 5); + const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; + lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] = + clamp(intra_lvl, 0, MAX_LOOP_FILTER); + + for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) { + for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { + const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + + lf->mode_deltas[mode] * scale; + lfi->lvl[plane][seg_id][dir][ref][mode] = + clamp(inter_lvl, 0, MAX_LOOP_FILTER); + } + } + } + } + } + } +} + +static AOM_FORCE_INLINE TX_SIZE +get_transform_size(const MACROBLOCKD *const xd, const MB_MODE_INFO *const mbmi, + const int mi_row, const int mi_col, const int plane, + const int ss_x, const int ss_y) { + assert(mbmi != NULL); + if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4; + + TX_SIZE tx_size = (plane == AOM_PLANE_Y) + ? 
mbmi->tx_size + : av1_get_max_uv_txsize(mbmi->bsize, ss_x, ss_y); + assert(tx_size < TX_SIZES_ALL); + if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip_txfm) { + const BLOCK_SIZE sb_type = mbmi->bsize; + const int blk_row = mi_row & (mi_size_high[sb_type] - 1); + const int blk_col = mi_col & (mi_size_wide[sb_type] - 1); + const TX_SIZE mb_tx_size = + mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)]; + assert(mb_tx_size < TX_SIZES_ALL); + tx_size = mb_tx_size; + } + + return tx_size; +} + +static const int tx_dim_to_filter_length[TX_SIZES] = { 4, 8, 14, 14, 14 }; + +// Return TX_SIZE from get_transform_size(), so it is plane and direction +// aware +static TX_SIZE set_lpf_parameters( + AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step, + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y, + const int plane, const struct macroblockd_plane *const plane_ptr) { + // reset to initial values + params->filter_length = 0; + + // no deblocking is required + const uint32_t width = plane_ptr->dst.width; + const uint32_t height = plane_ptr->dst.height; + if ((width <= x) || (height <= y)) { + // just return the smallest transform unit size + return TX_4X4; + } + + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + // for sub8x8 block, chroma prediction mode is obtained from the bottom/right + // mi structure of the co-located 8x8 luma block. so for chroma plane, mi_row + // and mi_col should map to the bottom/right mi structure, i.e, both mi_row + // and mi_col should be odd number for chroma plane. 
+ const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2); + const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2); + MB_MODE_INFO **mi = + cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; + const MB_MODE_INFO *mbmi = mi[0]; + // If current mbmi is not correctly setup, return an invalid value to stop + // filtering. One example is that if this tile is not coded, then its mbmi + // it not set up. + if (mbmi == NULL) return TX_INVALID; + + const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane, + scale_horz, scale_vert); + + { + const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y); + const uint32_t transform_masks = + edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; + const int32_t tu_edge = (coord & transform_masks) ? (0) : (1); + + if (!tu_edge) return ts; + + // prepare outer edge parameters. deblock the edge if it's an edge of a TU + { + const uint32_t curr_level = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); + const int curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi); + uint32_t level = curr_level; + if (coord) { + { + const MB_MODE_INFO *const mi_prev = *(mi - mode_step); + if (mi_prev == NULL) return TX_INVALID; + const int pv_row = + (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert)); + const int pv_col = + (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col); + const TX_SIZE pv_ts = get_transform_size( + xd, mi_prev, pv_row, pv_col, plane, scale_horz, scale_vert); + + const uint32_t pv_lvl = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); + + const int pv_skip_txfm = + mi_prev->skip_txfm && is_inter_block(mi_prev); + const BLOCK_SIZE bsize = get_plane_block_size( + mbmi->bsize, plane_ptr->subsampling_x, plane_ptr->subsampling_y); + assert(bsize < BLOCK_SIZES_ALL); + const int prediction_masks = edge_dir == VERT_EDGE + ? 
block_size_wide[bsize] - 1 + : block_size_high[bsize] - 1; + const int32_t pu_edge = !(coord & prediction_masks); + // if the current and the previous blocks are skipped, + // deblock the edge if the edge belongs to a PU's edge only. + if ((curr_level || pv_lvl) && + (!pv_skip_txfm || !curr_skipped || pu_edge)) { + const int dim = (VERT_EDGE == edge_dir) + ? AOMMIN(tx_size_wide_unit_log2[ts], + tx_size_wide_unit_log2[pv_ts]) + : AOMMIN(tx_size_high_unit_log2[ts], + tx_size_high_unit_log2[pv_ts]); + if (plane) { + params->filter_length = (dim == 0) ? 4 : 6; + } else { + assert(dim < TX_SIZES); + assert(dim >= 0); + params->filter_length = tx_dim_to_filter_length[dim]; + } + + // update the level if the current block is skipped, + // but the previous one is not + level = (curr_level) ? (curr_level) : (pv_lvl); + } + } + } + // prepare common parameters + if (params->filter_length) { + const loop_filter_thresh *const limits = cm->lf_info.lfthr + level; + params->lfthr = limits; + } + } + } + + return ts; +} + +static const uint32_t vert_filter_length_luma[TX_SIZES_ALL][TX_SIZES_ALL] = { + // TX_4X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X8 + { + 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, + }, + // TX_16X16 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_32X32 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_64X64 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_4X8 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X4 + { + 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, + }, + // TX_8X16 + { + 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, + }, + // TX_16X8 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_16X32 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_32X16 + { + 4, 8, 
14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_32X64 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_64X32 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_4X16 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_16X4 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_8X32 + { + 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, + }, + // TX_32X8 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_16X64 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_64X16 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, +}; + +static const uint32_t horz_filter_length_luma[TX_SIZES_ALL][TX_SIZES_ALL] = { + // TX_4X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X8 + { + 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, + }, + // TX_16X16 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_32X32 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_64X64 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_4X8 + { + 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, + }, + // TX_8X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X16 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_16X8 + { + 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, + }, + // TX_16X32 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_32X16 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_32X64 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_64X32 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 
14, 14, 4, 14, 8, 14, 14, + }, + // TX_4X16 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_16X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X32 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_32X8 + { + 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, + }, + // TX_16X64 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_64X16 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, +}; + +static const uint32_t vert_filter_length_chroma[TX_SIZES_ALL][TX_SIZES_ALL] = { + // TX_4X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X8 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_16X16 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_32X32 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_64X64 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_4X8 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X4 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_8X16 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_16X8 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_16X32 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_32X16 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_32X64 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_64X32 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_4X16 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_16X4 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_8X32 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_32X8 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 
6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_16X64 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_64X16 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, +}; + +static const uint32_t horz_filter_length_chroma[TX_SIZES_ALL][TX_SIZES_ALL] = { + // TX_4X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X8 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_16X16 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_32X32 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_64X64 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_4X8 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_8X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X16 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_16X8 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_16X32 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_32X16 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_32X64 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_64X32 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_4X16 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_16X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X32 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_32X8 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_16X64 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_64X16 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, +}; + +static AOM_FORCE_INLINE void set_one_param_for_line_luma( + AV1_DEBLOCKING_PARAMETERS *const params, TX_SIZE *tx_size, + const AV1_COMMON *const cm, const 
MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, + const struct macroblockd_plane *const plane_ptr, int coord, + bool is_first_block, TX_SIZE prev_tx_size, const ptrdiff_t mode_step, + int *min_dim) { + (void)plane_ptr; + assert(mi_col << MI_SIZE_LOG2 < (uint32_t)plane_ptr->dst.width && + mi_row << MI_SIZE_LOG2 < (uint32_t)plane_ptr->dst.height); + const int is_vert = edge_dir == VERT_EDGE; + // reset to initial values + params->filter_length = 0; + + MB_MODE_INFO **mi = + cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; + const MB_MODE_INFO *mbmi = mi[0]; + assert(mbmi); + + const TX_SIZE ts = + get_transform_size(xd, mi[0], mi_row, mi_col, AOM_PLANE_Y, 0, 0); + +#ifndef NDEBUG + const uint32_t transform_masks = + is_vert ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; + const int32_t tu_edge = ((coord * MI_SIZE) & transform_masks) ? (0) : (1); + assert(tu_edge); +#endif // NDEBUG + // If we are not the first block, then coord is always true, so + // !is_first_block is technically redundant. But we are keeping it here so the + // compiler can compile away this conditional if we pass in is_first_block := + // false + bool curr_skipped = false; + if (!is_first_block || coord) { + const MB_MODE_INFO *const mi_prev = *(mi - mode_step); + assert(mi_prev); + const int pv_row = is_vert ? mi_row : (mi_row - 1); + const int pv_col = is_vert ? (mi_col - 1) : mi_col; + const TX_SIZE pv_ts = + is_first_block + ? get_transform_size(xd, mi_prev, pv_row, pv_col, AOM_PLANE_Y, 0, 0) + : prev_tx_size; + if (is_first_block) { + *min_dim = is_vert ? 
block_size_high[mi_prev->bsize] + : block_size_wide[mi_prev->bsize]; + } + uint8_t level = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, mbmi); + if (!level) { + level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, + mi_prev); + } + + const int32_t pu_edge = mi_prev != mbmi; + + // The quad loop filter assumes that all the transform blocks within a + // 8x16/16x8/16x16 prediction block are of the same size. + assert(IMPLIES( + !pu_edge && (mbmi->bsize >= BLOCK_8X16 && mbmi->bsize <= BLOCK_16X16), + pv_ts == ts)); + + if (!pu_edge) { + curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi); + } + if ((pu_edge || !curr_skipped) && level) { + params->filter_length = is_vert ? vert_filter_length_luma[ts][pv_ts] + : horz_filter_length_luma[ts][pv_ts]; + + // prepare common parameters + const loop_filter_thresh *const limits = cm->lf_info.lfthr + level; + params->lfthr = limits; + } + } + const int block_dim = + is_vert ? block_size_high[mbmi->bsize] : block_size_wide[mbmi->bsize]; + *min_dim = AOMMIN(*min_dim, block_dim); + + *tx_size = ts; +} + +// Similar to set_lpf_parameters, but does so one row/col at a time to reduce +// calls to \ref get_transform_size and \ref av1_get_filter_level +static AOM_FORCE_INLINE void set_lpf_parameters_for_line_luma( + AV1_DEBLOCKING_PARAMETERS *const params_buf, TX_SIZE *tx_buf, + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, + const struct macroblockd_plane *const plane_ptr, const uint32_t mi_range, + const ptrdiff_t mode_step, int *min_dim) { + const int is_vert = edge_dir == VERT_EDGE; + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + uint32_t *counter_ptr = is_vert ? 
&mi_col : &mi_row; + TX_SIZE prev_tx_size = TX_INVALID; + + // Unroll the first iteration of the loop + set_one_param_for_line_luma(params, tx_size, cm, xd, edge_dir, mi_col, mi_row, + plane_ptr, *counter_ptr, true, prev_tx_size, + mode_step, min_dim); + + // Advance + int advance_units = + is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; + prev_tx_size = *tx_size; + *counter_ptr += advance_units; + params += advance_units; + tx_size += advance_units; + + while (*counter_ptr < mi_range) { + set_one_param_for_line_luma(params, tx_size, cm, xd, edge_dir, mi_col, + mi_row, plane_ptr, *counter_ptr, false, + prev_tx_size, mode_step, min_dim); + + // Advance + advance_units = + is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; + prev_tx_size = *tx_size; + *counter_ptr += advance_units; + params += advance_units; + tx_size += advance_units; + } +} + +static AOM_FORCE_INLINE void set_one_param_for_line_chroma( + AV1_DEBLOCKING_PARAMETERS *const params, TX_SIZE *tx_size, + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, int coord, + bool is_first_block, TX_SIZE prev_tx_size, + const struct macroblockd_plane *const plane_ptr, const ptrdiff_t mode_step, + const int scale_horz, const int scale_vert, int *min_dim, int plane, + int joint_filter_chroma) { + const int is_vert = edge_dir == VERT_EDGE; + (void)plane_ptr; + assert((mi_col << MI_SIZE_LOG2) < + (uint32_t)(plane_ptr->dst.width << scale_horz) && + (mi_row << MI_SIZE_LOG2) < + (uint32_t)(plane_ptr->dst.height << scale_vert)); + // reset to initial values + params->filter_length = 0; + + // for sub8x8 block, chroma prediction mode is obtained from the + // bottom/right mi structure of the co-located 8x8 luma block. so for chroma + // plane, mi_row and mi_col should map to the bottom/right mi structure, + // i.e, both mi_row and mi_col should be odd number for chroma plane. 
+ mi_row |= scale_vert; + mi_col |= scale_horz; + MB_MODE_INFO **mi = + cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; + const MB_MODE_INFO *mbmi = mi[0]; + assert(mbmi); + + const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane, + scale_horz, scale_vert); + *tx_size = ts; + +#ifndef NDEBUG + const uint32_t transform_masks = + is_vert ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; + const int32_t tu_edge = ((coord * MI_SIZE) & transform_masks) ? (0) : (1); + assert(tu_edge); +#endif // NDEBUG + + // If we are not the first block, then coord is always true, so + // !is_first_block is technically redundant. But we are keeping it here so the + // compiler can compile away this conditional if we pass in is_first_block := + // false + bool curr_skipped = false; + if (!is_first_block || coord) { + const MB_MODE_INFO *const mi_prev = *(mi - mode_step); + assert(mi_prev); + const int pv_row = is_vert ? (mi_row) : (mi_row - (1 << scale_vert)); + const int pv_col = is_vert ? (mi_col - (1 << scale_horz)) : (mi_col); + const TX_SIZE pv_ts = + is_first_block ? get_transform_size(xd, mi_prev, pv_row, pv_col, plane, + scale_horz, scale_vert) + : prev_tx_size; + if (is_first_block) { + *min_dim = is_vert ? 
tx_size_high[pv_ts] : tx_size_wide[pv_ts]; + } + + uint8_t level = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); + if (!level) { + level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); + } +#ifndef NDEBUG + if (joint_filter_chroma) { + uint8_t v_level = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, mbmi); + if (!v_level) { + v_level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, + mi_prev); + } + assert(level == v_level); + } +#else + (void)joint_filter_chroma; +#endif // NDEBUG + const int32_t pu_edge = mi_prev != mbmi; + + if (!pu_edge) { + curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi); + } + // For realtime mode, u and v have the same level + if ((!curr_skipped || pu_edge) && level) { + params->filter_length = is_vert ? vert_filter_length_chroma[ts][pv_ts] + : horz_filter_length_chroma[ts][pv_ts]; + + const loop_filter_thresh *const limits = cm->lf_info.lfthr; + params->lfthr = limits + level; + } + } + const int tx_dim = is_vert ? tx_size_high[ts] : tx_size_wide[ts]; + *min_dim = AOMMIN(*min_dim, tx_dim); +} + +static AOM_FORCE_INLINE void set_lpf_parameters_for_line_chroma( + AV1_DEBLOCKING_PARAMETERS *const params_buf, TX_SIZE *tx_buf, + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, + const struct macroblockd_plane *const plane_ptr, const uint32_t mi_range, + const ptrdiff_t mode_step, const int scale_horz, const int scale_vert, + int *min_dim, int plane, int joint_filter_chroma) { + const int is_vert = edge_dir == VERT_EDGE; + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + uint32_t *counter_ptr = is_vert ? &mi_col : &mi_row; + const uint32_t scale = is_vert ? 
scale_horz : scale_vert; + TX_SIZE prev_tx_size = TX_INVALID; + + // Unroll the first iteration of the loop + set_one_param_for_line_chroma(params, tx_size, cm, xd, edge_dir, mi_col, + mi_row, *counter_ptr, true, prev_tx_size, + plane_ptr, mode_step, scale_horz, scale_vert, + min_dim, plane, joint_filter_chroma); + + // Advance + int advance_units = + is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; + prev_tx_size = *tx_size; + *counter_ptr += advance_units << scale; + params += advance_units; + tx_size += advance_units; + + while (*counter_ptr < mi_range) { + set_one_param_for_line_chroma(params, tx_size, cm, xd, edge_dir, mi_col, + mi_row, *counter_ptr, false, prev_tx_size, + plane_ptr, mode_step, scale_horz, scale_vert, + min_dim, plane, joint_filter_chroma); + + // Advance + advance_units = + is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; + prev_tx_size = *tx_size; + *counter_ptr += advance_units << scale; + params += advance_units; + tx_size += advance_units; + } +} + +static AOM_INLINE void filter_vert(uint8_t *dst, int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, + const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { + const loop_filter_thresh *limits = params->lfthr; +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = seq_params->bit_depth; + if (use_highbitdepth) { + uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst); + if (use_filter_type == USE_QUAD) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_vertical_4_dual( + dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + limits->mblim, limits->lim, limits->hev_thr, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap 
filter for chroma plane only + aom_highbd_lpf_vertical_6_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_vertical_6_dual( + dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + limits->mblim, limits->lim, limits->hev_thr, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_vertical_8_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_vertical_8_dual( + dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + limits->mblim, limits->lim, limits->hev_thr, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_vertical_14_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_vertical_14_dual( + dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + limits->mblim, limits->lim, limits->hev_thr, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_vertical_6_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_vertical_8_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + 
break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_vertical_14_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_vertical_6(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_vertical_8(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_vertical_14(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // no filtering + default: break; + } + } + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + if (use_filter_type == USE_QUAD) { + // Only one set of loop filter parameters (mblim, lim and hev_thr) is + // passed as argument to quad loop filter because quad loop filter is + // called for those cases where all the 4 set of loop filter parameters + // are equal. 
+ switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8(dst, dst_stride, limits->mblim, limits->lim, + 
limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } +#if !CONFIG_AV1_HIGHBITDEPTH + (void)seq_params; +#endif // !CONFIG_AV1_HIGHBITDEPTH +} + +static AOM_INLINE void filter_vert_chroma( + uint8_t *u_dst, uint8_t *v_dst, int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { + const loop_filter_thresh *u_limits = params->lfthr; + const loop_filter_thresh *v_limits = params->lfthr; +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = seq_params->bit_depth; + if (use_highbitdepth) { + uint16_t *u_dst_shortptr = CONVERT_TO_SHORTPTR(u_dst); + uint16_t *v_dst_shortptr = CONVERT_TO_SHORTPTR(v_dst); + if (use_filter_type == USE_QUAD) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_4_dual( + u_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + u_limits->mblim, u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_4_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_4_dual( + v_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + v_limits->mblim, v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_vertical_6_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, 
u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_6_dual( + u_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + u_limits->mblim, u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_6_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_6_dual( + v_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + v_limits->mblim, v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_4_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_vertical_6_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_6_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4(u_dst_shortptr, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + bit_depth); + 
aom_highbd_lpf_vertical_4(v_dst_shortptr, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_vertical_6(u_dst_shortptr, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + bit_depth); + aom_highbd_lpf_vertical_6(v_dst_shortptr, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + bit_depth); + break; + case 8: + case 14: assert(0); break; + // no filtering + default: break; + } + } + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + if (use_filter_type == USE_QUAD) { + // Only one set of loop filter parameters (mblim, lim and hev_thr) is + // passed as argument to quad loop filter because quad loop filter is + // called for those cases where all the 4 set of loop filter parameters + // are equal. + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4_quad(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr); + aom_lpf_vertical_4_quad(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6_quad(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr); + aom_lpf_vertical_6_quad(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4_dual(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_vertical_4_dual(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6_dual(u_dst, dst_stride, 
u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_vertical_6_dual(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4(u_dst, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_vertical_4(v_dst, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6(u_dst, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_vertical_6(v_dst, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 8: + case 14: assert(0); break; + // no filtering + default: break; + } + } +#if !CONFIG_AV1_HIGHBITDEPTH + (void)seq_params; +#endif // !CONFIG_AV1_HIGHBITDEPTH +} + +void av1_filter_block_plane_vert(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + + for (int y = 0; y < y_range; y++) { + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + for (int x = 0; x 
< x_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. + // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(&params, 0, sizeof(params)); + + tx_size = + set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd, + VERT_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + filter_vert(p, dst_stride, &params, cm->seq_params, USE_SINGLE); + + // advance the destination pointer + advance_units = tx_size_wide_unit[tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + } + } +} + +void av1_filter_block_plane_vert_opt( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2) { + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned + // to MI_SIZE. + const int plane_mi_cols = + CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2); + const int plane_mi_rows = + CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2); + // Whenever 'pipeline_lpf_mt_with_enc' is enabled, height of the unit to + // filter (i.e., y_range) is calculated based on the size of the superblock + // used. 
+ const int y_range = AOMMIN((int)(plane_mi_rows - mi_row), + (1 << num_mis_in_lpf_unit_height_log2)); + // Width of the unit to filter (i.e., x_range) should always be calculated + // based on maximum superblock size as this function is called for mi_col = 0, + // MAX_MIB_SIZE, 2 * MAX_MIB_SIZE etc. + const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE); + const ptrdiff_t mode_step = 1; + for (int y = 0; y < y_range; y++) { + const uint32_t curr_y = mi_row + y; + const uint32_t x_start = mi_col; + const uint32_t x_end = mi_col + x_range; + int min_block_height = block_size_high[BLOCK_128X128]; + set_lpf_parameters_for_line_luma(params_buf, tx_buf, cm, xd, VERT_EDGE, + x_start, curr_y, plane_ptr, x_end, + mode_step, &min_block_height); + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + USE_FILTER_TYPE use_filter_type = USE_SINGLE; + + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + + if ((y & 3) == 0 && (y + 3) < y_range && min_block_height >= 16) { + // If we are on a row which is a multiple of 4, and the minimum height is + // 16 pixels, then the current and below 3 rows must contain the same + // prediction block. This is because dim 16 can only happen every unit of + // 4 mi's. 
+ use_filter_type = USE_QUAD; + y += 3; + } else if ((y + 1) < y_range && min_block_height >= 8) { + use_filter_type = USE_DUAL; + y += 1; + } + + for (int x = 0; x < x_range;) { + if (*tx_size == TX_INVALID) { + params->filter_length = 0; + *tx_size = TX_4X4; + } + + filter_vert(p, dst_stride, params, cm->seq_params, use_filter_type); + + // advance the destination pointer + const uint32_t advance_units = tx_size_wide_unit[*tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + params += advance_units; + tx_size += advance_units; + } + } +} + +void av1_filter_block_plane_vert_opt_chroma( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, + int num_mis_in_lpf_unit_height_log2) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + const int dst_stride = plane_ptr->dst.stride; + // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned + // to MI_SIZE. 
+ const int mi_cols = + ((plane_ptr->dst.width << scale_horz) + MI_SIZE - 1) >> MI_SIZE_LOG2; + const int mi_rows = + ((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2; + const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert); + const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz); + const int y_range = + AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + ((1 << num_mis_in_lpf_unit_height_log2) >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + const ptrdiff_t mode_step = (ptrdiff_t)1 << scale_horz; + + for (int y = 0; y < y_range; y++) { + const uint32_t curr_y = mi_row + (y << scale_vert); + const uint32_t x_start = mi_col + (0 << scale_horz); + const uint32_t x_end = mi_col + (x_range << scale_horz); + int min_height = tx_size_high[TX_64X64]; + set_lpf_parameters_for_line_chroma(params_buf, tx_buf, cm, xd, VERT_EDGE, + x_start, curr_y, plane_ptr, x_end, + mode_step, scale_horz, scale_vert, + &min_height, plane, joint_filter_chroma); + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + int use_filter_type = USE_SINGLE; + int y_inc = 0; + + if ((y & 3) == 0 && (y + 3) < y_range && min_height >= 16) { + // If we are on a row which is a multiple of 4, and the minimum height is + // 16 pixels, then the current and below 3 rows must contain the same tx + // block. This is because dim 16 can only happen every unit of 4 mi's. + use_filter_type = USE_QUAD; + y_inc = 3; + } else if (y % 2 == 0 && (y + 1) < y_range && min_height >= 8) { + // If we are on an even row, and the minimum height is 8 pixels, then the + // current and below rows must contain the same tx block. This is because + // dim 4 can only happen every unit of 2**0, and 8 every unit of 2**1, + // etc. + use_filter_type = USE_DUAL; + y_inc = 1; + } + + for (int x = 0; x < x_range;) { + // inner loop always filter vertical edges in a MI block. 
If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. + // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + if (*tx_size == TX_INVALID) { + params->filter_length = 0; + *tx_size = TX_4X4; + } + + const int offset = y * MI_SIZE * dst_stride + x * MI_SIZE; + if (joint_filter_chroma) { + uint8_t *u_dst = plane_ptr[0].dst.buf + offset; + uint8_t *v_dst = plane_ptr[1].dst.buf + offset; + filter_vert_chroma(u_dst, v_dst, dst_stride, params, cm->seq_params, + use_filter_type); + } else { + uint8_t *dst_ptr = plane_ptr->dst.buf + offset; + filter_vert(dst_ptr, dst_stride, params, cm->seq_params, + use_filter_type); + } + + // advance the destination pointer + const uint32_t advance_units = tx_size_wide_unit[*tx_size]; + x += advance_units; + params += advance_units; + tx_size += advance_units; + } + y += y_inc; + } +} + +static AOM_INLINE void filter_horz(uint8_t *dst, int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, + const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { + const loop_filter_thresh *limits = params->lfthr; +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = seq_params->bit_depth; + if (use_highbitdepth) { + uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst); + if (use_filter_type == USE_QUAD) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_horizontal_4_dual( + dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, + limits->lim, limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + 
limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_horizontal_6_dual( + dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, + limits->lim, limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr, bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_horizontal_8_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_horizontal_8_dual( + dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, + limits->lim, limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr, bit_depth); + break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_horizontal_14_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_horizontal_14_dual( + dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, + limits->lim, limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr, bit_depth); + break; + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_horizontal_8_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_horizontal_14_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + 
limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_horizontal_8(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_horizontal_14(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // no filtering + default: break; + } + } + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + if (use_filter_type == USE_QUAD) { + // Only one set of loop filter parameters (mblim, lim and hev_thr) is + // passed as argument to quad loop filter because quad loop filter is + // called for those cases where all the 4 set of loop filter parameters + // are equal. 
+ switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_horizontal_8_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_horizontal_8_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_horizontal_8(dst, dst_stride, limits->mblim, 
limits->lim, + limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } +#if !CONFIG_AV1_HIGHBITDEPTH + (void)seq_params; +#endif // !CONFIG_AV1_HIGHBITDEPTH +} + +static AOM_INLINE void filter_horz_chroma( + uint8_t *u_dst, uint8_t *v_dst, int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { + const loop_filter_thresh *u_limits = params->lfthr; + const loop_filter_thresh *v_limits = params->lfthr; +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = seq_params->bit_depth; + if (use_highbitdepth) { + uint16_t *u_dst_shortptr = CONVERT_TO_SHORTPTR(u_dst); + uint16_t *v_dst_shortptr = CONVERT_TO_SHORTPTR(v_dst); + if (use_filter_type == USE_QUAD) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_4_dual( + u_dst_shortptr + (2 * MI_SIZE), dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_4_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_4_dual( + v_dst_shortptr + (2 * MI_SIZE), dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, 
u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_6_dual( + u_dst_shortptr + (2 * MI_SIZE), dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_6_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_6_dual( + v_dst_shortptr + (2 * MI_SIZE), dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_4_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_6_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4(u_dst_shortptr, dst_stride, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + 
aom_highbd_lpf_horizontal_4(v_dst_shortptr, dst_stride, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6(u_dst_shortptr, dst_stride, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_6(v_dst_shortptr, dst_stride, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 8: + case 14: assert(0); break; + // no filtering + default: break; + } + } + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + if (use_filter_type == USE_QUAD) { + // Only one set of loop filter parameters (mblim, lim and hev_thr) is + // passed as argument to quad loop filter because quad loop filter is + // called for those cases where all the 4 set of loop filter parameters + // are equal. + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4_quad(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr); + aom_lpf_horizontal_4_quad(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6_quad(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr); + aom_lpf_horizontal_6_quad(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4_dual(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_horizontal_4_dual(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6_dual(u_dst, 
dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_horizontal_6_dual(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4(u_dst, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_horizontal_4(v_dst, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6(u_dst, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_horizontal_6(v_dst, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 8: + case 14: assert(0); break; + // no filtering + default: break; + } + } +#if !CONFIG_AV1_HIGHBITDEPTH + (void)seq_params; +#endif // !CONFIG_AV1_HIGHBITDEPTH +} + +void av1_filter_block_plane_horz(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + for (int x = 0; x < x_range; x++) { + uint8_t *p = dst_ptr + x * MI_SIZE; + for (int y 
= 0; y < y_range;) { + // inner loop always filter horizontal edges in a MI block. If MI size + // is 8x8, it will first filter the horizontal edge aligned with a 8x8 + // block. If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(&params, 0, sizeof(params)); + + tx_size = set_lpf_parameters( + &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + filter_horz(p, dst_stride, &params, cm->seq_params, USE_SINGLE); + + // advance the destination pointer + advance_units = tx_size_high_unit[tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + } + } +} + +void av1_filter_block_plane_horz_opt( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2) { + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned + // to MI_SIZE. 
+ const int plane_mi_cols = + CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2); + const int plane_mi_rows = + CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2); + const int y_range = AOMMIN((int)(plane_mi_rows - mi_row), + (1 << num_mis_in_lpf_unit_height_log2)); + const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE); + + const ptrdiff_t mode_step = cm->mi_params.mi_stride; + for (int x = 0; x < x_range; x++) { + const uint32_t curr_x = mi_col + x; + const uint32_t y_start = mi_row; + const uint32_t y_end = mi_row + y_range; + int min_block_width = block_size_high[BLOCK_128X128]; + set_lpf_parameters_for_line_luma(params_buf, tx_buf, cm, xd, HORZ_EDGE, + curr_x, y_start, plane_ptr, y_end, + mode_step, &min_block_width); + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + USE_FILTER_TYPE filter_type = USE_SINGLE; + + uint8_t *p = dst_ptr + x * MI_SIZE; + + if ((x & 3) == 0 && (x + 3) < x_range && min_block_width >= 16) { + // If we are on a col which is a multiple of 4, and the minimum width is + // 16 pixels, then the current and right 3 cols must contain the same + // prediction block. This is because dim 16 can only happen every unit of + // 4 mi's. 
+ filter_type = USE_QUAD; + x += 3; + } else if ((x + 1) < x_range && min_block_width >= 8) { + filter_type = USE_DUAL; + x += 1; + } + + for (int y = 0; y < y_range;) { + if (*tx_size == TX_INVALID) { + params->filter_length = 0; + *tx_size = TX_4X4; + } + + filter_horz(p, dst_stride, params, cm->seq_params, filter_type); + + // advance the destination pointer + const uint32_t advance_units = tx_size_high_unit[*tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + params += advance_units; + tx_size += advance_units; + } + } +} + +void av1_filter_block_plane_horz_opt_chroma( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, + int num_mis_in_lpf_unit_height_log2) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + const int dst_stride = plane_ptr->dst.stride; + // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned + // to MI_SIZE. 
+ const int mi_cols = + ((plane_ptr->dst.width << scale_horz) + MI_SIZE - 1) >> MI_SIZE_LOG2; + const int mi_rows = + ((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2; + const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert); + const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz); + const int y_range = + AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + ((1 << num_mis_in_lpf_unit_height_log2) >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + const ptrdiff_t mode_step = cm->mi_params.mi_stride << scale_vert; + for (int x = 0; x < x_range; x++) { + const uint32_t y_start = mi_row + (0 << scale_vert); + const uint32_t curr_x = mi_col + (x << scale_horz); + const uint32_t y_end = mi_row + (y_range << scale_vert); + int min_width = tx_size_wide[TX_64X64]; + set_lpf_parameters_for_line_chroma(params_buf, tx_buf, cm, xd, HORZ_EDGE, + curr_x, y_start, plane_ptr, y_end, + mode_step, scale_horz, scale_vert, + &min_width, plane, joint_filter_chroma); + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + USE_FILTER_TYPE use_filter_type = USE_SINGLE; + int x_inc = 0; + + if ((x & 3) == 0 && (x + 3) < x_range && min_width >= 16) { + // If we are on a col which is a multiple of 4, and the minimum width is + // 16 pixels, then the current and right 3 cols must contain the same tx + // block. This is because dim 16 can only happen every unit of 4 mi's. + use_filter_type = USE_QUAD; + x_inc = 3; + } else if (x % 2 == 0 && (x + 1) < x_range && min_width >= 8) { + // If we are on an even col, and the minimum width is 8 pixels, then the + // current and left cols must contain the same tx block. This is because + // dim 4 can only happen every unit of 2**0, and 8 every unit of 2**1, + // etc. 
+ use_filter_type = USE_DUAL; + x_inc = 1; + } + + for (int y = 0; y < y_range;) { + // inner loop always filter horizontal edges in a MI block. If MI size + // is 8x8, it will first filter the horizontal edge aligned with a 8x8 + // block. If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + if (*tx_size == TX_INVALID) { + params->filter_length = 0; + *tx_size = TX_4X4; + } + + const int offset = y * MI_SIZE * dst_stride + x * MI_SIZE; + if (joint_filter_chroma) { + uint8_t *u_dst = plane_ptr[0].dst.buf + offset; + uint8_t *v_dst = plane_ptr[1].dst.buf + offset; + filter_horz_chroma(u_dst, v_dst, dst_stride, params, cm->seq_params, + use_filter_type); + } else { + uint8_t *dst_ptr = plane_ptr->dst.buf + offset; + filter_horz(dst_ptr, dst_stride, params, cm->seq_params, + use_filter_type); + } + + // advance the destination pointer + const int advance_units = tx_size_high_unit[*tx_size]; + y += advance_units; + params += advance_units; + tx_size += advance_units; + } + x += x_inc; + } +} diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h new file mode 100644 index 0000000000..c9880cf5da --- /dev/null +++ b/third_party/aom/av1/common/av1_loopfilter.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_ +#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_ + +#include "config/aom_config.h" + +#include "aom/internal/aom_codec_internal.h" + +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/common/seg_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_LOOP_FILTER 63 +#define MAX_SHARPNESS 7 + +#define SIMD_WIDTH 16 + +enum lf_path { + LF_PATH_420, + LF_PATH_444, + LF_PATH_SLOW, +}; + +/*!\cond */ +enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR); +typedef struct { + uint64_t bits[4]; +} FilterMask; + +struct loopfilter { + int filter_level[2]; + int filter_level_u; + int filter_level_v; + + int sharpness_level; + + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; + + // 0 = Intra, Last, Last2+Last3, + // GF, BRF, ARF2, ARF + int8_t ref_deltas[REF_FRAMES]; + + // 0 = ZERO_MV, MV + int8_t mode_deltas[MAX_MODE_LF_DELTAS]; +}; + +// Need to align this structure so when it is declared and +// passed it can be loaded into vector registers. +typedef struct { + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]); +} loop_filter_thresh; + +typedef struct { + loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; + uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS]; +} loop_filter_info_n; + +typedef struct AV1_DEBLOCKING_PARAMETERS { + // length of the filter applied to the outer edge + uint8_t filter_length; + // deblocking limits + const loop_filter_thresh *lfthr; +} AV1_DEBLOCKING_PARAMETERS; + +typedef struct LoopFilterWorkerData { + YV12_BUFFER_CONFIG *frame_buffer; + struct AV1Common *cm; + struct macroblockd_plane planes[MAX_MB_PLANE]; + // TODO(Ranjit): When the filter functions are modified to use xd->lossless + // add lossless as a member here. 
+ MACROBLOCKD *xd; + + AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE]; + TX_SIZE tx_buf[MAX_MIB_SIZE]; + struct aom_internal_error_info error_info; +} LFWorkerData; +/*!\endcond */ + +/* assorted loopfilter functions which get used elsewhere */ +struct AV1Common; +struct macroblockd; +struct AV1LfSyncData; + +void av1_loop_filter_init(struct AV1Common *cm); + +void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start, + int plane_end); + +void av1_filter_block_plane_vert(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col); + +void av1_filter_block_plane_horz(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col); + +void av1_filter_block_plane_vert_opt( + const struct AV1Common *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2); + +void av1_filter_block_plane_vert_opt_chroma( + const struct AV1Common *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, + int num_mis_in_lpf_unit_height_log2); + +void av1_filter_block_plane_horz_opt( + const struct AV1Common *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2); + +void av1_filter_block_plane_horz_opt_chroma( + const struct AV1Common *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const 
uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, + int num_mis_in_lpf_unit_height_log2); + +uint8_t av1_get_filter_level(const struct AV1Common *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_ diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c new file mode 100644 index 0000000000..8a35dca369 --- /dev/null +++ b/third_party/aom/av1/common/av1_rtcd.c @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "config/aom_config.h" + +#define RTCD_C +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_once.h" + +void av1_rtcd(void) { aom_once(setup_rtcd_internal); } diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl new file mode 100644 index 0000000000..c5fe389ba1 --- /dev/null +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -0,0 +1,655 @@ +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. 
If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +sub av1_common_forward_decls() { +print < 0) { + for (i = 0; i < size; i++) { + arr[i] = round_shift(arr[i], bit); + } + } else { + for (i = 0; i < size; i++) { + arr[i] = (int32_t)clamp64(((int64_t)1 << (-bit)) * arr[i], INT32_MIN, + INT32_MAX); + } + } + } +} + +const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D] = { + { TXFM_TYPE_DCT4, TXFM_TYPE_ADST4, TXFM_TYPE_ADST4, TXFM_TYPE_IDENTITY4 }, + { TXFM_TYPE_DCT8, TXFM_TYPE_ADST8, TXFM_TYPE_ADST8, TXFM_TYPE_IDENTITY8 }, + { TXFM_TYPE_DCT16, TXFM_TYPE_ADST16, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY16 }, + { TXFM_TYPE_DCT32, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, + TXFM_TYPE_IDENTITY32 }, + { TXFM_TYPE_DCT64, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID } +}; + +const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = { + 4, // TXFM_TYPE_DCT4 + 6, // TXFM_TYPE_DCT8 + 8, // TXFM_TYPE_DCT16 + 10, // TXFM_TYPE_DCT32 + 12, // TXFM_TYPE_DCT64 + 7, // TXFM_TYPE_ADST4 + 8, // TXFM_TYPE_ADST8 + 10, // TXFM_TYPE_ADST16 + 1, // TXFM_TYPE_IDENTITY4 + 1, // TXFM_TYPE_IDENTITY8 + 1, // TXFM_TYPE_IDENTITY16 + 1, // TXFM_TYPE_IDENTITY32 +}; + +void av1_range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + + int in_range = 1; + + for (int i = 0; i < size; ++i) { + if (buf[i] < min_value || buf[i] > max_value) { + in_range = 0; + } + } + + if (!in_range) { + fprintf(stderr, "Error: coeffs contain out-of-range values\n"); + fprintf(stderr, "size: %d\n", size); + fprintf(stderr, "stage: %d\n", stage); + fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value, + max_value); + + fprintf(stderr, "coeffs: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; 
j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", input[j]); + } + fprintf(stderr, "]\n"); + + fprintf(stderr, " buf: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", buf[j]); + } + fprintf(stderr, "]\n\n"); + } + + assert(in_range); +#else + (void)stage; + (void)input; + (void)buf; + (void)size; + (void)bit; +#endif +} diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h new file mode 100644 index 0000000000..7ad70af86a --- /dev/null +++ b/third_party/aom/av1/common/av1_txfm.h @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_AV1_TXFM_H_ +#define AOM_AV1_COMMON_AV1_TXFM_H_ + +#include <assert.h> +#include <math.h> +#include <stdio.h> + +#include "config/aom_config.h" + +#include "av1/common/enums.h" +#include "av1/common/blockd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if !defined(DO_RANGE_CHECK_CLAMP) +#define DO_RANGE_CHECK_CLAMP 0 +#endif + +extern const int32_t av1_cospi_arr_data[4][64]; +extern const int32_t av1_sinpi_arr_data[4][5]; + +#define MAX_TXFM_STAGE_NUM 12 + +static const int cos_bit_min = 10; + +#define NewSqrt2Bits ((int32_t)12) +// 2^12 * sqrt(2) +static const int32_t NewSqrt2 = 5793; +// 2^12 / sqrt(2) +static const int32_t NewInvSqrt2 = 2896; + +static INLINE const int32_t *cospi_arr(int n) { + return av1_cospi_arr_data[n - cos_bit_min]; +} + +static INLINE const int32_t *sinpi_arr(int n) { + return av1_sinpi_arr_data[n - cos_bit_min]; +} + +// The reduced bit-width and permuted arrays are only used in the Arm Neon +// implementations in av1_fwd_txfm2d_neon.c and highbd_fwd_txfm_neon.c for now. +#if HAVE_NEON +// Store cospi/sinpi constants in Q2.13 format. 
+// See: https://en.wikipedia.org/wiki/Q_(number_format) +extern const int16_t av1_cospi_arr_q13_data[4][128]; +extern const int16_t av1_sinpi_arr_q13_data[4][4]; + +extern const int32_t av1_cospi_arr_s32_data[4][66]; + +static INLINE const int16_t *cospi_arr_q13(int n) { + return av1_cospi_arr_q13_data[n - cos_bit_min]; +} + +static INLINE const int16_t *sinpi_arr_q13(int n) { + return av1_sinpi_arr_q13_data[n - cos_bit_min]; +} + +static INLINE const int32_t *cospi_arr_s32(int n) { + return av1_cospi_arr_s32_data[n - cos_bit_min]; +} +#endif // HAVE_NEON + +static INLINE int32_t range_check_value(int32_t value, int8_t bit) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + if (value < min_value || value > max_value) { + fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit); +#if !CONFIG_AV1_ENCODER + assert(0); +#endif + } +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING +#if DO_RANGE_CHECK_CLAMP + bit = AOMMIN(bit, 31); + return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1); +#endif // DO_RANGE_CHECK_CLAMP + (void)bit; + return value; +} + +static INLINE int32_t round_shift(int64_t value, int bit) { + assert(bit >= 1); + return (int32_t)((value + (1ll << (bit - 1))) >> bit); +} + +static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1, + int bit) { + int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1); + int64_t intermediate = result_64 + (1LL << (bit - 1)); + // NOTE(rachelbarker): The value 'result_64' may not necessarily fit + // into 32 bits. However, the result of this function is nominally + // ROUND_POWER_OF_TWO_64(result_64, bit) + // and that is required to fit into stage_range[stage] many bits + // (checked by range_check_buf()). + // + // Here we've unpacked that rounding operation, and it can be shown + // that the value of 'intermediate' here *does* fit into 32 bits + // for any conformant bitstream. 
+ // The upshot is that, if you do all this calculation using + // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic, + // then you'll still get the correct result. + // To provide a check on this logic, we assert that 'intermediate' + // would fit into an int32 if range checking is enabled. +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX); +#endif + return (int32_t)(intermediate >> bit); +} + +static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, + int bd) { + return clip_pixel_highbd(dest + (int)trans, bd); +} + +typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); + +typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd); + +enum { + TXFM_TYPE_DCT4, + TXFM_TYPE_DCT8, + TXFM_TYPE_DCT16, + TXFM_TYPE_DCT32, + TXFM_TYPE_DCT64, + TXFM_TYPE_ADST4, + TXFM_TYPE_ADST8, + TXFM_TYPE_ADST16, + TXFM_TYPE_IDENTITY4, + TXFM_TYPE_IDENTITY8, + TXFM_TYPE_IDENTITY16, + TXFM_TYPE_IDENTITY32, + TXFM_TYPES, + TXFM_TYPE_INVALID, +} UENUM1BYTE(TXFM_TYPE); + +typedef struct TXFM_2D_FLIP_CFG { + TX_SIZE tx_size; + int ud_flip; // flip upside down + int lr_flip; // flip left to right + const int8_t *shift; + int8_t cos_bit_col; + int8_t cos_bit_row; + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + TXFM_TYPE txfm_type_col; + TXFM_TYPE txfm_type_row; + int stage_num_col; + int stage_num_row; +} TXFM_2D_FLIP_CFG; + +static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + *ud_flip = 0; + *lr_flip = 0; + break; + case IDTX: + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + *ud_flip = 0; + *lr_flip = 0; + break; + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: + *ud_flip = 1; + *lr_flip = 0; + break; + case 
// Utility function that returns log2(col / row) for the supported transform
// shapes: 0 for square, +1 / +2 when col is 2x / 4x row, and -1 / -2 when row
// is 2x / 4x col. Any other ratio trips an assert and yields 0.
static INLINE int get_rect_tx_log_ratio(int col, int row) {
  if (col == row) return 0;

  // Normalize to (longer, shorter) and remember which side was longer.
  const int col_is_longer = col > row;
  const int longer = col_is_longer ? col : row;
  const int shorter = col_is_longer ? row : col;
  const int sign = col_is_longer ? 1 : -1;

  if (longer == shorter * 2) return sign;
  if (longer == shorter * 4) return 2 * sign;

  assert(0 && "Unsupported transform size");
  return 0;  // Invalid
}
b/third_party/aom/av1/common/blockd.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) { + if (!left_mi) return DC_PRED; + assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi)); + return left_mi->mode; +} + +PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) { + if (!above_mi) return DC_PRED; + assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi)); + return above_mi->mode; +} + +void av1_set_entropy_contexts(const MACROBLOCKD *xd, + struct macroblockd_plane *pd, int plane, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff) { + ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff; + ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff; + const int txs_wide = tx_size_wide_unit[tx_size]; + const int txs_high = tx_size_high_unit[tx_size]; + + // above + if (has_eob && xd->mb_to_right_edge < 0) { + const int blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int above_contexts = AOMMIN(txs_wide, blocks_wide - aoff); + memset(a, has_eob, sizeof(*a) * above_contexts); + memset(a + above_contexts, 0, sizeof(*a) * (txs_wide - above_contexts)); + } else { + memset(a, has_eob, sizeof(*a) * txs_wide); + } + + // left + if (has_eob && xd->mb_to_bottom_edge < 0) { + const int blocks_high = max_block_high(xd, plane_bsize, plane); + const int 
left_contexts = AOMMIN(txs_high, blocks_high - loff); + memset(l, has_eob, sizeof(*l) * left_contexts); + memset(l + left_contexts, 0, sizeof(*l) * (txs_high - left_contexts)); + } else { + memset(l, has_eob, sizeof(*l) * txs_high); + } +} +void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize, + const int num_planes) { + assert(bsize < BLOCK_SIZES_ALL); + const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref; + for (int i = 0; i < nplanes; i++) { + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int txs_wide = mi_size_wide[plane_bsize]; + const int txs_high = mi_size_high[plane_bsize]; + memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide); + memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high); + } +} + +void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) { + xd->delta_lf_from_base = 0; + const int frame_lf_count = + num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) xd->delta_lf[lf_id] = 0; +} + +void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) { + for (int p = 0; p < num_planes; ++p) { + set_default_wiener(xd->wiener_info + p); + set_default_sgrproj(xd->sgrproj_info + p); + } +} + +void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, + const int num_planes) { + int i; + + for (i = 0; i < num_planes; i++) { + xd->plane[i].plane_type = get_plane_type(i); + xd->plane[i].subsampling_x = i ? ss_x : 0; + xd->plane[i].subsampling_y = i ? 
ss_y : 0; + } + for (i = num_planes; i < MAX_MB_PLANE; i++) { + xd->plane[i].subsampling_x = 1; + xd->plane[i].subsampling_y = 1; + } +} diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h new file mode 100644 index 0000000000..0cfd1f3954 --- /dev/null +++ b/third_party/aom/av1/common/blockd.h @@ -0,0 +1,1612 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_BLOCKD_H_ +#define AOM_AV1_COMMON_BLOCKD_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" + +#include "av1/common/common_data.h" +#include "av1/common/quant_common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/mv.h" +#include "av1/common/scale.h" +#include "av1/common/seg_common.h" +#include "av1/common/tile_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define USE_B_QUANT_NO_TRELLIS 1 + +#define MAX_MB_PLANE 3 + +#define MAX_DIFFWTD_MASK_BITS 1 + +#define INTERINTRA_WEDGE_SIGN 0 + +#define DEFAULT_INTER_TX_TYPE DCT_DCT + +#define MAX_PALETTE_BLOCK_WIDTH 64 + +#define MAX_PALETTE_BLOCK_HEIGHT 64 + +/*!\cond */ + +// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS +enum { + DIFFWTD_38 = 0, + DIFFWTD_38_INV, + DIFFWTD_MASK_TYPES, +} UENUM1BYTE(DIFFWTD_MASK_TYPE); + +enum { + KEY_FRAME = 0, + INTER_FRAME = 1, + INTRA_ONLY_FRAME = 2, // replaces intra-only + S_FRAME = 3, + FRAME_TYPES, +} 
UENUM1BYTE(FRAME_TYPE); + +static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) { + return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; +} + +static INLINE int is_inter_mode(PREDICTION_MODE mode) { + return mode >= INTER_MODE_START && mode < INTER_MODE_END; +} + +typedef struct { + uint8_t *plane[MAX_MB_PLANE]; + int stride[MAX_MB_PLANE]; +} BUFFER_SET; + +static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) { + return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END; +} +static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) { + return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END; +} + +static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { + static const PREDICTION_MODE lut[] = { + DC_PRED, // DC_PRED + V_PRED, // V_PRED + H_PRED, // H_PRED + D45_PRED, // D45_PRED + D135_PRED, // D135_PRED + D113_PRED, // D113_PRED + D157_PRED, // D157_PRED + D203_PRED, // D203_PRED + D67_PRED, // D67_PRED + SMOOTH_PRED, // SMOOTH_PRED + SMOOTH_V_PRED, // SMOOTH_V_PRED + SMOOTH_H_PRED, // SMOOTH_H_PRED + PAETH_PRED, // PAETH_PRED + NEARESTMV, // NEARESTMV + NEARMV, // NEARMV + GLOBALMV, // GLOBALMV + NEWMV, // NEWMV + NEARESTMV, // NEAREST_NEARESTMV + NEARMV, // NEAR_NEARMV + NEARESTMV, // NEAREST_NEWMV + NEWMV, // NEW_NEARESTMV + NEARMV, // NEAR_NEWMV + NEWMV, // NEW_NEARMV + GLOBALMV, // GLOBAL_GLOBALMV + NEWMV, // NEW_NEWMV + }; + assert(NELEMENTS(lut) == MB_MODE_COUNT); + assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode)); + return lut[mode]; +} + +static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) { + static const PREDICTION_MODE lut[] = { + MB_MODE_COUNT, // DC_PRED + MB_MODE_COUNT, // V_PRED + MB_MODE_COUNT, // H_PRED + MB_MODE_COUNT, // D45_PRED + MB_MODE_COUNT, // D135_PRED + MB_MODE_COUNT, // D113_PRED + MB_MODE_COUNT, // D157_PRED + MB_MODE_COUNT, // D203_PRED + MB_MODE_COUNT, // D67_PRED + MB_MODE_COUNT, // SMOOTH_PRED + MB_MODE_COUNT, // 
SMOOTH_V_PRED + MB_MODE_COUNT, // SMOOTH_H_PRED + MB_MODE_COUNT, // PAETH_PRED + MB_MODE_COUNT, // NEARESTMV + MB_MODE_COUNT, // NEARMV + MB_MODE_COUNT, // GLOBALMV + MB_MODE_COUNT, // NEWMV + NEARESTMV, // NEAREST_NEARESTMV + NEARMV, // NEAR_NEARMV + NEWMV, // NEAREST_NEWMV + NEARESTMV, // NEW_NEARESTMV + NEWMV, // NEAR_NEWMV + NEARMV, // NEW_NEARMV + GLOBALMV, // GLOBAL_GLOBALMV + NEWMV, // NEW_NEWMV + }; + assert(NELEMENTS(lut) == MB_MODE_COUNT); + assert(is_inter_compound_mode(mode)); + return lut[mode]; +} + +static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) { + return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAR_NEWMV || + mode == NEW_NEARMV); +} + +static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) { + return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV || + mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV); +} + +static INLINE int is_masked_compound_type(COMPOUND_TYPE type) { + return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD); +} + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. */ + +typedef struct { + // Value of base colors for Y, U, and V + uint16_t palette_colors[3 * PALETTE_MAX_SIZE]; + // Number of base colors for Y (0) and UV (1) + uint8_t palette_size[2]; +} PALETTE_MODE_INFO; + +typedef struct { + FILTER_INTRA_MODE filter_intra_mode; + uint8_t use_filter_intra; +} FILTER_INTRA_MODE_INFO; + +static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = { + DC_PRED, V_PRED, H_PRED, D157_PRED, DC_PRED +}; + +#if CONFIG_RD_DEBUG +#define TXB_COEFF_COST_MAP_SIZE (MAX_MIB_SIZE) +#endif + +typedef struct RD_STATS { + int rate; + int zero_rate; + int64_t dist; + // Please be careful of using rdcost, it's not guaranteed to be set all the + // time. + // TODO(angiebird): Create a set of functions to manipulate the RD_STATS. 
/*! \brief Stores the prediction/txfm mode of the current coding block
 *
 * The members are grouped into general, inter, intra, transform and loop
 * filter sections, followed by a set of bit-fields and optional debug /
 * inspection members that only exist when CONFIG_RD_DEBUG or
 * CONFIG_INSPECTION is enabled.
 */
typedef struct MB_MODE_INFO {
  /*****************************************************************************
   * \name General Info of the Coding Block
   ****************************************************************************/
  /**@{*/
  /*! \brief The block size of the current coding block */
  BLOCK_SIZE bsize;
  /*! \brief The partition type of the current coding block. */
  PARTITION_TYPE partition;
  /*! \brief The prediction mode used */
  PREDICTION_MODE mode;
  /*! \brief The UV mode when intra is used */
  UV_PREDICTION_MODE uv_mode;
  /*! \brief The q index for the current coding block. */
  int current_qindex;
  /**@}*/

  /*****************************************************************************
   * \name Inter Mode Info
   ****************************************************************************/
  /**@{*/
  /*! \brief The motion vectors used by the current inter mode */
  int_mv mv[2];
  /*! \brief The reference frames for the MV.
   * ref_frame[0] > INTRA_FRAME is what is_inter_block() tests (besides
   * intrabc); ref_frame[1] > INTRA_FRAME is what has_second_ref() tests. */
  MV_REFERENCE_FRAME ref_frame[2];
  /*! \brief Filter used in subpel interpolation. */
  int_interpfilters interp_filters;
  /*! \brief The motion mode used by the inter prediction. */
  MOTION_MODE motion_mode;
  /*! \brief Number of samples used by warp causal */
  uint8_t num_proj_ref;
  /*! \brief The number of overlapped neighbors above/left for obmc/warp motion
   * mode. */
  uint8_t overlappable_neighbors;
  /*! \brief The parameters used in warp motion mode. */
  WarpedMotionParams wm_params;
  /*! \brief The type of intra mode used by inter-intra */
  INTERINTRA_MODE interintra_mode;
  /*! \brief The type of wedge used in interintra mode. */
  int8_t interintra_wedge_index;
  /*! \brief Struct that stores the data used in interinter compound mode. */
  INTERINTER_COMPOUND_DATA interinter_comp;
  /**@}*/

  /*****************************************************************************
   * \name Intra Mode Info
   ****************************************************************************/
  /**@{*/
  /*! \brief Directional mode delta: the angle is base angle + (angle_delta *
   * step). One entry per plane type. */
  int8_t angle_delta[PLANE_TYPES];
  /*! \brief The type of filter intra mode used (if applicable). */
  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
  /*! \brief Chroma from Luma: Joint sign of alpha Cb and alpha Cr */
  int8_t cfl_alpha_signs;
  /*! \brief Chroma from Luma: Index of the alpha Cb and alpha Cr combination */
  uint8_t cfl_alpha_idx;
  /*! \brief Stores the size and colors of palette mode */
  PALETTE_MODE_INFO palette_mode_info;
  /**@}*/

  /*****************************************************************************
   * \name Transform Info
   ****************************************************************************/
  /**@{*/
  /*! \brief Whether to skip transforming and sending. */
  uint8_t skip_txfm;
  /*! \brief Transform size when fixed size txfm is used (e.g. intra modes). */
  TX_SIZE tx_size;
  /*! \brief Transform size when recursive txfm tree is on.
   * Holds INTER_TX_SIZE_BUF_LEN entries. */
  TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
  /**@}*/

  /*****************************************************************************
   * \name Loop Filter Info
   ****************************************************************************/
  /**@{*/
  /*! \copydoc MACROBLOCKD::delta_lf_from_base */
  int8_t delta_lf_from_base;
  /*! \copydoc MACROBLOCKD::delta_lf */
  int8_t delta_lf[FRAME_LF_COUNT];
  /**@}*/

  /*****************************************************************************
   * \name Bitfield for Memory Reduction
   ****************************************************************************/
  /**@{*/
  /*! \brief The segment id (3-bit field) */
  uint8_t segment_id : 3;
  /*! \brief Only valid when temporal update is off. */
  uint8_t seg_id_predicted : 1;
  /*! \brief Which ref_mv to use (2-bit field) */
  uint8_t ref_mv_idx : 2;
  /*! \brief Inter skip mode */
  uint8_t skip_mode : 1;
  /*! \brief Whether intrabc is used. Read by is_intrabc_block(). */
  uint8_t use_intrabc : 1;
  /*! \brief Indicates if masked compound is used (1) or not (0). */
  uint8_t comp_group_idx : 1;
  /*! \brief Indicates whether dist_wtd_comp is used or not.
   * NOTE(review): the previous wording gave the value 0 for both the "used"
   * and the "not used" case; confirm against the code that reads compound_idx
   * which value selects dist_wtd_comp. */
  uint8_t compound_idx : 1;
  /*! \brief Whether to use interintra wedge */
  uint8_t use_wedge_interintra : 1;
  /*! \brief CDEF strength per BLOCK_64X64 (signed 4-bit field) */
  int8_t cdef_strength : 4;
  /**@}*/

#if CONFIG_RD_DEBUG
  /*! \brief RD info used for debugging */
  RD_STATS rd_stats;
  /*! \brief The current row in unit of 4x4 blocks for debugging */
  int mi_row;
  /*! \brief The current col in unit of 4x4 blocks for debugging */
  int mi_col;
#endif
#if CONFIG_INSPECTION
  /*! \brief Whether we are skipping the current rows or columns.
   * Holds TXK_TYPE_BUF_LEN entries. */
  int16_t tx_skip[TXK_TYPE_BUF_LEN];
#endif
} MB_MODE_INFO;
LAST_GOLDEN_FRAMES, + ALTREF_FRAME, // BWDREF_ALTREF_FRAMES, + LAST3_FRAME, // LAST2_LAST3_FRAMES + GOLDEN_FRAME, // LAST2_GOLDEN_FRAMES, + GOLDEN_FRAME, // LAST3_GOLDEN_FRAMES, + ALTREF2_FRAME, // BWDREF_ALTREF2_FRAMES, + ALTREF_FRAME, // ALTREF2_ALTREF_FRAMES, + }; + assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS); + return lut[ref_idx]; +} + +PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi); + +PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi); + +static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi, + TransformationType type) { + const PREDICTION_MODE mode = mbmi->mode; + const BLOCK_SIZE bsize = mbmi->bsize; + const int block_size_allowed = + AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; + return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION && + block_size_allowed; +} + +#if CONFIG_MISMATCH_DEBUG +static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, + int mi_row, int tx_blk_col, int tx_blk_row, + int subsampling_x, int subsampling_y) { + *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) + + (tx_blk_col << MI_SIZE_LOG2); + *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) + + (tx_blk_row << MI_SIZE_LOG2); +} +#endif + +enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision); + +struct buf_2d { + uint8_t *buf; + uint8_t *buf0; + int width; + int height; + int stride; +}; + +typedef struct eob_info { + uint16_t eob; + uint16_t max_scan_line; +} eob_info; + +typedef struct { + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]); + eob_info eob_data[MAX_MB_PLANE] + [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; + DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]); +} CB_BUFFER; + +typedef struct macroblockd_plane { + PLANE_TYPE plane_type; + int subsampling_x; + int subsampling_y; + struct buf_2d dst; + struct buf_2d pre[2]; + ENTROPY_CONTEXT *above_entropy_context; + ENTROPY_CONTEXT 
*left_entropy_context; + + // The dequantizers below are true dequantizers used only in the + // dequantization process. They have the same coefficient + // shift/scale as TX. + int16_t seg_dequant_QTX[MAX_SEGMENTS][2]; + // Pointer to color index map of: + // - Current coding block, on encoder side. + // - Current superblock, on decoder side. + uint8_t *color_index_map; + + // block size in pixels + uint8_t width, height; + + qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; +} MACROBLOCKD_PLANE; + +#define BLOCK_OFFSET(i) ((i) << 4) + +/*!\endcond */ + +/*!\brief Parameters related to Wiener Filter */ +typedef struct { + /*! + * Vertical filter kernel. + */ + DECLARE_ALIGNED(16, InterpKernel, vfilter); + + /*! + * Horizontal filter kernel. + */ + DECLARE_ALIGNED(16, InterpKernel, hfilter); +} WienerInfo; + +/*!\brief Parameters related to Sgrproj Filter */ +typedef struct { + /*! + * Parameter index. + */ + int ep; + + /*! + * Weights for linear combination of filtered versions + */ + int xqd[2]; +} SgrprojInfo; + +/*!\cond */ + +#define CFL_MAX_BLOCK_SIZE (BLOCK_32X32) +#define CFL_BUF_LINE (32) +#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3) +#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4) +#define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE) +typedef struct cfl_ctx { + // Q3 reconstructed luma pixels (only Q2 is required, but Q3 is used to avoid + // shifts) + uint16_t recon_buf_q3[CFL_BUF_SQUARE]; + // Q3 AC contributions (reconstructed luma pixels - tx block avg) + int16_t ac_buf_q3[CFL_BUF_SQUARE]; + + // Cache the DC_PRED when performing RDO, so it does not have to be recomputed + // for every scaling parameter + bool dc_pred_is_cached[CFL_PRED_PLANES]; + // Whether the DC_PRED cache is enabled. The DC_PRED cache is disabled when + // decoding. 
/*! \brief Variables related to current coding block.
 *
 * This is a common set of variables used by both encoder and decoder.
 * Most/all of the pointers are mere pointers to actual arrays that are
 * allocated elsewhere. This is mostly for coding convenience.
 */
  /*!
   * Left chroma reference block if is_chroma_ref == true for the current block
   * and chroma_left_available == true; otherwise NULL.
   * See also: the special case logic when current chroma block covers more than
   * one luma block in set_mi_row_col().
   */
  MB_MODE_INFO *chroma_left_mbmi;
  /*!
   * Above chroma reference block if is_chroma_ref == true for the current block
   * and chroma_up_available == true; otherwise NULL.
   * See also: the special case logic when current chroma block covers more than
   * one luma block in set_mi_row_col().
   */
  MB_MODE_INFO *chroma_above_mbmi;
+ * Appropriate offset based on current 'mi_row' and 'mi_col', inside + * 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or + * 'MACROBLOCK' structs. + */ + uint8_t *tx_type_map; + /*! + * Stride for 'tx_type_map'. Note that this may / may not be same as + * 'mi_stride', depending on which actual array 'tx_type_map' points to. + */ + int tx_type_map_stride; + + /** + * \name Distance of this macroblock from frame edges in 1/8th pixel units. + */ + /**@{*/ + int mb_to_left_edge; /*!< Distance from left edge */ + int mb_to_right_edge; /*!< Distance from right edge */ + int mb_to_top_edge; /*!< Distance from top edge */ + int mb_to_bottom_edge; /*!< Distance from bottom edge */ + /**@}*/ + + /*! + * Scale factors for reference frames of the current block. + * These are pointers into 'cm->ref_scale_factors'. + */ + const struct scale_factors *block_ref_scale_factors[2]; + + /*! + * - On encoder side: points to cpi->source, which is the buffer containing + * the current *source* frame (maybe filtered). + * - On decoder side: points to cm->cur_frame->buf, which is the buffer into + * which current frame is being *decoded*. + */ + const YV12_BUFFER_CONFIG *cur_buf; + + /*! + * Entropy contexts for the above blocks. + * above_entropy_context[i][j] corresponds to above entropy context for ith + * plane and jth mi column of this *frame*, wrt current 'mi_row'. + * These are pointers into 'cm->above_contexts.entropy'. + */ + ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE]; + /*! + * Entropy contexts for the left blocks. + * left_entropy_context[i][j] corresponds to left entropy context for ith + * plane and jth mi row of this *superblock*, wrt current 'mi_col'. + * Note: These contain actual data, NOT pointers. + */ + ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE]; + + /*! + * Partition contexts for the above blocks. 
  /*!
   * left_txfm_context_buffer[i] is the left transform context for ith mi_row
   * in this *superblock*.
   * Behaves like an internal actual buffer which 'left_txfm_context' points to,
   * and is never accessed directly except to fill in initial default values.
   */
+ */ + /**@{*/ + WienerInfo wiener_info[MAX_MB_PLANE]; /*!< Defaults for Wiener filter*/ + SgrprojInfo sgrproj_info[MAX_MB_PLANE]; /*!< Defaults for SGR filter */ + /**@}*/ + + /** + * \name Block dimensions in MB_MODE_INFO units. + */ + /**@{*/ + uint8_t width; /*!< Block width in MB_MODE_INFO units */ + uint8_t height; /*!< Block height in MB_MODE_INFO units */ + /**@}*/ + + /*! + * Contains the motion vector candidates found during motion vector prediction + * process. ref_mv_stack[i] contains the candidates for ith type of + * reference frame (single/compound). The actual number of candidates found in + * ref_mv_stack[i] is stored in either dcb->ref_mv_count[i] (decoder side) + * or mbmi_ext->ref_mv_count[i] (encoder side). + */ + CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; + /*! + * weight[i][j] is the weight for ref_mv_stack[i][j] and used to compute the + * DRL (dynamic reference list) mode contexts. + */ + uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; + + /*! + * True if this is the last vertical rectangular block in a VERTICAL or + * VERTICAL_4 partition. + */ + bool is_last_vertical_rect; + /*! + * True if this is the 1st horizontal rectangular block in a HORIZONTAL or + * HORIZONTAL_4 partition. + */ + bool is_first_horizontal_rect; + + /*! + * Counts of each reference frame in the above and left neighboring blocks. + * NOTE: Take into account both single and comp references. + */ + uint8_t neighbors_ref_counts[REF_FRAMES]; + + /*! + * Current CDFs of all the symbols for the current tile. + */ + FRAME_CONTEXT *tile_ctx; + + /*! + * Bit depth: copied from cm->seq_params->bit_depth for convenience. + */ + int bd; + + /*! + * Quantizer index for each segment (base qindex + delta for each segment). + */ + int qindex[MAX_SEGMENTS]; + /*! + * lossless[s] is true if segment 's' is coded losslessly. + */ + int lossless[MAX_SEGMENTS]; + /*! 
+ * Q index for the coding blocks in this superblock will be stored in + * mbmi->current_qindex. Now, when cm->delta_q_info.delta_q_present_flag is + * true, mbmi->current_qindex is computed by taking 'current_base_qindex' as + * the base, and adding any transmitted delta qindex on top of it. + * Precisely, this is the latest qindex used by the first coding block of a + * non-skip superblock in the current tile; OR + * same as cm->quant_params.base_qindex (if not explicitly set yet). + * Note: This is 'CurrentQIndex' in the AV1 spec. + */ + int current_base_qindex; + + /*! + * Same as cm->features.cur_frame_force_integer_mv. + */ + int cur_frame_force_integer_mv; + + /*! + * Pointer to cm->error. + */ + struct aom_internal_error_info *error_info; + + /*! + * Same as cm->global_motion. + */ + const WarpedMotionParams *global_motion; + + /*! + * Since the actual frame level loop filtering level value is not available + * at the beginning of the tile (it is only available during actual filtering) + * at the encoder side, we record the delta_lf (against the frame level loop + * filtering level) and code the delta between previous superblock's delta + * lf and current delta lf. It is equivalent to the delta between previous + * superblock's actual lf and current lf. + */ + int8_t delta_lf_from_base; + /*! + * We have four frame filter levels for different planes and directions. So, to + * support the per superblock update, we need to add a few more params: + * 0. delta loop filter level for y plane vertical + * 1. delta loop filter level for y plane horizontal + * 2. delta loop filter level for u plane + * 3. delta loop filter level for v plane + * To make it consistent with the reference to each filter level in segment, + * we need to subtract 1, since + * - SEG_LVL_ALT_LF_Y_V = 1; + * - SEG_LVL_ALT_LF_Y_H = 2; + * - SEG_LVL_ALT_LF_U = 3; + * - SEG_LVL_ALT_LF_V = 4; + */ + int8_t delta_lf[FRAME_LF_COUNT]; + /*!
+ * cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the + * current superblock has already been read from (decoder) / written to + * (encoder) the bitstream; and false otherwise. + * More detail: + * 1. CDEF strength is transmitted only once per CDEF unit, in the 1st + * non-skip coding block. So, we need this array to keep track of whether CDEF + * strengths for the given CDEF units have been transmitted yet or not. + * 2. Superblock size can be either 128x128 or 64x64, but CDEF unit size is + * fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if + * superblock size is 128x128). Hence the array size is 4. + * 3. In the current implementation, CDEF strength for this CDEF unit is + * stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside + * cm->mi_params.mi_grid_base). + */ + bool cdef_transmitted[4]; + + /*! + * Mask for this block used for compound prediction. + */ + uint8_t *seg_mask; + + /*! + * CFL (chroma from luma) related parameters. + */ + CFL_CTX cfl; + + /*! + * Offset to plane[p].color_index_map. + * Currently: + * - On encoder side, this is always 0 as 'color_index_map' is allocated per + * *coding block* there. + * - On decoder side, this may be non-zero, as 'color_index_map' is a (static) + * memory pointing to the base of a *superblock* there, and we need an offset + * to it to get the color index map for current coding block. + */ + uint16_t color_index_map_offset[2]; + + /*! + * Temporary buffer used for convolution in case of compound reference only + * for (weighted or uniform) averaging operation. + * There are pointers to actual buffers allocated elsewhere: e.g. + * - In decoder, 'pbi->td.tmp_conv_dst' or + * 'pbi->thread_data[t].td->xd.tmp_conv_dst' and + * - In encoder, 'x->tmp_conv_dst' or + * 'cpi->tile_thr_data[t].td->mb.tmp_conv_dst'. + */ + CONV_BUF_TYPE *tmp_conv_dst; + /*! 
+ * Temporary buffers used to build OBMC prediction by above (index 0) and left + * (index 1) predictors respectively. + * tmp_obmc_bufs[i][p * MAX_SB_SQUARE] is the buffer used for plane 'p'. + * There are pointers to actual buffers allocated elsewhere: e.g. + * - In decoder, 'pbi->td.tmp_obmc_bufs' or + * 'pbi->thread_data[t].td->xd.tmp_conv_dst' and + * -In encoder, 'x->tmp_pred_bufs' or + * 'cpi->tile_thr_data[t].td->mb.tmp_pred_bufs'. + */ + uint8_t *tmp_obmc_bufs[2]; +} MACROBLOCKD; + +/*!\cond */ + +static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) { +#if CONFIG_AV1_HIGHBITDEPTH + return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0; +#else + (void)xd; + return 0; +#endif +} + +static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) { +#if CONFIG_AV1_HIGHBITDEPTH + return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? CONVERT_TO_BYTEPTR(buf16) + : buf16; +#else + (void)xd; + return buf16; +#endif +} + +typedef struct BitDepthInfo { + int bit_depth; + /*! Is the image buffer high bit depth? + * Low bit depth buffer uses uint8_t. + * High bit depth buffer uses uint16_t. + * Equivalent to cm->seq_params->use_highbitdepth + */ + int use_highbitdepth_buf; +} BitDepthInfo; + +static INLINE BitDepthInfo get_bit_depth_info(const MACROBLOCKD *xd) { + BitDepthInfo bit_depth_info; + bit_depth_info.bit_depth = xd->bd; + bit_depth_info.use_highbitdepth_buf = is_cur_buf_hbd(xd); + assert(IMPLIES(!bit_depth_info.use_highbitdepth_buf, + bit_depth_info.bit_depth == 8)); + return bit_depth_info; +} + +static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_4X4: return 0; + case BLOCK_8X8: return 1; + case BLOCK_16X16: return 2; + case BLOCK_32X32: return 3; + case BLOCK_64X64: return 4; + case BLOCK_128X128: return 5; + default: return SQR_BLOCK_SIZES; + } +} + +// For a square block size 'bsize', returns the size of the sub-blocks used by +// the given partition type. 
If the partition produces sub-blocks of different +// sizes, then the function returns the largest sub-block size. +// Implements the Partition_Subsize lookup table in the spec (Section 9.3. +// Conversion tables). +// Note: the input block size should be square. +// Otherwise it's considered invalid. +static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize, + PARTITION_TYPE partition) { + if (partition == PARTITION_INVALID) { + return BLOCK_INVALID; + } else { + const int sqr_bsize_idx = get_sqr_bsize_idx(bsize); + return sqr_bsize_idx >= SQR_BLOCK_SIZES + ? BLOCK_INVALID + : subsize_lookup[partition][sqr_bsize_idx]; + } +} + +static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi, + PLANE_TYPE plane_type) { + static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = { + DCT_DCT, // DC_PRED + ADST_DCT, // V_PRED + DCT_ADST, // H_PRED + DCT_DCT, // D45_PRED + ADST_ADST, // D135_PRED + ADST_DCT, // D113_PRED + DCT_ADST, // D157_PRED + DCT_ADST, // D203_PRED + ADST_DCT, // D67_PRED + ADST_ADST, // SMOOTH_PRED + ADST_DCT, // SMOOTH_V_PRED + DCT_ADST, // SMOOTH_H_PRED + ADST_ADST, // PAETH_PRED + }; + const PREDICTION_MODE mode = + (plane_type == PLANE_TYPE_Y) ? 
mbmi->mode : get_uv_mode(mbmi->uv_mode); + assert(mode < INTRA_MODES); + return _intra_mode_to_tx_type[mode]; +} + +static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; } + +static INLINE int block_signals_txsize(BLOCK_SIZE bsize) { + return bsize > BLOCK_4X4; +} + +// Number of transform types in each set type +static const int av1_num_ext_tx_set[EXT_TX_SET_TYPES] = { + 1, 2, 5, 7, 12, 16, +}; + +static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = { + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +}; + +// The bitmask corresponds to the transform types as defined in +// enums.h TX_TYPE enumeration type. Setting the bit 0 means to disable +// the use of the corresponding transform type in that table. +// The av1_derived_intra_tx_used_flag table is used when +// use_reduced_intra_txset is set to 2, where one only searches +// the transform types derived from residual statistics. 
+static const uint16_t av1_derived_intra_tx_used_flag[INTRA_MODES] = { + 0x0209, // DC_PRED: 0000 0010 0000 1001 + 0x0403, // V_PRED: 0000 0100 0000 0011 + 0x0805, // H_PRED: 0000 1000 0000 0101 + 0x020F, // D45_PRED: 0000 0010 0000 1111 + 0x0009, // D135_PRED: 0000 0000 0000 1001 + 0x0009, // D113_PRED: 0000 0000 0000 1001 + 0x0009, // D157_PRED: 0000 0000 0000 1001 + 0x0805, // D203_PRED: 0000 1000 0000 0101 + 0x0403, // D67_PRED: 0000 0100 0000 0011 + // NOTE(review): 0x0205 differs from the 0x0209 used for DC_PRED/PAETH_PRED; + // confirm the value is intended. The bit pattern below reflects the value. + 0x0205, // SMOOTH_PRED: 0000 0010 0000 0101 + 0x0403, // SMOOTH_V_PRED: 0000 0100 0000 0011 + 0x0805, // SMOOTH_H_PRED: 0000 1000 0000 0101 + 0x0209, // PAETH_PRED: 0000 0010 0000 1001 +}; + +static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = { + 0x080F, // DC_PRED: 0000 1000 0000 1111 + 0x040F, // V_PRED: 0000 0100 0000 1111 + 0x080F, // H_PRED: 0000 1000 0000 1111 + 0x020F, // D45_PRED: 0000 0010 0000 1111 + 0x080F, // D135_PRED: 0000 1000 0000 1111 + 0x040F, // D113_PRED: 0000 0100 0000 1111 + 0x080F, // D157_PRED: 0000 1000 0000 1111 + 0x080F, // D203_PRED: 0000 1000 0000 1111 + 0x040F, // D67_PRED: 0000 0100 0000 1111 + 0x080F, // SMOOTH_PRED: 0000 1000 0000 1111 + 0x040F, // SMOOTH_V_PRED: 0000 0100 0000 1111 + 0x080F, // SMOOTH_H_PRED: 0000 1000 0000 1111 + 0x0C0E, // PAETH_PRED: 0000 1100 0000 1110 +}; + +static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = { + 0x0001, // 0000 0000 0000 0001 + 0x0201, // 0000 0010 0000 0001 + 0x020F, // 0000 0010 0000 1111 + 0x0E0F, // 0000 1110 0000 1111 + 0x0FFF, // 0000 1111 1111 1111 + 0xFFFF, // 1111 1111 1111 1111 +}; + +static const TxSetType av1_ext_tx_set_lookup[2][2] = { + { EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX }, + { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT }, +}; + +static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, + int use_reduced_set) { + const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; + if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY; + if (tx_size_sqr_up == TX_32X32)
+ return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY; + if (use_reduced_set) + return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX; + const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size]; + return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16]; +} + +// Maps tx set types to the indices. +static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = { + { // Intra + 0, -1, 2, 1, -1, -1 }, + { // Inter + 0, 3, -1, -1, 2, 1 }, +}; + +static INLINE int get_ext_tx_set(TX_SIZE tx_size, int is_inter, + int use_reduced_set) { + const TxSetType set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); + return ext_tx_set_index[is_inter][set_type]; +} + +static INLINE int get_ext_tx_types(TX_SIZE tx_size, int is_inter, + int use_reduced_set) { + const int set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); + return av1_num_ext_tx_set[set_type]; +} + +#define TXSIZEMAX(t1, t2) (tx_size_2d[(t1)] >= tx_size_2d[(t2)] ? (t1) : (t2)) +#define TXSIZEMIN(t1, t2) (tx_size_2d[(t1)] <= tx_size_2d[(t2)] ? (t1) : (t2)) + +static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) { + const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize]; + if (bsize == BLOCK_4X4) + return AOMMIN(max_txsize_lookup[bsize], largest_tx_size); + if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size) + return max_rect_tx_size; + else + return largest_tx_size; +} + +static const uint8_t mode_to_angle_map[INTRA_MODES] = { + 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0, +}; + +// Converts block_index for given transform size to index of the block in raster +// order. +static INLINE int av1_block_index_to_raster_order(TX_SIZE tx_size, + int block_idx) { + // For transform size 4x8, the possible block_idx values are 0 & 2, because + // block_idx values are incremented in steps of size 'tx_width_unit x + // tx_height_unit'. 
But, for this transform size, block_idx = 2 corresponds to + // block number 1 in raster order, inside an 8x8 MI block. + // For any other transform size, the two indices are equivalent. + return (tx_size == TX_4X8 && block_idx == 2) ? 1 : block_idx; +} + +// Inverse of above function. +// Note: only implemented for transform sizes 4x4, 4x8 and 8x4 right now. +static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size, + int raster_order) { + assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4); + // We ensure that block indices are 0 & 2 if tx size is 4x8 or 8x4. + return (tx_size == TX_4X4) ? raster_order : (raster_order > 0) ? 2 : 0; +} + +static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, + const MACROBLOCKD *xd, + TX_SIZE tx_size, + int use_screen_content_tools) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + + if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y || + xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 || + use_screen_content_tools) + return DEFAULT_INTER_TX_TYPE; + + return intra_mode_to_tx_type(mbmi, plane_type); +} + +// Implements the get_plane_residual_size() function in the spec (Section +// 5.11.38. Get plane residual size function). 
+static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, + int subsampling_x, + int subsampling_y) { + assert(bsize < BLOCK_SIZES_ALL); + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); + return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y]; +} + +/* + * Logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ +static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row, + int blk_col) { + static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2, + }; + static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1, + }; + const int index = + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); + assert(index < INTER_TX_SIZE_BUF_LEN); + return index; +} + +#if CONFIG_INSPECTION +/* + * Here is the logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ +static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row, + int blk_col) { + 
static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 0, 2, + }; + const int index = + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); + assert(index < TXK_TYPE_BUF_LEN); + return index; +} +#endif // CONFIG_INSPECTION + +static INLINE void update_txk_array(MACROBLOCKD *const xd, int blk_row, + int blk_col, TX_SIZE tx_size, + TX_TYPE tx_type) { + const int stride = xd->tx_type_map_stride; + xd->tx_type_map[blk_row * stride + blk_col] = tx_type; + + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + // The 16x16 unit is due to the constraint from tx_64x64 which sets the + // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block + // size, the constraint takes effect in 32x16 / 16x32 size too. To solve + // the intricacy, cover all the 16x16 units inside a 64 level transform. 
+ if (txw == tx_size_wide_unit[TX_64X64] || + txh == tx_size_high_unit[TX_64X64]) { + const int tx_unit = tx_size_wide_unit[TX_16X16]; + for (int idy = 0; idy < txh; idy += tx_unit) { + for (int idx = 0; idx < txw; idx += tx_unit) { + xd->tx_type_map[(blk_row + idy) * stride + blk_col + idx] = tx_type; + } + } + } +} + +static INLINE TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd, + PLANE_TYPE plane_type, int blk_row, + int blk_col, TX_SIZE tx_size, + int reduced_tx_set) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { + return DCT_DCT; + } + + TX_TYPE tx_type; + if (plane_type == PLANE_TYPE_Y) { + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; + } else { + if (is_inter_block(mbmi)) { + // scale back to y plane's coordinate + const struct macroblockd_plane *const pd = &xd->plane[plane_type]; + blk_row <<= pd->subsampling_y; + blk_col <<= pd->subsampling_x; + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; + } else { + // In intra mode, uv planes don't share the same prediction mode as y + // plane, so the tx_type should not be shared + tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV); + } + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set); + if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT; + } + assert(tx_type < TX_TYPES); + assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), + reduced_tx_set)][tx_type]); + return tx_type; +} + +void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, + const int num_planes); + +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * int depth = 0; + * while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + */ +static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) { + static const uint8_t 
bsize_to_max_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + return bsize_to_max_depth_table[bsize]; +} + +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * assert(tx_size != TX_4X4); + * int depth = 0; + * while (tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + * assert(depth < 10); + */ +static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4, + }; + const int depth = bsize_to_tx_size_depth_table[bsize]; + assert(depth <= MAX_TX_CATS); + return depth - 1; +} + +static INLINE TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) { + TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + TX_SIZE tx_size = max_tx_size; + for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size]; + return tx_size; +} + +static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) { + switch (tx_size) { + case TX_64X64: + case TX_64X32: + case TX_32X64: return TX_32X32; + case TX_64X16: return TX_32X16; + case TX_16X64: return TX_16X32; + default: return tx_size; + } +} + +static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x, + int subsampling_y) { + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const TX_SIZE uv_tx = max_txsize_rect_lookup[plane_bsize]; + return av1_get_adjusted_tx_size(uv_tx); +} + +static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) { + const MB_MODE_INFO *mbmi = xd->mi[0]; + if (xd->lossless[mbmi->segment_id]) return TX_4X4; + if (plane == 0) return mbmi->tx_size; + const MACROBLOCKD_PLANE *pd = &xd->plane[plane]; + return av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, + pd->subsampling_y); +} 
+ +void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize, + const int num_planes); + +void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes); + +void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes); + +typedef void (*foreach_transformed_block_visitor)(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); + +void av1_set_entropy_contexts(const MACROBLOCKD *xd, + struct macroblockd_plane *pd, int plane, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff); + +// 32x32 matches the largest block size accepted by +// is_interintra_allowed_bsize() below. The replacement list is parenthesized +// so the macro expands as a single value inside larger expressions. +#define MAX_INTERINTRA_SB_SQUARE (32 * 32) +static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) { + return (mbmi->ref_frame[0] > INTRA_FRAME && + mbmi->ref_frame[1] == INTRA_FRAME); +} + +static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) { + return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32); +} + +static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) { + return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END); +} + +static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) { + return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME); +} + +static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) { + return is_interintra_allowed_bsize(mbmi->bsize) && + is_interintra_allowed_mode(mbmi->mode) && + is_interintra_allowed_ref(mbmi->ref_frame); +} + +static INLINE int is_interintra_allowed_bsize_group(int group) { + int i; + for (i = 0; i < BLOCK_SIZES_ALL; i++) { + if (size_group_lookup[i] == group && + is_interintra_allowed_bsize((BLOCK_SIZE)i)) { + return 1; + } + } + return 0; +} + +static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) { + return mbmi->ref_frame[0] > INTRA_FRAME && + mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi); +} + +static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + if (xd->lossless[xd->mi[0]->segment_id]) return
TX_4X4; + const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize]; + if (plane == 0) return max_txsize; // luma + return av1_get_adjusted_tx_size(max_txsize); // chroma +} + +static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; +} + +static INLINE int is_motion_variation_allowed_compound( + const MB_MODE_INFO *mbmi) { + return !has_second_ref(mbmi); +} + +// input: log2 of length, 0(4), 1(8), ... +static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 }; + +static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) { + return mbmi->overlappable_neighbors != 0; +} + +static INLINE MOTION_MODE +motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, int allow_warped_motion) { + if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; + if (xd->cur_frame_force_integer_mv == 0) { + const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype; + if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION; + } + if (is_motion_variation_allowed_bsize(mbmi->bsize) && + is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME && + is_motion_variation_allowed_compound(mbmi)) { + assert(!has_second_ref(mbmi)); + if (mbmi->num_proj_ref >= 1 && allow_warped_motion && + !xd->cur_frame_force_integer_mv && + !av1_is_scaled(xd->block_ref_scale_factors[0])) { + return WARPED_CAUSAL; + } + return OBMC_CAUSAL; + } + return SIMPLE_TRANSLATION; +} + +static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) { + return (is_inter_block(mbmi)); +} + +static INLINE int av1_allow_palette(int allow_screen_content_tools, + BLOCK_SIZE sb_type) { + assert(sb_type < BLOCK_SIZES_ALL); + return allow_screen_content_tools && + block_size_wide[sb_type] <= MAX_PALETTE_BLOCK_WIDTH && + block_size_high[sb_type] <= MAX_PALETTE_BLOCK_HEIGHT && + sb_type >= 
BLOCK_8X8; +} + +// Returns sub-sampled dimensions of the given block. +// The output values for 'rows_within_bounds' and 'cols_within_bounds' will +// differ from 'height' and 'width' when part of the block is outside the +// right +// and/or bottom image boundary. +static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane, + const MACROBLOCKD *xd, int *width, + int *height, + int *rows_within_bounds, + int *cols_within_bounds) { + const int block_height = block_size_high[bsize]; + const int block_width = block_size_wide[bsize]; + const int block_rows = (xd->mb_to_bottom_edge >= 0) + ? block_height + : (xd->mb_to_bottom_edge >> 3) + block_height; + const int block_cols = (xd->mb_to_right_edge >= 0) + ? block_width + : (xd->mb_to_right_edge >> 3) + block_width; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0)); + assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0)); + assert(block_width >= block_cols); + assert(block_height >= block_rows); + const int plane_block_width = block_width >> pd->subsampling_x; + const int plane_block_height = block_height >> pd->subsampling_y; + // Special handling for chroma sub8x8. + const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4; + const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4; + if (width) { + *width = plane_block_width + 2 * is_chroma_sub8_x; + assert(*width >= 0); + } + if (height) { + *height = plane_block_height + 2 * is_chroma_sub8_y; + assert(*height >= 0); + } + if (rows_within_bounds) { + *rows_within_bounds = + (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y; + assert(*rows_within_bounds >= 0); + } + if (cols_within_bounds) { + *cols_within_bounds = + (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x; + assert(*cols_within_bounds >= 0); + } +} + +/* clang-format off */ +// Pointer to a three-dimensional array whose first dimension is PALETTE_SIZES. 
+typedef aom_cdf_prob (*MapCdf)[PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; +// Pointer to a const three-dimensional array whose first dimension is +// PALETTE_SIZES. +typedef const int (*ColorCost)[PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS]; +/* clang-format on */ + +typedef struct { + int rows; + int cols; + int n_colors; + int plane_width; + int plane_height; + uint8_t *color_map; + MapCdf map_cdf; + ColorCost color_cost; +} Av1ColorMapParam; + +static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + int ref; + + // First check if all modes are GLOBALMV + if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0; + + if (AOMMIN(mi_size_wide[mbmi->bsize], mi_size_high[mbmi->bsize]) < 2) + return 0; + + // Now check if all global motion is non translational + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION) return 0; + } + return 1; +} + +static INLINE PLANE_TYPE get_plane_type(int plane) { + return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV; +} + +static INLINE int av1_get_max_eob(TX_SIZE tx_size) { + if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) { + return 1024; + } + if (tx_size == TX_16X64 || tx_size == TX_64X16) { + return 512; + } + return tx_size_2d[tx_size]; +} + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_BLOCKD_H_ diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c new file mode 100644 index 0000000000..12e9545441 --- /dev/null +++ b/third_party/aom/av1/common/cdef.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/cdef.h" +#include "av1/common/cdef_block.h" +#include "av1/common/reconinter.h" +#include "av1/common/thread_common.h" + +static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col, + int mi_stride) { + MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col; + for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) { + for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) { + if (!mbmi[c]->skip_txfm) return 0; + } + } + + return 1; +} + +int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, cdef_list *dlist, + BLOCK_SIZE bs) { + MB_MODE_INFO **grid = mi_params->mi_grid_base; + int maxc = mi_params->mi_cols - mi_col; + int maxr = mi_params->mi_rows - mi_row; + + if (bs == BLOCK_128X128 || bs == BLOCK_128X64) + maxc = AOMMIN(maxc, MI_SIZE_128X128); + else + maxc = AOMMIN(maxc, MI_SIZE_64X64); + if (bs == BLOCK_128X128 || bs == BLOCK_64X128) + maxr = AOMMIN(maxr, MI_SIZE_128X128); + else + maxr = AOMMIN(maxr, MI_SIZE_64X64); + + const int r_step = 2; // mi_size_high[BLOCK_8X8] + const int c_step = 2; // mi_size_wide[BLOCK_8X8] + const int r_shift = 1; + const int c_shift = 1; + int count = 0; + for (int r = 0; r < maxr; r += r_step) { + for (int c = 0; c < maxc; c += c_step) { + if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, + mi_params->mi_stride)) { + dlist[count].by = r >> r_shift; + dlist[count].bx = c >> c_shift; + count++; + } + } + } + return count; +} + +void 
cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, int width, + int height) { + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int width, int height) { + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +void av1_cdef_copy_sb8_16_lowbd(uint16_t *const dst, int dstride, + const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, + int hsize) { + const uint8_t *base = &src[src_voffset * sstride + src_hoffset]; + cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, hsize, vsize); +} + +void av1_cdef_copy_sb8_16_highbd(uint16_t *const dst, int dstride, + const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, + int hsize) { + const uint16_t *base = + &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; + cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, hsize, vsize); +} + +void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst, + int dstride, const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, int hsize) { + if (cm->seq_params->use_highbitdepth) { + av1_cdef_copy_sb8_16_highbd(dst, dstride, src, src_voffset, src_hoffset, + sstride, vsize, hsize); + } else { + av1_cdef_copy_sb8_16_lowbd(dst, dstride, src, src_voffset, src_hoffset, + sstride, vsize, hsize); + } +} + +static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, + int sstride, int v, int h) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +// Prepares intermediate input buffer for CDEF. +// Inputs: +// cm: Pointer to common structure. 
+// fb_info: Pointer to the CDEF block-level parameter structure. +// colbuf: Left column buffer for CDEF. +// cdef_left: Left block is filtered or not. +// fbc, fbr: col and row index of a block. +// plane: plane index Y/CB/CR. +// Returns: +// Nothing will be returned. +static void cdef_prepare_fb(const AV1_COMMON *const cm, CdefBlockInfo *fb_info, + uint16_t **const colbuf, const int cdef_left, + int fbc, int fbr, int plane) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + uint16_t *src = fb_info->src; + const int luma_stride = + ALIGN_POWER_OF_TWO(mi_params->mi_cols << MI_SIZE_LOG2, 4); + const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + int cstart = 0; + if (!cdef_left) cstart = -CDEF_HBORDER; + int rend, cend; + const int nhb = + AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + const int nvb = + AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + const int hsize = nhb << fb_info->mi_wide_l2; + const int vsize = nvb << fb_info->mi_high_l2; + const uint16_t *top_linebuf = fb_info->top_linebuf[plane]; + const uint16_t *bot_linebuf = fb_info->bot_linebuf[plane]; + const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE; + const int stride = + luma_stride >> (plane == AOM_PLANE_Y ? 
0 : cm->seq_params->subsampling_x); + + if (fbc == nhfb - 1) + cend = hsize; + else + cend = hsize + CDEF_HBORDER; + + if (fbr == nvfb - 1) + rend = vsize; + else + rend = vsize + CDEF_VBORDER; + + /* Copy in the pixels we need from the current superblock for + deringing.*/ + av1_cdef_copy_sb8_16( + cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart], + CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, fb_info->coffset + cstart, + fb_info->dst_stride, vsize, cend - cstart); + + /* Copy in the pixels we need for the current superblock from bottom buffer.*/ + if (fbr < nvfb - 1) { + copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset], stride, CDEF_VBORDER, hsize); + } else { + fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, + hsize, CDEF_VERY_LARGE); + } + if (fbr < nvfb - 1 && fbc > 0) { + copy_rect(&src[bot_offset], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset - CDEF_HBORDER], stride, + CDEF_VBORDER, CDEF_HBORDER); + } else { + fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (fbr < nvfb - 1 && fbc < nhfb - 1) { + copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER, + CDEF_HBORDER); + } else { + fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE, + CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } + + /* Copy in the pixels we need from the current superblock from top buffer.*/ + if (fbr > 0) { + copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset], + stride, CDEF_VBORDER, hsize); + } else { + fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize, + CDEF_VERY_LARGE); + } + if (fbr > 0 && fbc > 0) { + copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER], + stride, CDEF_VBORDER, CDEF_HBORDER); + } else { + fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } + if (fbr > 0 && fbc < nhfb 
- 1) { + copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, + &top_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER, + CDEF_HBORDER); + } else { + fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, + CDEF_HBORDER, CDEF_VERY_LARGE); + } + if (cdef_left) { + /* If we deringed the superblock on the left then we need to copy in + saved pixels. */ + copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER, + rend + CDEF_VBORDER, CDEF_HBORDER); + } + /* Saving pixels in case we need to dering the superblock on the + right. */ + copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE, + rend + CDEF_VBORDER, CDEF_HBORDER); + + if (fb_info->frame_boundary[LEFT]) { + fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (fb_info->frame_boundary[RIGHT]) { + fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, + vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } +} + +static INLINE void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane, + uint8_t use_highbitdepth) { + int offset = fb_info->dst_stride * fb_info->roffset + fb_info->coffset; + if (use_highbitdepth) { + av1_cdef_filter_fb( + NULL, CONVERT_TO_SHORTPTR(fb_info->dst + offset), fb_info->dst_stride, + &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], + fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane, + fb_info->dlist, fb_info->cdef_count, fb_info->level, + fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift); + } else { + av1_cdef_filter_fb( + fb_info->dst + offset, NULL, fb_info->dst_stride, + &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], + fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane, + fb_info->dlist, fb_info->cdef_count, fb_info->level, + fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift); + } +} + +// Initializes block-level parameters for CDEF. 
+static INLINE void cdef_init_fb_col(const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, int *level, + int *sec_strength, int fbc, int fbr, + int plane) { + const PLANE_TYPE plane_type = get_plane_type(plane); + fb_info->level = level[plane_type]; + fb_info->sec_strength = sec_strength[plane_type]; + fb_info->dst = xd->plane[plane].dst.buf; + fb_info->dst_stride = xd->plane[plane].dst.stride; + + fb_info->xdec = xd->plane[plane].subsampling_x; + fb_info->ydec = xd->plane[plane].subsampling_y; + fb_info->mi_wide_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_x; + fb_info->mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; + fb_info->roffset = MI_SIZE_64X64 * fbr << fb_info->mi_high_l2; + fb_info->coffset = MI_SIZE_64X64 * fbc << fb_info->mi_wide_l2; +} + +static void cdef_fb_col(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, uint16_t **const colbuf, + int *cdef_left, int fbc, int fbr) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mbmi_cdef_strength = + mi_params + ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] + ->cdef_strength; + const int num_planes = av1_num_planes(cm); + int is_zero_level[PLANE_TYPES] = { 1, 1 }; + int level[PLANE_TYPES] = { 0 }; + int sec_strength[PLANE_TYPES] = { 0 }; + const CdefInfo *const cdef_info = &cm->cdef_info; + + if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] == NULL || + mbmi_cdef_strength == -1) { + av1_zero_array(cdef_left, num_planes); + return; + } + + // Compute level and secondary strength for planes + level[PLANE_TYPE_Y] = + cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + sec_strength[PLANE_TYPE_Y] = + cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + sec_strength[PLANE_TYPE_Y] += sec_strength[PLANE_TYPE_Y] == 3; + is_zero_level[PLANE_TYPE_Y] = + (level[PLANE_TYPE_Y] == 0) && (sec_strength[PLANE_TYPE_Y] == 
0); + + if (num_planes > 1) { + level[PLANE_TYPE_UV] = + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + sec_strength[PLANE_TYPE_UV] = + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + sec_strength[PLANE_TYPE_UV] += sec_strength[PLANE_TYPE_UV] == 3; + is_zero_level[PLANE_TYPE_UV] = + (level[PLANE_TYPE_UV] == 0) && (sec_strength[PLANE_TYPE_UV] == 0); + } + + if (is_zero_level[PLANE_TYPE_Y] && is_zero_level[PLANE_TYPE_UV]) { + av1_zero_array(cdef_left, num_planes); + return; + } + + fb_info->cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64, + fbc * MI_SIZE_64X64, + fb_info->dlist, BLOCK_64X64); + if (!fb_info->cdef_count) { + av1_zero_array(cdef_left, num_planes); + return; + } + + for (int plane = 0; plane < num_planes; plane++) { + // Do not skip cdef filtering for luma plane as filter direction is + // computed based on luma. + if (plane && is_zero_level[get_plane_type(plane)]) { + cdef_left[plane] = 0; + continue; + } + cdef_init_fb_col(xd, fb_info, level, sec_strength, fbc, fbr, plane); + cdef_prepare_fb(cm, fb_info, colbuf, cdef_left[plane], fbc, fbr, plane); + cdef_filter_fb(fb_info, plane, cm->seq_params->use_highbitdepth); + cdef_left[plane] = 1; + } +} + +// Initializes row-level parameters for CDEF frame. +void av1_cdef_init_fb_row(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr) { + (void)cdef_sync; + const int num_planes = av1_num_planes(cm); + const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int luma_stride = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); + const bool ping_pong = fbr & 1; + // for the current filter block, it's top left corner mi structure (mi_tl) + // is first accessed to check whether the top and left boundaries are + // frame boundaries. 
Then bottom-left and top-right mi structures are + // accessed to check whether the bottom and right boundaries + // (respectively) are frame boundaries. + // + // Note that we can't just check the bottom-right mi structure - eg. if + // we're at the right-hand edge of the frame but not the bottom, then + // the bottom-right mi is NULL but the bottom-left is not. + fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0; + if (fbr != nvfb - 1) + fb_info->frame_boundary[BOTTOM] = + (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0; + else + fb_info->frame_boundary[BOTTOM] = 1; + + fb_info->src = src; + fb_info->damping = cm->cdef_info.cdef_damping; + fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); + av1_zero(fb_info->dir); + av1_zero(fb_info->var); + + for (int plane = 0; plane < num_planes; plane++) { + const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; + const int offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; + const int stride = luma_stride >> xd->plane[plane].subsampling_x; + // here ping-pong buffers are maintained for top linebuf + // to avoid linebuf over-write by consecutive row. 
+ uint16_t *const top_linebuf = + &linebuf[plane][ping_pong * CDEF_VBORDER * stride]; + fb_info->bot_linebuf[plane] = &linebuf[plane][(CDEF_VBORDER << 1) * stride]; + + if (fbr != nvfb - 1) // top line buffer copy + av1_cdef_copy_sb8_16(cm, top_linebuf, stride, xd->plane[plane].dst.buf, + offset - CDEF_VBORDER, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + fb_info->top_linebuf[plane] = + &linebuf[plane][(!ping_pong) * CDEF_VBORDER * stride]; + + if (fbr != nvfb - 1) // bottom line buffer copy + av1_cdef_copy_sb8_16(cm, fb_info->bot_linebuf[plane], stride, + xd->plane[plane].dst.buf, offset, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + } +} + +void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd, + uint16_t **const linebuf, uint16_t **const colbuf, + uint16_t *const src, int fbr, + cdef_init_fb_row_t cdef_init_fb_row_fn, + struct AV1CdefSyncData *const cdef_sync, + struct aom_internal_error_info *error_info) { + // TODO(aomedia:3276): Pass error_info to the low-level functions as required + // in future to handle error propagation. + (void)error_info; + CdefBlockInfo fb_info; + int cdef_left[MAX_MB_PLANE] = { 1, 1, 1 }; + const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + + cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr); +#if CONFIG_MULTITHREAD + if (cdef_sync && cm->cdef_info.allocated_num_workers > 1) { + pthread_mutex_lock(cdef_sync->mutex_); + const bool cdef_mt_exit = cdef_sync->cdef_mt_exit; + pthread_mutex_unlock(cdef_sync->mutex_); + // Exit in case any worker has encountered an error. + if (cdef_mt_exit) return; + } +#endif + for (int fbc = 0; fbc < nhfb; fbc++) { + fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0; + if (fbc != nhfb - 1) + fb_info.frame_boundary[RIGHT] = + (MI_SIZE_64X64 * (fbc + 1) == cm->mi_params.mi_cols) ? 
1 : 0; + else + fb_info.frame_boundary[RIGHT] = 1; + cdef_fb_col(cm, xd, &fb_info, colbuf, &cdef_left[0], fbc, fbr); + } +} + +// Perform CDEF on input frame. +// Inputs: +// frame: Pointer to input frame buffer. +// cm: Pointer to common structure. +// xd: Pointer to common current coding block structure. +// Returns: +// Nothing will be returned. +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm, + MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn) { + const int num_planes = av1_num_planes(cm); + const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + + av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, + num_planes); + + for (int fbr = 0; fbr < nvfb; fbr++) + av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf, + cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL, + xd->error_info); +} diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h new file mode 100644 index 0000000000..a56cd9db4a --- /dev/null +++ b/third_party/aom/av1/common/cdef.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_COMMON_CDEF_H_ +#define AOM_AV1_COMMON_CDEF_H_ + +#define CDEF_STRENGTH_BITS 6 + +#define CDEF_PRI_STRENGTHS 16 +#define CDEF_SEC_STRENGTHS 4 + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/cdef_block.h" + +enum { TOP, LEFT, BOTTOM, RIGHT, BOUNDARIES } UENUM1BYTE(BOUNDARY); + +struct AV1CdefSyncData; + +/*!\brief Parameters related to CDEF Block */ +typedef struct { + uint16_t *src; /*!< CDEF intermediate buffer */ + uint16_t *top_linebuf[MAX_MB_PLANE]; /*!< CDEF top line buffer */ + uint16_t *bot_linebuf[MAX_MB_PLANE]; /*!< CDEF bottom line buffer */ + uint8_t *dst; /*!< CDEF destination buffer */ + cdef_list + dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; /*!< CDEF 8x8 block positions */ + + int xdec; /*!< Sub-sampling X */ + int ydec; /*!< Sub-sampling X */ + int mi_wide_l2; /*!< Pixels per mi unit in width */ + int mi_high_l2; /*!< Pixels per mi unit in height */ + int frame_boundary[BOUNDARIES]; /*!< frame boundaries */ + + int damping; /*!< CDEF damping factor */ + int coeff_shift; /*!< Bit-depth based shift for calculating filter strength */ + int level; /*!< CDEF filtering level */ + int sec_strength; /*!< CDEF secondary strength */ + int cdef_count; /*!< Number of CDEF sub-blocks in superblock */ + int dir[CDEF_NBLOCKS] + [CDEF_NBLOCKS]; /*!< CDEF filter direction for all 8x8 sub-blocks*/ + int var[CDEF_NBLOCKS][CDEF_NBLOCKS]; /*!< variance for all 8x8 sub-blocks */ + + int dst_stride; /*!< CDEF destination buffer stride */ + int coffset; /*!< current superblock offset in a row */ + int roffset; /*!< current row offset */ +} CdefBlockInfo; + +static INLINE int sign(int i) { return i < 0 ? 
-1 : 1; } + +static INLINE int constrain(int diff, int threshold, int damping) { + if (!threshold) return 0; + + const int shift = AOMMAX(0, damping - get_msb(threshold)); + return sign(diff) * + AOMMIN(abs(diff), AOMMAX(0, threshold - (abs(diff) >> shift))); +} + +#ifdef __cplusplus +extern "C" { +#endif + +int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, cdef_list *dlist, + BLOCK_SIZE bsize); + +typedef void (*cdef_init_fb_row_t)( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr); + +/*!\brief Function for applying CDEF to a frame + * + * \ingroup in_loop_cdef + * This function applies CDEF to a frame. + * + * \param[in, out] frame Compressed frame buffer + * \param[in, out] cm Pointer to top level common structure + * \param[in] xd Pointer to common current coding block structure + * \param[in] cdef_init_fb_row_fn Function Pointer + * + * \remark Nothing is returned. Instead, the filtered frame is output in + * \c frame. 
+ */ +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm, + MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn); +void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd, + uint16_t **const linebuf, uint16_t **const colbuf, + uint16_t *const src, int fbr, + cdef_init_fb_row_t cdef_init_fb_row_fn, + struct AV1CdefSyncData *const cdef_sync, + struct aom_internal_error_info *error_info); +void av1_cdef_init_fb_row(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_COMMON_CDEF_H_ diff --git a/third_party/aom/av1/common/cdef_block.c b/third_party/aom/av1/common/cdef_block.c new file mode 100644 index 0000000000..ce7039f374 --- /dev/null +++ b/third_party/aom/av1/common/cdef_block.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/cdef.h" +/* +This is Cdef_Directions (section 7.15.3) with 2 padding entries at the +beginning and end of the table. The cdef direction range is [0, 7] and the +first index is offset +/-2. This removes the need to constrain the first +index to the same range using e.g., & 7. 
+*/ +DECLARE_ALIGNED(16, const int, cdef_directions_padded[12][2]) = { + /* Padding: cdef_directions[6] */ + { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 }, + /* Padding: cdef_directions[7] */ + { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }, + + /* Begin cdef_directions */ + { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 }, + { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 }, + { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 }, + { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 }, + { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 }, + { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 }, + { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 }, + { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }, + /* End cdef_directions */ + + /* Padding: cdef_directions[0] */ + { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 }, + /* Padding: cdef_directions[1] */ + { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 }, +}; + +const int (*const cdef_directions)[2] = cdef_directions_padded + 2; + +/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on. + The search minimizes the weighted variance along all the lines in a + particular direction, i.e. the squared error between the input and a + "predicted" block where each pixel is replaced by the average along a line + in a particular direction. Since each direction have the same sum(x^2) term, + that term is never computed. See Section 2, step 2, of: + http://jmvalin.ca/notes/intra_paint.pdf */ +int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, + int coeff_shift) { + int i; + int32_t cost[8] = { 0 }; + int partial[8][15] = { { 0 } }; + int32_t best_cost = 0; + int best_dir = 0; + /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n. + The output is then 840 times larger, but we don't care for finding + the max. 
*/ + static const int div_table[] = { 0, 840, 420, 280, 210, 168, 140, 120, 105 }; + for (i = 0; i < 8; i++) { + int j; + for (j = 0; j < 8; j++) { + int x; + /* We subtract 128 here to reduce the maximum range of the squared + partial sums. */ + x = (img[i * stride + j] >> coeff_shift) - 128; + partial[0][i + j] += x; + partial[1][i + j / 2] += x; + partial[2][i] += x; + partial[3][3 + i - j / 2] += x; + partial[4][7 + i - j] += x; + partial[5][3 - i / 2 + j] += x; + partial[6][j] += x; + partial[7][i / 2 + j] += x; + } + } + for (i = 0; i < 8; i++) { + cost[2] += partial[2][i] * partial[2][i]; + cost[6] += partial[6][i] * partial[6][i]; + } + cost[2] *= div_table[8]; + cost[6] *= div_table[8]; + for (i = 0; i < 7; i++) { + cost[0] += (partial[0][i] * partial[0][i] + + partial[0][14 - i] * partial[0][14 - i]) * + div_table[i + 1]; + cost[4] += (partial[4][i] * partial[4][i] + + partial[4][14 - i] * partial[4][14 - i]) * + div_table[i + 1]; + } + cost[0] += partial[0][7] * partial[0][7] * div_table[8]; + cost[4] += partial[4][7] * partial[4][7] * div_table[8]; + for (i = 1; i < 8; i += 2) { + int j; + for (j = 0; j < 4 + 1; j++) { + cost[i] += partial[i][3 + j] * partial[i][3 + j]; + } + cost[i] *= div_table[8]; + for (j = 0; j < 4 - 1; j++) { + cost[i] += (partial[i][j] * partial[i][j] + + partial[i][10 - j] * partial[i][10 - j]) * + div_table[2 * j + 2]; + } + } + for (i = 0; i < 8; i++) { + if (cost[i] > best_cost) { + best_cost = cost[i]; + best_dir = i; + } + } + /* Difference between the optimal variance and the variance along the + orthogonal direction. Again, the sum(x^2) terms cancel out. */ + *var = best_cost - cost[(best_dir + 4) & 7]; + /* We'd normally divide by 840, but dividing by 1024 is close enough + for what we're going to do with this. 
*/ + *var >>= 10; + return best_dir; +} + +void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var1, int32_t *var2, + int coeff_shift, int *out1, int *out2) { + *out1 = cdef_find_dir_c(img1, stride, var1, coeff_shift); + *out2 = cdef_find_dir_c(img2, stride, var2, coeff_shift); +} + +const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } }; +const int cdef_sec_taps[2] = { 2, 1 }; + +/* Smooth in the direction detected. */ +static void cdef_filter_block_internal( + uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, int pri_damping, + int sec_damping, int coeff_shift, int block_width, int block_height, + int enable_primary, int enable_secondary) { + const int clipping_required = (enable_primary && enable_secondary); + int i, j, k; + const int s = CDEF_BSTRIDE; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + for (i = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + int16_t sum = 0; + int16_t y; + int16_t x = in[i * s + j]; + int max = x; + int min = x; + for (k = 0; k < 2; k++) { + if (enable_primary) { + int16_t p0 = in[i * s + j + cdef_directions[dir][k]]; + int16_t p1 = in[i * s + j - cdef_directions[dir][k]]; + sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping); + sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping); + if (clipping_required) { + if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max); + if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max); + min = AOMMIN(p0, min); + min = AOMMIN(p1, min); + } + } + if (enable_secondary) { + int16_t s0 = in[i * s + j + cdef_directions[dir + 2][k]]; + int16_t s1 = in[i * s + j - cdef_directions[dir + 2][k]]; + int16_t s2 = in[i * s + j + cdef_directions[dir - 2][k]]; + int16_t s3 = in[i * s + j - cdef_directions[dir - 2][k]]; + if (clipping_required) { + if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max); + if (s1 != 
CDEF_VERY_LARGE) max = AOMMAX(s1, max); + if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max); + if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max); + min = AOMMIN(s0, min); + min = AOMMIN(s1, min); + min = AOMMIN(s2, min); + min = AOMMIN(s3, min); + } + sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping); + sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping); + sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping); + sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping); + } + } + y = ((int16_t)x + ((8 + sum - (sum < 0)) >> 4)); + if (clipping_required) { + y = clamp(y, min, max); + } + + if (dst8) + dst8[i * dstride + j] = (uint8_t)y; + else + dst16[i * dstride + j] = (uint16_t)y; + } + } +} + +void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/1, /*enable_secondary=*/1); +} + +void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/1, /*enable_secondary=*/0); +} + +void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + 
/*enable_primary=*/0, /*enable_secondary=*/1); +} + +void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/0, /*enable_secondary=*/0); +} + +void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/1, /*enable_secondary=*/1); +} + +void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/1, /*enable_secondary=*/0); +} + +void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/0, /*enable_secondary=*/1); +} + +void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + 
cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/0, /*enable_secondary=*/0); +} + +/* Compute the primary filter strength for an 8x8 block based on the + directional variance difference. A high variance difference means + that we have a highly directional pattern (e.g. a high contrast + edge), so we can apply more deringing. A low variance means that we + either have a low contrast edge, or a non-directional texture, so + we want to be careful not to blur. */ +static INLINE int adjust_strength(int strength, int32_t var) { + const int i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0; + /* We use the variance of 8x8 blocks to adjust the strength. */ + return var ? (strength * (4 + i) + 8) >> 4 : 0; +} + +static AOM_INLINE void aom_cdef_find_dir(const uint16_t *in, cdef_list *dlist, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], + int cdef_count, int coeff_shift, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) { + int bi; + + // Find direction of two 8x8 blocks together. + for (bi = 0; bi < cdef_count - 1; bi += 2) { + const int by = dlist[bi].by; + const int bx = dlist[bi].bx; + const int by2 = dlist[bi + 1].by; + const int bx2 = dlist[bi + 1].bx; + const int pos1 = 8 * by * CDEF_BSTRIDE + 8 * bx; + const int pos2 = 8 * by2 * CDEF_BSTRIDE + 8 * bx2; + cdef_find_dir_dual(&in[pos1], &in[pos2], CDEF_BSTRIDE, &var[by][bx], + &var[by2][bx2], coeff_shift, &dir[by][bx], + &dir[by2][bx2]); + } + + // Process remaining 8x8 blocks here. One 8x8 at a time. 
+ if (cdef_count % 2) { + const int by = dlist[bi].by; + const int bx = dlist[bi].bx; + dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx], + CDEF_BSTRIDE, &var[by][bx], coeff_shift); + } +} + +/* Filters the cdef_count blocks listed in dlist for plane pli. Output goes to dst8 when it is non-NULL, otherwise to dst16. When dirinit is non-NULL and both strengths are zero, the input is copied to dst16 and the function returns early. Directions are computed for plane 0 (unless *dirinit is already set) and remapped through conv422/conv440 for plane 1 when xdec != ydec. */ void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + const uint16_t *in, int xdec, int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift) { + int bi; + int bx; + int by; + const int pri_strength = level << coeff_shift; + sec_strength <<= coeff_shift; + damping += coeff_shift - (pli != AOM_PLANE_Y); + const int bw_log2 = 3 - xdec; + const int bh_log2 = 3 - ydec; + if (dirinit && pri_strength == 0 && sec_strength == 0) { + // If we're here, both primary and secondary strengths are 0, and + // we still haven't written anything to dst16[] yet, so we just copy + // the input to dst16[]. This is necessary only for av1_cdef_search() + // and only av1_cdef_search() sets dirinit. + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + // TODO(stemidts/jmvalin): SIMD optimisations + for (int iy = 0; iy < 1 << bh_log2; iy++) { + memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)], + &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)], + ((size_t)1 << bw_log2) * sizeof(*dst16)); + } + } + return; + } + + if (pli == 0) { + if (!dirinit || !*dirinit) { + aom_cdef_find_dir(in, dlist, var, cdef_count, coeff_shift, dir); + if (dirinit) *dirinit = 1; + } + } + if (pli == 1 && xdec != ydec) { + for (bi = 0; bi < cdef_count; bi++) { + static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 }; + static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 }; + by = dlist[bi].by; + bx = dlist[bi].bx; + dir[by][bx] = (xdec ? 
conv422 : conv440)[dir[by][bx]]; + } + } + + if (dst8) { + const int block_width = 8 >> xdec; + const int block_height = 8 >> ydec; + /* + * strength_index == 0 : enable_primary = 1, enable_secondary = 1 + * strength_index == 1 : enable_primary = 1, enable_secondary = 0 + * strength_index == 2 : enable_primary = 0, enable_secondary = 1 + * strength_index == 3 : enable_primary = 0, enable_secondary = 0 + */ + const cdef_filter_block_func cdef_filter_fn[4] = { + cdef_filter_8_0, cdef_filter_8_1, cdef_filter_8_2, cdef_filter_8_3 + }; + + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + /* Plane 0 passes pri_strength through adjust_strength() together with var[by][bx]; the other planes use pri_strength unchanged. Bit 0 of strength_index is set when sec_strength is zero, bit 1 when the effective primary strength t is zero. */ const int t = + (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx])); + const int strength_index = (sec_strength == 0) | ((t == 0) << 1); + + cdef_filter_fn[strength_index]( + &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t, + sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping, + coeff_shift, block_width, block_height); + } + } else { + const int block_width = 8 >> xdec; + const int block_height = 8 >> ydec; + /* + * strength_index == 0 : enable_primary = 1, enable_secondary = 1 + * strength_index == 1 : enable_primary = 1, enable_secondary = 0 + * strength_index == 2 : enable_primary = 0, enable_secondary = 1 + * strength_index == 3 : enable_primary = 0, enable_secondary = 0 + */ + const cdef_filter_block_func cdef_filter_fn[4] = { + cdef_filter_16_0, cdef_filter_16_1, cdef_filter_16_2, cdef_filter_16_3 + }; + + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + const int t = + (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx])); + const int strength_index = (sec_strength == 0) | ((t == 0) << 1); + + /* With dirinit set, block bi is written contiguously at bi << (bw_log2 + bh_log2) with a row stride of 1 << bw_log2; otherwise it is written in place at (by, bx) using dstride. */ cdef_filter_fn[strength_index]( + &dst16[dirinit ? bi << (bw_log2 + bh_log2) + : (by << bh_log2) * dstride + (bx << bw_log2)], + dirinit ? 
1 << bw_log2 : dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t, + sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping, + coeff_shift, block_width, block_height); + } + } +} diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h new file mode 100644 index 0000000000..b5e4f124ae --- /dev/null +++ b/third_party/aom/av1/common/cdef_block.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_H_ + +#include "aom_dsp/odintrin.h" + +#define CDEF_BLOCKSIZE 64 +#define CDEF_BLOCKSIZE_LOG2 6 +#define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8) +#define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2) + +/* We need to buffer two vertical lines. */ +#define CDEF_VBORDER (2) +/* We only need to buffer three horizontal pixels too, but let's align to + 16 bytes (8 x 16 bits) to make vectorization easier. 
*/ +#define CDEF_HBORDER (8) +#define CDEF_BSTRIDE \ + ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3) + +#define CDEF_VERY_LARGE (0x4000) +#define CDEF_INBUF_SIZE \ + (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER)) + +extern const int cdef_pri_taps[2][2]; +extern const int cdef_sec_taps[2]; +extern const int (*const cdef_directions)[2]; + +/* Position (by, bx) of one block to filter. NOTE(review): these look like row/column indices in block units rather than pixels -- confirm against the callers that fill the list. */ typedef struct { + uint8_t by; + uint8_t bx; +} cdef_list; + +/* Common signature of the per-block filter kernels (cdef_filter_8_* / cdef_filter_16_*); dest is passed as void * and interpreted by each kernel. */ typedef void (*cdef_filter_block_func)(void *dest, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height); + +void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + const uint16_t *in, int xdec, int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift); + +/* Sets every element of the v-row by h-column rectangle starting at dst (row stride dstride, in elements) to x. */ static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h, + uint16_t x) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = x; + } + } +} +#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_ diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h new file mode 100644 index 0000000000..5c62201f1e --- /dev/null +++ b/third_party/aom/av1/common/cdef_block_simd.h @@ -0,0 +1,844 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ + +#include "config/av1_rtcd.h" + +#include "av1/common/cdef_block.h" + +/* partial A is a 16-bit vector of the form: + [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: + [0 y1 y2 y3 y4 y5 y6 y7]. + This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... + (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 + and const2. */ +static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1, + v128 const2) { + v128 tmp; + /* Reverse partial B. */ + partialb = v128_shuffle_8( + partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c)); + /* Interleave the x and y values of identical indices and pair x8 with 0. */ + tmp = partiala; + partiala = v128_ziplo_16(partialb, partiala); + partialb = v128_ziphi_16(partialb, tmp); + /* Square and add the corresponding x and y values. */ + partiala = v128_madd_s16(partiala, partiala); + partialb = v128_madd_s16(partialb, partialb); + /* Multiply by constant. */ + partiala = v128_mullo_s32(partiala, const1); + partialb = v128_mullo_s32(partialb, const2); + /* Sum all results. */ + partiala = v128_add_32(partiala, partialb); + return partiala; +} + +/* Zips x0..x3 at 32-bit and then 64-bit granularity and adds the four results. NOTE(review): this appears to be a 4x4 transpose of 32-bit lanes followed by a row sum, so each output lane would hold the horizontal sum of one input vector -- confirm against the v128 zip semantics. */ static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) { + v128 t0, t1, t2, t3; + t0 = v128_ziplo_32(x1, x0); + t1 = v128_ziplo_32(x3, x2); + t2 = v128_ziphi_32(x1, x0); + t3 = v128_ziphi_32(x3, x2); + x0 = v128_ziplo_64(t1, t0); + x1 = v128_ziphi_64(t1, t0); + x2 = v128_ziplo_64(t3, t2); + x3 = v128_ziphi_64(t3, t2); + return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3)); +} + +/* Computes cost for directions 4, 5, 6 and 7. We can call this function again + to compute the remaining directions. */ +static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) { + v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; + v128 partial6; + v128 tmp; + /* Partial sums for lines 0 and 1. 
*/ + partial4a = v128_shl_n_byte(lines[0], 14); + partial4b = v128_shr_n_byte(lines[0], 2); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4)); + tmp = v128_add_16(lines[0], lines[1]); + partial5a = v128_shl_n_byte(tmp, 10); + partial5b = v128_shr_n_byte(tmp, 6); + partial7a = v128_shl_n_byte(tmp, 4); + partial7b = v128_shr_n_byte(tmp, 12); + partial6 = tmp; + + /* Partial sums for lines 2 and 3. */ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6)); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8)); + tmp = v128_add_16(lines[2], lines[3]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10)); + partial6 = v128_add_16(partial6, tmp); + + /* Partial sums for lines 4 and 5. */ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10)); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12)); + tmp = v128_add_16(lines[4], lines[5]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8)); + partial6 = v128_add_16(partial6, tmp); + + /* Partial sums for lines 6 and 7. 
*/ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14)); + partial4a = v128_add_16(partial4a, lines[7]); + tmp = v128_add_16(lines[6], lines[7]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6)); + partial6 = v128_add_16(partial6, tmp); + + /* Compute costs in terms of partial sums. */ + partial4a = + fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840), + v128_from_32(105, 120, 140, 168)); + partial7a = + fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0), + v128_from_32(105, 105, 105, 140)); + partial5a = + fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0), + v128_from_32(105, 105, 105, 140)); + partial6 = v128_madd_s16(partial6, partial6); + partial6 = v128_mullo_s32(partial6, v128_dup_32(105)); + + partial4a = hsum4(partial4a, partial5a, partial6, partial7a); + v128_store_unaligned(tmp_cost1, partial4a); + return partial4a; +} + +/* transpose and reverse the order of the lines -- equivalent to a 90-degree + counter-clockwise rotation of the pixels. 
*/ +static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) { + const v128 tr0_0 = v128_ziplo_16(in[1], in[0]); + const v128 tr0_1 = v128_ziplo_16(in[3], in[2]); + const v128 tr0_2 = v128_ziphi_16(in[1], in[0]); + const v128 tr0_3 = v128_ziphi_16(in[3], in[2]); + const v128 tr0_4 = v128_ziplo_16(in[5], in[4]); + const v128 tr0_5 = v128_ziplo_16(in[7], in[6]); + const v128 tr0_6 = v128_ziphi_16(in[5], in[4]); + const v128 tr0_7 = v128_ziphi_16(in[7], in[6]); + + const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0); + const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4); + const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0); + const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4); + const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2); + const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6); + const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2); + const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6); + + res[7] = v128_ziplo_64(tr1_1, tr1_0); + res[6] = v128_ziphi_64(tr1_1, tr1_0); + res[5] = v128_ziplo_64(tr1_3, tr1_2); + res[4] = v128_ziphi_64(tr1_3, tr1_2); + res[3] = v128_ziplo_64(tr1_5, tr1_4); + res[2] = v128_ziphi_64(tr1_5, tr1_4); + res[1] = v128_ziplo_64(tr1_7, tr1_6); + res[0] = v128_ziphi_64(tr1_7, tr1_6); +} + +/* Loads eight rows of eight 16-bit samples from img (row stride `stride`), shifts them right by coeff_shift and subtracts 128, then evaluates the eight direction costs. Returns the direction with the largest cost and stores in *var the difference between that cost and the cost of direction (best_dir + 4) & 7, shifted right by 10. */ int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, + int coeff_shift) { + int i; + int32_t cost[8]; + int32_t best_cost = 0; + int best_dir = 0; + v128 lines[8]; + for (i = 0; i < 8; i++) { + lines[i] = v128_load_unaligned(&img[i * stride]); + lines[i] = + v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128)); + } + + /* Compute "mostly vertical" directions. */ + v128 dir47 = compute_directions(lines, cost + 4); + + array_reverse_transpose_8x8(lines, lines); + + /* Compute "mostly horizontal" directions. 
*/ + v128 dir03 = compute_directions(lines, cost); + + v128 max = v128_max_s32(dir03, dir47); + max = v128_max_s32(max, v128_align(max, max, 8)); + max = v128_max_s32(max, v128_align(max, max, 4)); + best_cost = v128_low_u32(max); + v128 t = + v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03)); + best_dir = v128_movemask_8(v128_pack_s16_s8(t, t)); + best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros + + /* Difference between the optimal variance and the variance along the + orthogonal direction. Again, the sum(x^2) terms cancel out. */ + *var = best_cost - cost[(best_dir + 4) & 7]; + /* We'd normally divide by 840, but dividing by 1024 is close enough + for what we're going to do with this. */ + *var >>= 10; + return best_dir; +} + +// Work around compiler out of memory issues with Win32 builds. This issue has +// been observed with Visual Studio 2017, 2019, and 2022 (version 17.4). +#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1940 +#define CDEF_INLINE static INLINE +#else +#define CDEF_INLINE SIMD_INLINE +#endif + +// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) +CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold, + unsigned int adjdamp) { + v256 diff = v256_sub_16(a, b); + const v256 sign = v256_shr_n_s16(diff, 15); + diff = v256_abs_s16(diff); + const v256 s = + v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp)); + return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign); +} + +/* Folds the four primary taps tap[0..3] into the running maximum `max` and returns it. The taps are ANDed with cdef_large_value_mask (after a byte-wise max when is_lowbd, per tap otherwise) before the 16-bit max is taken. */ SIMD_INLINE v256 get_max_primary(const int is_lowbd, v256 *tap, v256 max, + v256 cdef_large_value_mask) { + if (is_lowbd) { + v256 max_u8; + max_u8 = tap[0]; + max_u8 = v256_max_u8(max_u8, tap[1]); + max_u8 = v256_max_u8(max_u8, tap[2]); + max_u8 = v256_max_u8(max_u8, tap[3]); + /* The source is 16 bits, however, we only really care about the lower + 8 bits. The upper 8 bits contain the "large" flag. 
After the final + primary max has been calculated, zero out the upper 8 bits. Use this + to find the "16 bit" max. */ + max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); + } else { + /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ + max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); + } + return max; +} + +/* Same as get_max_primary() but over the eight secondary taps tap[0..7]. */ SIMD_INLINE v256 get_max_secondary(const int is_lowbd, v256 *tap, v256 max, + v256 cdef_large_value_mask) { + if (is_lowbd) { + v256 max_u8; + max_u8 = tap[0]; + max_u8 = v256_max_u8(max_u8, tap[1]); + max_u8 = v256_max_u8(max_u8, tap[2]); + max_u8 = v256_max_u8(max_u8, tap[3]); + max_u8 = v256_max_u8(max_u8, tap[4]); + max_u8 = v256_max_u8(max_u8, tap[5]); + max_u8 = v256_max_u8(max_u8, tap[6]); + max_u8 = v256_max_u8(max_u8, tap[7]); + /* The source is 16 bits, however, we only really care about the lower + 8 bits. The upper 8 bits contain the "large" flag. After the final + secondary max has been calculated, zero out the upper 8 bits. Use this + to find the "16 bit" max. */ + max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); + } else { + /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ + max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[4], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[5], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[6], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[7], cdef_large_value_mask)); + } + return max; +} + +// MSVC takes far too much time optimizing these. 
+// https://bugs.chromium.org/p/aomedia/issues/detail?id=3395 +#if defined(_MSC_VER) && !defined(__clang__) +#pragma optimize("", off) +#endif + +/* Filters a 4-sample-wide block of `height` rows read from `in` (row stride CDEF_BSTRIDE), four rows per iteration packed into one v256. enable_primary / enable_secondary select which tap sets contribute to the sum; the result is clamped to the min/max of the row and its taps only when both are enabled. is_lowbd selects 8-bit stores through dest as uint8_t, otherwise 16-bit stores through dest as uint16_t. */ CDEF_INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, int pri_damping, + int sec_damping, int coeff_shift, int height, + int enable_primary, int enable_secondary) { + uint8_t *dst8 = (uint8_t *)dest; + uint16_t *dst16 = (uint16_t *)dest; + const int clipping_required = enable_primary && enable_secondary; + v256 p0, p1, p2, p3; + v256 sum, row, res; + v256 max, min; + const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + int i; + + if (enable_primary && pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (enable_secondary && sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + + for (i = 0; i < height; i += 4) { + sum = v256_zero(); + row = v256_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); + max = min = row; + + if (enable_primary) { + v256 tap[4]; + // Primary near taps + tap[0] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1])); + p0 = constrain16(tap[0], row, pri_strength, pri_damping); + tap[1] = + 
v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1])); + p1 = constrain16(tap[1], row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); + + // Primary far taps + tap[2] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2])); + p0 = constrain16(tap[2], row, pri_strength, pri_damping); + tap[3] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2])); + p1 = constrain16(tap[3], row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); + if (clipping_required) { + max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); + + min = v256_min_s16(min, tap[0]); + min = v256_min_s16(min, tap[1]); + min = v256_min_s16(min, tap[2]); + min = v256_min_s16(min, tap[3]); + } + } + + if (enable_secondary) { + v256 tap[8]; + // Secondary near taps + tap[0] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1])); + p0 = constrain16(tap[0], row, sec_strength, sec_damping); + tap[1] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]), + 
v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1])); + p1 = constrain16(tap[1], row, sec_strength, sec_damping); + tap[2] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1])); + p2 = constrain16(tap[2], row, sec_strength, sec_damping); + tap[3] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1])); + p3 = constrain16(tap[3], row, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // Secondary far taps + tap[4] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2])); + p0 = constrain16(tap[4], row, sec_strength, sec_damping); + tap[5] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2])); + p1 = constrain16(tap[5], row, sec_strength, sec_damping); + tap[6] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2])); + p2 = constrain16(tap[6], row, sec_strength, sec_damping); + tap[7] = + 
v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2])); + p3 = constrain16(tap[7], row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + if (clipping_required) { + max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); + + min = v256_min_s16(min, tap[0]); + min = v256_min_s16(min, tap[1]); + min = v256_min_s16(min, tap[2]); + min = v256_min_s16(min, tap[3]); + min = v256_min_s16(min, tap[4]); + min = v256_min_s16(min, tap[5]); + min = v256_min_s16(min, tap[6]); + min = v256_min_s16(min, tap[7]); + } + } + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + if (clipping_required) { + res = v256_min_s16(v256_max_s16(res, min), max); + } + + if (is_lowbd) { + const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); + u32_store_aligned(&dst8[(i + 0) * dstride], + v64_high_u32(v128_high_v64(res_128))); + u32_store_aligned(&dst8[(i + 1) * dstride], + v64_low_u32(v128_high_v64(res_128))); + u32_store_aligned(&dst8[(i + 2) * dstride], + v64_high_u32(v128_low_v64(res_128))); + u32_store_aligned(&dst8[(i + 3) * dstride], + v64_low_u32(v128_low_v64(res_128))); + } else { + v64_store_aligned(&dst16[(i + 0) * dstride], + v128_high_v64(v256_high_v128(res))); + v64_store_aligned(&dst16[(i + 1) * dstride], + v128_low_v64(v256_high_v128(res))); + v64_store_aligned(&dst16[(i + 2) * dstride], + v128_high_v64(v256_low_v128(res))); + v64_store_aligned(&dst16[(i + 3) * dstride], + v128_low_v64(v256_low_v128(res))); + } + } +} + +/* 8-sample-wide counterpart of filter_block_4x4(): same parameters and tap selection, processing two rows per iteration. */ CDEF_INLINE void filter_block_8x8(const 
int is_lowbd, void *dest, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, int pri_damping, + int sec_damping, int coeff_shift, int height, + int enable_primary, int enable_secondary) { + uint8_t *dst8 = (uint8_t *)dest; + uint16_t *dst16 = (uint16_t *)dest; + const int clipping_required = enable_primary && enable_secondary; + int i; + v256 sum, p0, p1, p2, p3, row, res; + const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); + v256 max, min; + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (enable_primary && pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (enable_secondary && sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + + /* Rows i and i + 1 (eight 16-bit samples each) are packed into one v256 per iteration; tap[] is reused, first for the four primary taps and then for the eight secondary taps. */ for (i = 0; i < height; i += 2) { + v256 tap[8]; + sum = v256_zero(); + row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), + v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); + + min = max = row; + if (enable_primary) { + // Primary near taps + tap[0] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); + tap[1] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); + p0 = constrain16(tap[0], row, pri_strength, pri_damping); + p1 = constrain16(tap[1], row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); + + // Primary far taps + tap[2] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), + 
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); + tap[3] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); + p0 = constrain16(tap[2], row, pri_strength, pri_damping); + p1 = constrain16(tap[3], row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); + + if (clipping_required) { + max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); + + min = v256_min_s16(min, tap[0]); + min = v256_min_s16(min, tap[1]); + min = v256_min_s16(min, tap[2]); + min = v256_min_s16(min, tap[3]); + } + // End primary + } + + if (enable_secondary) { + // Secondary near taps + tap[0] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); + tap[1] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); + tap[2] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); + tap[3] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); + p0 = constrain16(tap[0], row, sec_strength, sec_damping); + p1 = constrain16(tap[1], row, sec_strength, sec_damping); + p2 = constrain16(tap[2], row, sec_strength, sec_damping); + p3 = constrain16(tap[3], row, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // Secondary far taps + tap[4] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); + tap[5] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), + 
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); + tap[6] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); + tap[7] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); + p0 = constrain16(tap[4], row, sec_strength, sec_damping); + p1 = constrain16(tap[5], row, sec_strength, sec_damping); + p2 = constrain16(tap[6], row, sec_strength, sec_damping); + p3 = constrain16(tap[7], row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + if (clipping_required) { + max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); + + min = v256_min_s16(min, tap[0]); + min = v256_min_s16(min, tap[1]); + min = v256_min_s16(min, tap[2]); + min = v256_min_s16(min, tap[3]); + min = v256_min_s16(min, tap[4]); + min = v256_min_s16(min, tap[5]); + min = v256_min_s16(min, tap[6]); + min = v256_min_s16(min, tap[7]); + } + // End secondary + } + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + if (clipping_required) { + res = v256_min_s16(v256_max_s16(res, min), max); + } + + if (is_lowbd) { + const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); + v64_store_aligned(&dst8[i * dstride], v128_high_v64(res_128)); + v64_store_aligned(&dst8[(i + 1) * dstride], v128_low_v64(res_128)); + } else { + v128_store_unaligned(&dst16[i * dstride], v256_high_v128(res)); + v128_store_unaligned(&dst16[(i + 1) * dstride], v256_low_v128(res)); + } + } +} + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma optimize("", on) +#endif + +/* Copies a 4-sample-wide block of `height` rows from `in` (row stride CDEF_BSTRIDE) to dest without filtering, four rows per iteration; packs to 8 bits when is_lowbd, otherwise stores 16-bit samples. */ SIMD_INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride, + 
const uint16_t *in, int height) { + uint8_t *dst8 = (uint8_t *)dest; + uint16_t *dst16 = (uint16_t *)dest; + int i; + for (i = 0; i < height; i += 4) { + const v128 row0 = + v128_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); + const v128 row1 = + v128_from_v64(v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); + if (is_lowbd) { + /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */ + const v128 res_128 = v128_pack_s16_u8(row1, row0); + u32_store_aligned(&dst8[(i + 0) * dstride], + v64_high_u32(v128_low_v64(res_128))); + u32_store_aligned(&dst8[(i + 1) * dstride], + v64_low_u32(v128_low_v64(res_128))); + u32_store_aligned(&dst8[(i + 2) * dstride], + v64_high_u32(v128_high_v64(res_128))); + u32_store_aligned(&dst8[(i + 3) * dstride], + v64_low_u32(v128_high_v64(res_128))); + } else { + v64_store_aligned(&dst16[(i + 0) * dstride], v128_high_v64(row0)); + v64_store_aligned(&dst16[(i + 1) * dstride], v128_low_v64(row0)); + v64_store_aligned(&dst16[(i + 2) * dstride], v128_high_v64(row1)); + v64_store_aligned(&dst16[(i + 3) * dstride], v128_low_v64(row1)); + } + } +} + +SIMD_INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride, + const uint16_t *in, int height) { + uint8_t *dst8 = (uint8_t *)dest; + uint16_t *dst16 = (uint16_t *)dest; + int i; + for (i = 0; i < height; i += 2) { + const v128 row0 = v128_load_aligned(&in[i * CDEF_BSTRIDE]); + const v128 row1 = v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]); + if (is_lowbd) { + /* Note: v128_pack_s16_u8(). 
The parameter order is swapped internally */ + const v128 res_128 = v128_pack_s16_u8(row1, row0); + v64_store_aligned(&dst8[i * dstride], v128_low_v64(res_128)); + v64_store_aligned(&dst8[(i + 1) * dstride], v128_high_v64(res_128)); + } else { + v128_store_unaligned(&dst16[i * dstride], row0); + v128_store_unaligned(&dst16[(i + 1) * dstride], row1); + } + } +} + +void SIMD_FUNC(cdef_filter_8_0)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/1); + } else { + filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/1); + } +} + +void SIMD_FUNC(cdef_filter_8_1)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/0); + } else { + filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/0); + } +} +void SIMD_FUNC(cdef_filter_8_2)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, + sec_strength, dir, 
pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/0, + /*enable_secondary=*/1); + } else { + filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/0, + /*enable_secondary=*/1); + } +} + +void SIMD_FUNC(cdef_filter_8_3)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + (void)block_width; + + if (block_width == 8) { + copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height); + } else { + copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height); + } +} + +void SIMD_FUNC(cdef_filter_16_0)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/1); + } else { + filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/1); + } +} + +void SIMD_FUNC(cdef_filter_16_1)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/0); + } else { + filter_block_4x4(/*is_lowbd=*/0, 
dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/0); + } +} +void SIMD_FUNC(cdef_filter_16_2)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/0, + /*enable_secondary=*/1); + } else { + filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/0, + /*enable_secondary=*/1); + } +} + +void SIMD_FUNC(cdef_filter_16_3)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + (void)block_width; + if (block_width == 8) { + copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height); + } else { + copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height); + } +} + +void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int width, int height) { + int i, j; + for (i = 0; i < height; i++) { + for (j = 0; j < (width & ~0x7); j += 8) { + v128 row = v128_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], row); + } + for (; j < width; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +#undef CDEF_INLINE + +#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c new file mode 100644 index 0000000000..0e37d45980 --- /dev/null 
+++ b/third_party/aom/av1/common/cfl.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/cfl.h" +#include "av1/common/common_data.h" + +#include "config/av1_rtcd.h" + +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) { + assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); + assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); + + memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3)); + memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3)); + cfl->subsampling_x = seq_params->subsampling_x; + cfl->subsampling_y = seq_params->subsampling_y; + cfl->are_parameters_computed = 0; + cfl->store_y = 0; + // The DC_PRED cache is disabled by default and is only enabled in + // cfl_rd_pick_alpha + clear_cfl_dc_pred_cache_flags(cfl); +} + +void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, + CFL_PRED_TYPE pred_plane, int width) { + assert(pred_plane < CFL_PRED_PLANES); + assert(width <= CFL_BUF_LINE); + + if (is_cur_buf_hbd(xd)) { + uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input); + memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1); + return; + } + + memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width); +} + +static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst, + int dst_stride, int width, int height) { + for (int j = 0; j < height; j++) { + memcpy(dst, dc_pred_cache, width); + dst += dst_stride; + } +} + +static void 
cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst, + int dst_stride, int width, int height) { + const size_t num_bytes = width << 1; + for (int j = 0; j < height; j++) { + memcpy(dst, dc_pred_cache, num_bytes); + dst += dst_stride; + } +} +void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, CFL_PRED_TYPE pred_plane) { + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + assert(pred_plane < CFL_PRED_PLANES); + assert(width <= CFL_BUF_LINE); + assert(height <= CFL_BUF_LINE); + if (is_cur_buf_hbd(xd)) { + uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); + cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride, + width, height); + return; + } + cfl_load_dc_pred_lbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride, + width, height); +} + +// Due to frame boundary issues, it is possible that the total area covered by +// chroma exceeds that of luma. When this happens, we fill the missing pixels by +// repeating the last columns and/or rows. 
+static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) { + const int diff_width = width - cfl->buf_width; + const int diff_height = height - cfl->buf_height; + + if (diff_width > 0) { + const int min_height = height - diff_height; + uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width); + for (int j = 0; j < min_height; j++) { + const uint16_t last_pixel = recon_buf_q3[-1]; + assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); + for (int i = 0; i < diff_width; i++) { + recon_buf_q3[i] = last_pixel; + } + recon_buf_q3 += CFL_BUF_LINE; + } + cfl->buf_width = width; + } + if (diff_height > 0) { + uint16_t *recon_buf_q3 = + cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE); + for (int j = 0; j < diff_height; j++) { + const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE; + assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); + for (int i = 0; i < width; i++) { + recon_buf_q3[i] = last_row_q3[i]; + } + recon_buf_q3 += CFL_BUF_LINE; + } + cfl->buf_height = height; + } +} + +static void subtract_average_c(const uint16_t *src, int16_t *dst, int width, + int height, int round_offset, int num_pel_log2) { + int sum = round_offset; + const uint16_t *recon = src; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + sum += recon[i]; + } + recon += CFL_BUF_LINE; + } + const int avg = sum >> num_pel_log2; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = src[i] - avg; + } + src += CFL_BUF_LINE; + dst += CFL_BUF_LINE; + } +} + +CFL_SUB_AVG_FN(c) + +static INLINE int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign, + CFL_PRED_TYPE pred_type) { + const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign) + : CFL_SIGN_V(joint_sign); + if (alpha_sign == CFL_SIGN_ZERO) return 0; + const int abs_alpha_q3 = + (pred_type == CFL_PRED_U) ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx); + return (alpha_sign == CFL_SIGN_POS) ? 
abs_alpha_q3 + 1 : -abs_alpha_q3 - 1; +} + +static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3, int width, + int height) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i]); + } + dst += dst_stride; + ac_buf_q3 += CFL_BUF_LINE; + } +} + +CFL_PREDICT_FN(c, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, + int alpha_q3, int bit_depth, int width, int height) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = clip_pixel_highbd( + get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i], bit_depth); + } + dst += dst_stride; + ac_buf_q3 += CFL_BUF_LINE; + } +} + +CFL_PREDICT_FN(c, hbd) +#endif + +static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) { + CFL_CTX *const cfl = &xd->cfl; + // Do not call cfl_compute_parameters multiple time on the same values. 
+ assert(cfl->are_parameters_computed == 0); + + cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]); + cfl_get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3); + cfl->are_parameters_computed = 1; +} + +void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, int plane) { + CFL_CTX *const cfl = &xd->cfl; + MB_MODE_INFO *mbmi = xd->mi[0]; + assert(is_cfl_allowed(xd)); + + if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size); + + const int alpha_q3 = + cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1); + assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <= + CFL_BUF_SQUARE); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); + cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, + alpha_q3, xd->bd); + return; + } +#endif + cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3); +} + +static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + for (int j = 0; j < height; j += 2) { + for (int i = 0; i < width; i += 2) { + const int bot = i + input_stride; + output_q3[i >> 1] = + (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; + } + input += input_stride << 1; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i += 2) { + output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= 
CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + output_q3[i] = input[i] << 3; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + for (int j = 0; j < height; j += 2) { + for (int i = 0; i < width; i += 2) { + const int bot = i + input_stride; + output_q3[i >> 1] = + (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; + } + input += input_stride << 1; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_422_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i += 2) { + output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + output_q3[i] = input[i] << 3; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} +#endif + +CFL_GET_SUBSAMPLE_FUNCTION(c) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, + int sub_x, int sub_y) { + if (sub_x == 1) { + if (sub_y == 1) { + return cfl_get_luma_subsampling_420_hbd(tx_size); + } + return cfl_get_luma_subsampling_422_hbd(tx_size); + } + return cfl_get_luma_subsampling_444_hbd(tx_size); +} +#endif + +static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size, + int sub_x, int sub_y) { + if (sub_x == 1) { + if (sub_y == 1) { + return cfl_get_luma_subsampling_420_lbd(tx_size); + } + return 
cfl_get_luma_subsampling_422_lbd(tx_size); + } + return cfl_get_luma_subsampling_444_lbd(tx_size); +} + +static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, + int row, int col, TX_SIZE tx_size, int use_hbd) { + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const int tx_off_log2 = MI_SIZE_LOG2; + const int sub_x = cfl->subsampling_x; + const int sub_y = cfl->subsampling_y; + const int store_row = row << (tx_off_log2 - sub_y); + const int store_col = col << (tx_off_log2 - sub_x); + const int store_height = height >> sub_y; + const int store_width = width >> sub_x; + + // Invalidate current parameters + cfl->are_parameters_computed = 0; + + // Store the surface of the pixel buffer that was written to, this way we + // can manage chroma overrun (e.g. when the chroma surfaces goes beyond the + // frame boundary) + if (col == 0 && row == 0) { + cfl->buf_width = store_width; + cfl->buf_height = store_height; + } else { + cfl->buf_width = OD_MAXI(store_col + store_width, cfl->buf_width); + cfl->buf_height = OD_MAXI(store_row + store_height, cfl->buf_height); + } + + // Check that we will remain inside the pixel buffer. + assert(store_row + store_height <= CFL_BUF_LINE); + assert(store_col + store_width <= CFL_BUF_LINE); + + // Store the input into the CfL pixel buffer + uint16_t *recon_buf_q3 = + cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col); +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input), + input_stride, recon_buf_q3); + } else { + cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, + recon_buf_q3); + } +#else + (void)use_hbd; + cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3); +#endif +} + +// Adjust the row and column of blocks smaller than 8X8, as chroma-referenced +// and non-chroma-referenced blocks are stored together in the CfL buffer. 
+static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row, + int mi_col, int *row_out, + int *col_out) { + // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s. + if ((mi_row & 0x01) && cfl->subsampling_y) { + assert(*row_out == 0); + (*row_out)++; + } + + // Increment col index for right: 4x8, 4x16 or both right 4x4s. + if ((mi_col & 0x01) && cfl->subsampling_x) { + assert(*col_out == 0); + (*col_out)++; + } +} + +void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, + BLOCK_SIZE bsize) { + CFL_CTX *const cfl = &xd->cfl; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2]; + + if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { + // Only dimensions of size 4 can have an odd offset. + assert(!((col & 1) && tx_size_wide[tx_size] != 4)); + assert(!((row & 1) && tx_size_high[tx_size] != 4)); + sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col); + } + cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd)); +} + +static INLINE int max_intra_block_width(const MACROBLOCKD *xd, + BLOCK_SIZE plane_bsize, int plane, + TX_SIZE tx_size) { + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane) + << MI_SIZE_LOG2; + return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]); +} + +static INLINE int max_intra_block_height(const MACROBLOCKD *xd, + BLOCK_SIZE plane_bsize, int plane, + TX_SIZE tx_size) { + const int max_blocks_high = max_block_high(xd, plane_bsize, plane) + << MI_SIZE_LOG2; + return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]); +} + +void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) { + CFL_CTX *const cfl = &xd->cfl; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + int row = 0; + int col = 0; + + if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { + sub8x8_adjust_offset(cfl, xd->mi_row, 
xd->mi_col, &row, &col); + } + const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size); + const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size); + tx_size = get_tx_size(width, height); + cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size, + is_cur_buf_hbd(xd)); +} diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h new file mode 100644 index 0000000000..dcaa87bd48 --- /dev/null +++ b/third_party/aom/av1/common/cfl.h @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_CFL_H_ +#define AOM_AV1_COMMON_CFL_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +// Can we use CfL for the current block? +static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) { + const MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + assert(bsize < BLOCK_SIZES_ALL); + if (xd->lossless[mbmi->segment_id]) { + // In lossless, CfL is available when the partition size is equal to the + // transform size. 
+ const int ssx = xd->plane[AOM_PLANE_U].subsampling_x; + const int ssy = xd->plane[AOM_PLANE_U].subsampling_y; + const int plane_bsize = get_plane_block_size(bsize, ssx, ssy); + return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4); + } + // Spec: CfL is available to luma partitions lesser than or equal to 32x32 + return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 && + block_size_high[bsize] <= 32); +} + +// Do we need to save the luma pixels from the current block, +// for a possible future CfL prediction? +static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm, + const MACROBLOCKD *xd) { + const MB_MODE_INFO *mbmi = xd->mi[0]; + + if (cm->seq_params->monochrome) return CFL_DISALLOWED; + + if (!xd->is_chroma_ref) { + // For non-chroma-reference blocks, we should always store the luma pixels, + // in case the corresponding chroma-reference block uses CfL. + // Note that this can only happen for block sizes which are <8 on + // their shortest side, as otherwise they would be chroma reference + // blocks. 
+ return CFL_ALLOWED; + } + + // If this block has chroma information, we know whether we're + // actually going to perform a CfL prediction + return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) && + mbmi->uv_mode == UV_CFL_PRED); +} + +static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) { + int scaled_luma_q6 = alpha_q3 * pred_buf_q3; + return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6); +} + +static INLINE CFL_PRED_TYPE get_cfl_pred_type(int plane) { + assert(plane > 0); + return (CFL_PRED_TYPE)(plane - 1); +} + +static INLINE void clear_cfl_dc_pred_cache_flags(CFL_CTX *cfl) { + cfl->use_dc_pred_cache = false; + cfl->dc_pred_is_cached[CFL_PRED_U] = false; + cfl->dc_pred_is_cached[CFL_PRED_V] = false; +} + +void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, int plane); + +void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size); + +void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, + BLOCK_SIZE bsize); + +void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, + CFL_PRED_TYPE pred_plane, int width); + +void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, CFL_PRED_TYPE pred_plane); + +// Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth. +#define CFL_lbd_TYPE uint8_t *cfl_type +#define CFL_hbd_TYPE uint16_t *cfl_type + +// Declare a size-specific wrapper for the size-generic function. The compiler +// will inline the size generic function in here, the advantage is that the size +// will be constant allowing for loop unrolling and other constant propagated +// goodness. 
+#define CFL_SUBSAMPLE(arch, sub, bd, width, height) \ + void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \ + const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \ + cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \ + output_q3, width, height); \ + } + +// Declare size-specific wrappers for all valid CfL sizes and a table getter. +#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd) \ + CFL_SUBSAMPLE(arch, sub, bd, 4, 4) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 8) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 32, 32) \ + CFL_SUBSAMPLE(arch, sub, bd, 4, 8) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 4) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 8) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 32) \ + CFL_SUBSAMPLE(arch, sub, bd, 32, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 4, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 4) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 32) \ + CFL_SUBSAMPLE(arch, sub, bd, 32, 8) \ + cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \ + TX_SIZE tx_size) { \ + CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ + return subfn_##sub[tx_size % TX_SIZES_ALL]; /* index stays in bounds */ \ + } + +// Declare an architecture-specific array of function pointers for size-specific +// wrappers.
+#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \ + cfl_subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; + +// The RTCD script does not support passing in an array, so each table is +// wrapped in a cfl_get_luma_subsampling_* getter, instantiated by this macro. +#if CONFIG_AV1_HIGHBITDEPTH +#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd) +#else +#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) +#endif + +// Declare a size-specific wrapper for the size-generic function.
The compiler +// will inline the size generic function in here, the advantage is that the size +// will be constant allowing for loop unrolling and other constant propagated +// goodness. +#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ + void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \ + int16_t *dst) { \ + subtract_average_##arch(src, dst, width, height, round_offset, \ + num_pel_log2); \ + } + +// Declare size-specific wrappers for all valid CfL sizes. +#define CFL_SUB_AVG_FN(arch) \ + CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \ + CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \ + CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \ + CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \ + CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \ + CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \ + CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \ + CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \ + CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \ + CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \ + CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \ + CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \ + CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \ + CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \ + cfl_subtract_average_fn cfl_get_subtract_average_fn_##arch( \ + TX_SIZE tx_size) { \ + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \ + cfl_subtract_average_4x4_##arch, /* 4x4 */ \ + cfl_subtract_average_8x8_##arch, /* 8x8 */ \ + cfl_subtract_average_16x16_##arch, /* 16x16 */ \ + cfl_subtract_average_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subtract_average_4x8_##arch, /* 4x8 */ \ + cfl_subtract_average_8x4_##arch, /* 8x4 */ \ + cfl_subtract_average_8x16_##arch, /* 8x16 */ \ + cfl_subtract_average_16x8_##arch, /* 16x8 */ \ + cfl_subtract_average_16x32_##arch, /* 16x32 */ \ + cfl_subtract_average_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subtract_average_4x16_##arch, /* 4x16 */ \ + cfl_subtract_average_16x4_##arch, /* 16x4 */ \ +
cfl_subtract_average_8x32_##arch, /* 8x32 */ \ + cfl_subtract_average_32x8_##arch, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ + /* index the function pointer array out of bounds. */ \ + return sub_avg[tx_size % TX_SIZES_ALL]; \ + } + +// For VSX SIMD optimization, the C versions of width == 4 subtract are +// faster than the VSX. As such, the VSX code calls the C versions. +void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst); + +#define CFL_PREDICT_lbd(arch, width, height) \ + void cfl_predict_lbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \ + int alpha_q3) { \ + cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ + height); \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +#define CFL_PREDICT_hbd(arch, width, height) \ + void cfl_predict_hbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \ + int bd) { \ + cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \ + height); \ + } +#endif + +// This wrapper exists because clang format does not like calling macros with +// lowercase letters.
+#define CFL_PREDICT_X(arch, width, height, bd) \ + CFL_PREDICT_##bd(arch, width, height) + +#define CFL_PREDICT_FN(arch, bd) \ + CFL_PREDICT_X(arch, 4, 4, bd) \ + CFL_PREDICT_X(arch, 4, 8, bd) \ + CFL_PREDICT_X(arch, 4, 16, bd) \ + CFL_PREDICT_X(arch, 8, 4, bd) \ + CFL_PREDICT_X(arch, 8, 8, bd) \ + CFL_PREDICT_X(arch, 8, 16, bd) \ + CFL_PREDICT_X(arch, 8, 32, bd) \ + CFL_PREDICT_X(arch, 16, 4, bd) \ + CFL_PREDICT_X(arch, 16, 8, bd) \ + CFL_PREDICT_X(arch, 16, 16, bd) \ + CFL_PREDICT_X(arch, 16, 32, bd) \ + CFL_PREDICT_X(arch, 32, 8, bd) \ + CFL_PREDICT_X(arch, 32, 16, bd) \ + CFL_PREDICT_X(arch, 32, 32, bd) \ + cfl_predict_##bd##_fn cfl_get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \ + static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \ + cfl_predict_##bd##_4x4_##arch, /* 4x4 */ \ + cfl_predict_##bd##_8x8_##arch, /* 8x8 */ \ + cfl_predict_##bd##_16x16_##arch, /* 16x16 */ \ + cfl_predict_##bd##_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_predict_##bd##_4x8_##arch, /* 4x8 */ \ + cfl_predict_##bd##_8x4_##arch, /* 8x4 */ \ + cfl_predict_##bd##_8x16_##arch, /* 8x16 */ \ + cfl_predict_##bd##_16x8_##arch, /* 16x8 */ \ + cfl_predict_##bd##_16x32_##arch, /* 16x32 */ \ + cfl_predict_##bd##_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_predict_##bd##_4x16_##arch, /* 4x16 */ \ + cfl_predict_##bd##_16x4_##arch, /* 16x4 */ \ + cfl_predict_##bd##_8x32_##arch, /* 8x32 */ \ + cfl_predict_##bd##_32x8_##arch, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ + /* index the function pointer array out of bounds. 
*/ \ + return pred[tx_size % TX_SIZES_ALL]; \ + } + +#endif // AOM_AV1_COMMON_CFL_H_ diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h new file mode 100644 index 0000000000..ccb45b68ce --- /dev/null +++ b/third_party/aom/av1/common/common.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_COMMON_H_ +#define AOM_AV1_COMMON_COMMON_H_ + +/* Interface header for common constant data structures and lookup tables */ + +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom/aom_integer.h" +#include "aom_ports/bitops.h" +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Only need this for fixed-size arrays, for structs just assign. +#define av1_copy(dest, src) \ + do { \ + assert(sizeof(dest) == sizeof(src)); \ + memcpy(dest, src, sizeof(src)); \ + } while (0) + +// Use this for variably-sized arrays. +#define av1_copy_array(dest, src, n) \ + do { \ + assert(sizeof(*(dest)) == sizeof(*(src))); \ + memcpy(dest, src, n * sizeof(*(src))); \ + } while (0) + +#define av1_zero(dest) memset(&(dest), 0, sizeof(dest)) +#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest))) + +static INLINE int get_unsigned_bits(unsigned int num_values) { + return num_values > 0 ? 
get_msb(num_values) + 1 : 0; +} + +#define CHECK_MEM_ERROR(cm, lval, expr) \ + AOM_CHECK_MEM_ERROR((cm)->error, lval, expr) + +#define AOM_FRAME_MARKER 0x2 + +#define AV1_MIN_TILE_SIZE_BYTES 1 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_COMMON_H_ diff --git a/third_party/aom/av1/common/common_data.c b/third_party/aom/av1/common/common_data.c new file mode 100644 index 0000000000..482aecfcc0 --- /dev/null +++ b/third_party/aom/av1/common/common_data.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common_data.h" + +// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual +// size function). 
+/* clang-format off */ +const BLOCK_SIZE av1_ss_size_lookup[BLOCK_SIZES_ALL][2][2] = { + // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 + // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } }, + { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } }, + { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } }, + { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } }, + { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } }, + { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } }, + { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } }, + { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } }, + { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } }, + { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } }, + { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } }, + { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } }, + { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } }, + { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } }, + { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } }, + { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } }, + { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } }, + { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } } +}; +/* clang-format on */ diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h new file mode 100644 index 0000000000..dfe927c6ef --- /dev/null +++ b/third_party/aom/av1/common/common_data.h @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_COMMON_DATA_H_ +#define AOM_AV1_COMMON_COMMON_DATA_H_ + +#include "av1/common/enums.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Log 2 conversion lookup tables in units of mode info (4x4). +// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4 +}; +// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = { + 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2 +}; + +// Width/height lookup tables in units of mode info (4x4). +// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = { + 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16 +}; + +// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = { + 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4 +}; + +// Width/height lookup tables in units of samples. +// The Block_Width table in the spec (Section 9.3. Conversion tables). 
+static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = { + 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, + 64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64 +}; + +// The Block_Height table in the spec (Section 9.3. Conversion tables). +static const uint8_t block_size_high[BLOCK_SIZES_ALL] = { + 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, + 32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16 +}; + +// Maps a block size to a context. +// The Size_Group table in the spec (Section 9.3. Conversion tables). +// AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize))) +static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2 +}; + +static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = { + 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10 +}; + +// A compressed version of the Partition_Subsize table in the spec (9.3. +// Conversion tables), for square block sizes only. +/* clang-format off */ +static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = { + { // PARTITION_NONE + BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, + BLOCK_32X32, BLOCK_64X64, BLOCK_128X128 + }, { // PARTITION_HORZ + BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_VERT + BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_SPLIT + BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8, + BLOCK_16X16, BLOCK_32X32, BLOCK_64X64 + }, { // PARTITION_HORZ_A + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_HORZ_B + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_VERT_A + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_VERT_B + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_HORZ_4 + BLOCK_INVALID, BLOCK_INVALID, 
BLOCK_16X4, + BLOCK_32X8, BLOCK_64X16, BLOCK_INVALID + }, { // PARTITION_VERT_4 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16, + BLOCK_8X32, BLOCK_16X64, BLOCK_INVALID + } +}; + +static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = { + // 4X4 + TX_4X4, + // 4X8, 8X4, 8X8 + TX_4X4, TX_4X4, TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X8, TX_8X8, TX_16X16, + // 16X32, 32X16, 32X32 + TX_16X16, TX_16X16, TX_32X32, + // 32X64, 64X32, + TX_32X32, TX_32X32, + // 64X64 + TX_64X64, + // 64x128, 128x64, 128x128 + TX_64X64, TX_64X64, TX_64X64, + // 4x16, 16x4, 8x32 + TX_4X4, TX_4X4, TX_8X8, + // 32x8, 16x64 64x16 + TX_8X8, TX_16X16, TX_16X16 +}; + +static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES_ALL] = { + // 4X4 + TX_4X4, + // 4X8, 8X4, 8X8 + TX_4X8, TX_8X4, TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X16, TX_16X8, TX_16X16, + // 16X32, 32X16, 32X32 + TX_16X32, TX_32X16, TX_32X32, + // 32X64, 64X32, + TX_32X64, TX_64X32, + // 64X64 + TX_64X64, + // 64x128, 128x64, 128x128 + TX_64X64, TX_64X64, TX_64X64, + // 4x16, 16x4, + TX_4X16, TX_16X4, + // 8x32, 32x8 + TX_8X32, TX_32X8, + // 16x64, 64x16 + TX_16X64, TX_64X16 +}; + +static const TX_TYPE_1D vtx_tab[TX_TYPES] = { + DCT_1D, ADST_1D, DCT_1D, ADST_1D, + FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D, + DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D, +}; + +static const TX_TYPE_1D htx_tab[TX_TYPES] = { + DCT_1D, DCT_1D, ADST_1D, ADST_1D, + DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D, + IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, +}; + +#define TXSIZE_CAT_INVALID (-1) + +/* clang-format on */ + +static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_4X4, // TX_8X8 + TX_8X8, // TX_16X16 + TX_16X16, // TX_32X32 + TX_32X32, // TX_64X64 + TX_4X4, // TX_4X8 + TX_4X4, // TX_8X4 + TX_8X8, // TX_8X16 + TX_8X8, // TX_16X8 + TX_16X16, // TX_16X32 + TX_16X16, // TX_32X16 + TX_32X32, // TX_32X64 + TX_32X32, // TX_64X32 + TX_4X8, // TX_4X16 + TX_8X4, // TX_16X4 + 
TX_8X16, // TX_8X32 + TX_16X8, // TX_32X8 + TX_16X32, // TX_16X64 + TX_32X16, // TX_64X16 +}; + +static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_4X4, // TX_4X8 + TX_8X8, // TX_8X4 + TX_8X8, // TX_8X16 + TX_16X16, // TX_16X8 + TX_16X16, // TX_16X32 + TX_32X32, // TX_32X16 + TX_32X32, // TX_32X64 + TX_64X64, // TX_64X32 + TX_4X4, // TX_4X16 + TX_16X16, // TX_16X4 + TX_8X8, // TX_8X32 + TX_32X32, // TX_32X8 + TX_16X16, // TX_16X64 + TX_64X64, // TX_64X16 +}; + +static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_8X8, // TX_4X8 + TX_4X4, // TX_8X4 + TX_16X16, // TX_8X16 + TX_8X8, // TX_16X8 + TX_32X32, // TX_16X32 + TX_16X16, // TX_32X16 + TX_64X64, // TX_32X64 + TX_32X32, // TX_64X32 + TX_16X16, // TX_4X16 + TX_4X4, // TX_16X4 + TX_32X32, // TX_8X32 + TX_8X8, // TX_32X8 + TX_64X64, // TX_16X64 + TX_16X16, // TX_64X16 +}; + +#define TX_SIZE_W_MIN 4 + +// Transform block width in pixels +static const int tx_size_wide[TX_SIZES_ALL] = { + 4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64, +}; + +#define TX_SIZE_H_MIN 4 + +// Transform block height in pixels +static const int tx_size_high[TX_SIZES_ALL] = { + 4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16, +}; + +// Transform block width in unit +static const int tx_size_wide_unit[TX_SIZES_ALL] = { + 1, 2, 4, 8, 16, 1, 2, 2, 4, 4, 8, 8, 16, 1, 4, 2, 8, 4, 16, +}; + +// Transform block height in unit +static const int tx_size_high_unit[TX_SIZES_ALL] = { + 1, 2, 4, 8, 16, 2, 1, 4, 2, 8, 4, 16, 8, 4, 1, 8, 2, 16, 4, +}; + +// Transform block width in log2 +static const int tx_size_wide_log2[TX_SIZES_ALL] = { + 2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6, +}; + +// Transform block width in log2 unit +static const int 
tx_size_wide_unit_log2[TX_SIZES_ALL] = { + 0, 1, 2, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 2, 1, 3, 2, 4, +}; + +// Transform block height in log2 +static const int tx_size_high_log2[TX_SIZES_ALL] = { + 2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4, +}; + +// Transform block height in log2 unit +static const int tx_size_high_unit_log2[TX_SIZES_ALL] = { + 0, 1, 2, 3, 4, 1, 0, 2, 1, 3, 2, 4, 3, 2, 0, 3, 1, 4, 2, +}; + +static const int tx_size_2d[TX_SIZES_ALL + 1] = { + 16, 64, 256, 1024, 4096, 32, 32, 128, 128, 512, + 512, 2048, 2048, 64, 64, 256, 256, 1024, 1024, +}; + +static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = { + BLOCK_4X4, // TX_4X4 + BLOCK_8X8, // TX_8X8 + BLOCK_16X16, // TX_16X16 + BLOCK_32X32, // TX_32X32 + BLOCK_64X64, // TX_64X64 + BLOCK_4X8, // TX_4X8 + BLOCK_8X4, // TX_8X4 + BLOCK_8X16, // TX_8X16 + BLOCK_16X8, // TX_16X8 + BLOCK_16X32, // TX_16X32 + BLOCK_32X16, // TX_32X16 + BLOCK_32X64, // TX_32X64 + BLOCK_64X32, // TX_64X32 + BLOCK_4X16, // TX_4X16 + BLOCK_16X4, // TX_16X4 + BLOCK_8X32, // TX_8X32 + BLOCK_32X8, // TX_32X8 + BLOCK_16X64, // TX_16X64 + BLOCK_64X16, // TX_64X16 +}; + +static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_4X4, // TX_4X8 + TX_4X4, // TX_8X4 + TX_8X8, // TX_8X16 + TX_8X8, // TX_16X8 + TX_16X16, // TX_16X32 + TX_16X16, // TX_32X16 + TX_32X32, // TX_32X64 + TX_32X32, // TX_64X32 + TX_4X4, // TX_4X16 + TX_4X4, // TX_16X4 + TX_8X8, // TX_8X32 + TX_8X8, // TX_32X8 + TX_16X16, // TX_16X64 + TX_16X16, // TX_64X16 +}; + +static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_8X8, // TX_4X8 + TX_8X8, // TX_8X4 + TX_16X16, // TX_8X16 + TX_16X16, // TX_16X8 + TX_32X32, // TX_16X32 + TX_32X32, // TX_32X16 + TX_64X64, // TX_32X64 + TX_64X64, // TX_64X32 + TX_16X16, // TX_4X16 + TX_16X16, 
// TX_16X4 + TX_32X32, // TX_8X32 + TX_32X32, // TX_32X8 + TX_64X64, // TX_16X64 + TX_64X64, // TX_64X16 +}; + +static const int8_t txsize_log2_minus4[TX_SIZES_ALL] = { + 0, // TX_4X4 + 2, // TX_8X8 + 4, // TX_16X16 + 6, // TX_32X32 + 6, // TX_64X64 + 1, // TX_4X8 + 1, // TX_8X4 + 3, // TX_8X16 + 3, // TX_16X8 + 5, // TX_16X32 + 5, // TX_32X16 + 6, // TX_32X64 + 6, // TX_64X32 + 2, // TX_4X16 + 2, // TX_16X4 + 4, // TX_8X32 + 4, // TX_32X8 + 5, // TX_16X64 + 5, // TX_64X16 +}; + +static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { + TX_4X4, // ONLY_4X4 + TX_64X64, // TX_MODE_LARGEST + TX_64X64, // TX_MODE_SELECT +}; + +// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual +// size function). +extern const BLOCK_SIZE av1_ss_size_lookup[BLOCK_SIZES_ALL][2][2]; + +// Generates 5 bit field in which each bit set to 1 represents +// a blocksize partition 11111 means we split 128x128, 64x64, 32x32, 16x16 +// and 8x8. 10000 means we just split the 128x128 to 64x64 +/* clang-format off */ +static const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES_ALL] = { + { 31, 31 }, // 4X4 - {0b11111, 0b11111} + { 31, 30 }, // 4X8 - {0b11111, 0b11110} + { 30, 31 }, // 8X4 - {0b11110, 0b11111} + { 30, 30 }, // 8X8 - {0b11110, 0b11110} + { 30, 28 }, // 8X16 - {0b11110, 0b11100} + { 28, 30 }, // 16X8 - {0b11100, 0b11110} + { 28, 28 }, // 16X16 - {0b11100, 0b11100} + { 28, 24 }, // 16X32 - {0b11100, 0b11000} + { 24, 28 }, // 32X16 - {0b11000, 0b11100} + { 24, 24 }, // 32X32 - {0b11000, 0b11000} + { 24, 16 }, // 32X64 - {0b11000, 0b10000} + { 16, 24 }, // 64X32 - {0b10000, 0b11000} + { 16, 16 }, // 64X64 - {0b10000, 0b10000} + { 16, 0 }, // 64X128- {0b10000, 0b00000} + { 0, 16 }, // 128X64- {0b00000, 0b10000} + { 0, 0 }, // 128X128-{0b00000, 0b00000} + { 31, 28 }, // 4X16 - {0b11111, 0b11100} + { 28, 31 }, // 16X4 - {0b11100, 0b11111} + { 30, 24 }, // 8X32 - {0b11110, 0b11000} + { 24, 30 }, // 32X8 - 
{0b11000, 0b11110} + { 28, 16 }, // 16X64 - {0b11100, 0b10000} + { 16, 28 }, // 64X16 - {0b10000, 0b11100} +}; +/* clang-format on */ + +static const int intra_mode_context[INTRA_MODES] = { + 0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0, +}; + +// Note: this is also used in unit tests. So whenever one changes the table, +// the unit tests need to be changed accordingly. +static const int quant_dist_weight[4][2] = { + { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE } +}; + +static const int quant_dist_lookup_table[4][2] = { + { 9, 7 }, + { 11, 5 }, + { 12, 4 }, + { 13, 3 }, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_COMMON_DATA_H_ diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c new file mode 100644 index 0000000000..bb72e0cbd2 --- /dev/null +++ b/third_party/aom/av1/common/convolve.c @@ -0,0 +1,1508 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/resize.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filters, int x0_qn, + int x_step_qn) { + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_qn = x0_qn; + for (int x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; + const int x_filter_idx = + (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + assert(x_filter_idx <= RS_SUBPEL_MASK); + const int16_t *const x_filter = + &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; + int sum = 0; + for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_qn += x_step_qn; + } + src += src_stride; + dst += dst_stride; + } +} + +void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const int16_t *x_filters, int x0_qn, + int x_step_qn, int bd) { + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_qn = x0_qn; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; + const int x_filter_idx = + (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + assert(x_filter_idx <= RS_SUBPEL_MASK); + const int16_t *const x_filter = + &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; + int sum = 0; + for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + x_qn += x_step_qn; + } + src += src_stride; + dst += dst_stride; + } +} + +void 
av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + + // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result can + // be beyond the following range. For better prediction, a clamping can be + // added for 12 tap filter to ensure the horizontal filtering result is + // within 16 bit. The same applies to the vertical filtering. 
+ assert(filter_params_x->taps > 8 || + (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(filter_params_y->taps > 8 || + (0 <= sum && sum < (1 << (offset_bits + 2)))); + int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); + } + } +} + +void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + const int fo_vert = filter_params_y->taps / 2 - 1; + + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS)); + } + } +} + +void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, ConvolveParams *conv_params) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + + 
assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); + } + } +} + +// This function is exactly the same as av1_convolve_2d_sr_c, and is an +// optimized version for intrabc. Use the following 2-tap filter: +// DECLARE_ALIGNED(256, static const int16_t, +// av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { +// 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// }; +void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(subpel_x_qn == 8); + assert(subpel_y_qn == 8); + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + (void)filter_params_x; + (void)subpel_x_qn; + (void)filter_params_y; + (void)subpel_y_qn; + (void)conv_params; + + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + const int bd = 8; + + // horizontal filter + // explicitly operate for subpel_x_qn = 8. 
+ int16_t *im = im_block; + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + const int32_t sum = (1 << bd) + src[x] + src[x + 1]; + assert(0 <= sum && sum < (1 << (bd + 2))); + im[x] = sum; + } + src += src_stride; + im += im_stride; + } + + // vertical filter + // explicitly operate for subpel_y_qn = 8. + int16_t *src_vert = im_block; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + const int32_t sum = + (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x]; + assert(0 <= sum && sum < (1 << (bd + 4))); + const int16_t res = + ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1))); + dst[x] = clip_pixel(res); + } + src_vert += im_stride; + dst += dst_stride; + } +} + +// This function is exactly the same as av1_convolve_y_sr_c, and is an +// optimized version for intrabc. +void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + assert(subpel_y_qn == 8); + assert(filter_params_y->taps == 2); + (void)filter_params_y; + (void)subpel_y_qn; + + // vertical filter + // explicitly operate for subpel_y_qn = 8. + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + const int32_t res = src[x] + src[src_stride + x]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1)); + } + src += src_stride; + dst += dst_stride; + } +} + +// This function is exactly the same as av1_convolve_x_sr_c, and is an +// optimized version for intrabc. 
+void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride, int w, int h,
+                                 const InterpFilterParams *filter_params_x,
+                                 const int subpel_x_qn,
+                                 ConvolveParams *conv_params) {
+  // The filter arguments only feed the sanity checks: the loop below is
+  // written out for the 2-tap filter at subpel_x_qn == 8.
+  assert(subpel_x_qn == 8);
+  assert(filter_params_x->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+  (void)conv_params;
+
+  // Horizontal pass: add each pixel to its right-hand neighbour and apply
+  // ROUND_POWER_OF_TWO(., 1) to the sum.
+  for (int row = 0; row < h; ++row) {
+    const uint8_t *const in_row = src + row * src_stride;
+    uint8_t *const out_row = dst + row * dst_stride;
+    for (int col = 0; col < w; ++col) {
+      const int32_t pair_sum = in_row[col] + in_row[col + 1];
+      out_row[col] = clip_pixel(ROUND_POWER_OF_TWO(pair_sum, 1));
+    }
+  }
+}
+
+// Two-pass (horizontal, then vertical) compound convolution with bd fixed at
+// 8. When conv_params->do_average is clear, the offset intermediate result is
+// stored to conv_params->dst; otherwise it is combined with the value already
+// stored there and the final pixel is written to dst.
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
+                                uint8_t *dst, int dst_stride, int w, int h,
+                                const InterpFilterParams *filter_params_x,
+                                const InterpFilterParams *filter_params_y,
+                                const int subpel_x_qn, const int subpel_y_qn,
+                                ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int im_stride = w;
+  const int im_h = h + filter_params_y->taps - 1;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+
+  // Horizontal pass. It starts fo_vert rows above the block and fills im_h
+  // intermediate rows.
+  const uint8_t *const src_top = src - fo_vert * src_stride;
+  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  for (int row = 0; row < im_h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const uint8_t *const window =
+          src_top + (row * src_stride + col - fo_horiz);
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (int tap = 0; tap < filter_params_x->taps; ++tap) {
+        sum += kernel_x[tap] * window[tap];
+      }
+      assert(filter_params_x->taps > 8 ||
+             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
+      im_block[row * im_stride + col] =
+          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+    }
+  }
+
+  // Vertical pass. Output row `row` reads intermediate rows
+  // row .. row + taps - 1 (the fo_vert centring offset cancels out).
+  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const int16_t *const column = im_block + (row * im_stride + col);
+      int32_t sum = 1 << offset_bits;
+      for (int tap = 0; tap < filter_params_y->taps; ++tap) {
+        sum += kernel_y[tap] * column[tap * im_stride];
+      }
+      assert(filter_params_y->taps > 8 ||
+             (0 <= sum && sum < (1 << (offset_bits + 2))));
+      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (!conv_params->do_average) {
+        comp_buf[row * comp_stride + col] = res;
+        continue;
+      }
+
+      int32_t blended = comp_buf[row * comp_stride + col];
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blended =
+            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
+        blended = blended >> DIST_PRECISION_BITS;
+      } else {
+        blended = (blended + res) >> 1;
+      }
+      // Remove the offset carried by the stored values before the last
+      // rounding step.
+      blended -= (1 << (offset_bits - conv_params->round_1)) +
+                 (1 << (offset_bits - conv_params->round_1 - 1));
+      dst[row * dst_stride + col] =
+          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
+    }
+  }
+}
+
+// Vertical-only compound convolution with bd fixed at 8. Output handling
+// follows conv_params->do_average: store to conv_params->dst when clear,
+// otherwise combine with the stored value and write the pixel to dst.
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, int w, int h,
+                               const InterpFilterParams *filter_params_y,
+                               const int subpel_y_qn,
+                               ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      // First source sample touched by the filter for this output position.
+      const uint8_t *const column = src + ((row - fo_vert) * src_stride + col);
+      int32_t res = 0;
+      for (int tap = 0; tap < filter_params_y->taps; ++tap) {
+        res += kernel_y[tap] * column[tap * src_stride];
+      }
+      res *= (1 << bits);
+      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
+
+      if (!conv_params->do_average) {
+        comp_buf[row * comp_stride + col] = res;
+        continue;
+      }
+
+      int32_t blended = comp_buf[row * comp_stride + col];
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blended =
+            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
+        blended = blended >> DIST_PRECISION_BITS;
+      } else {
+        blended = (blended + res) >> 1;
+      }
+      blended -= round_offset;
+      dst[row * dst_stride + col] =
+          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
+    }
+  }
+}
+
+// Horizontal-only compound convolution with bd fixed at 8. Output handling
+// follows conv_params->do_average: store to conv_params->dst when clear,
+// otherwise combine with the stored value and write the pixel to dst.
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, int w, int h,
+                               const InterpFilterParams *filter_params_x,
+                               const int subpel_x_qn,
+                               ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bits = FILTER_BITS - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const uint8_t *const window = src + (row * src_stride + col - fo_horiz);
+      int32_t res = 0;
+      for (int tap = 0; tap < filter_params_x->taps; ++tap) {
+        res += kernel_x[tap] * window[tap];
+      }
+      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
+      res += round_offset;
+
+      if (!conv_params->do_average) {
+        comp_buf[row * comp_stride + col] = res;
+        continue;
+      }
+
+      int32_t blended = comp_buf[row * comp_stride + col];
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blended =
+            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
+        blended = blended >> DIST_PRECISION_BITS;
+      } else {
+        blended = (blended + res) >> 1;
+      }
+      blended -= round_offset;
+      dst[row * dst_stride + col] =
+          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
+    }
+  }
+}
+
+// Compound path with no filtering: each source pixel is shifted left by
+// `bits` and offset, then either stored to conv_params->dst or combined with
+// the value already stored there, depending on conv_params->do_average.
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
+                                     uint8_t *dst, int dst_stride, int w, int h,
+                                     ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+
+  for (int row = 0; row < h; ++row) {
+    const uint8_t *const in_row = src + row * src_stride;
+    for (int col = 0; col < w; ++col) {
+      CONV_BUF_TYPE res = in_row[col] << bits;
+      res += round_offset;
+
+      if (!conv_params->do_average) {
+        comp_buf[row * comp_stride + col] = res;
+        continue;
+      }
+
+      int32_t blended = comp_buf[row * comp_stride + col];
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blended =
+            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
+        blended = blended >> DIST_PRECISION_BITS;
+      } else {
+        blended = (blended + res) >> 1;
+      }
+      blended -= round_offset;
+      dst[row * dst_stride + col] =
+          clip_pixel(ROUND_POWER_OF_TWO(blended, bits));
+    }
+  }
+}
+
+// Scaled 2D convolution with bd fixed at 8. The source position advances by
+// x_step_qn / y_step_qn per output sample and the kernel is re-selected for
+// every sample from the fractional part of that position. Handles both the
+// compound (conv_params->is_compound) and the single-reference output paths.
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
+                             const int subpel_x_qn, const int x_step_qn,
+                             const int subpel_y_qn, const int y_step_qn,
+                             ConvolveParams *conv_params) {
+  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+  const int im_h =
+      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+      filter_params_y->taps;
+  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  assert(bits >= 0);
+  const int im_stride = w;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bd = 8;
+
+  // Horizontal pass, starting fo_vert rows above the block.
+  for (int row = 0; row < im_h; ++row) {
+    const uint8_t *const in_row = src + (row - fo_vert) * src_stride;
+    int x_qn = subpel_x_qn;
+    for (int col = 0; col < w; ++col, x_qn += x_step_qn) {
+      const uint8_t *const window =
+          in_row + ((x_qn >> SCALE_SUBPEL_BITS) - fo_horiz);
+      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+      assert(x_filter_idx < SUBPEL_SHIFTS);
+      const int16_t *const kernel_x =
+          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (int tap = 0; tap < filter_params_x->taps; ++tap) {
+        sum += kernel_x[tap] * window[tap];
+      }
+      assert(filter_params_x->taps > 8 ||
+             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
+      im_block[row * im_stride + col] =
+          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+    }
+  }
+
+  // Vertical pass, column by column. The filter for an output sample starts
+  // at intermediate row (y_qn >> SCALE_SUBPEL_BITS); the fo_vert centring
+  // offset cancels out.
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  for (int col = 0; col < w; ++col) {
+    int y_qn = subpel_y_qn;
+    for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
+      const int16_t *const column =
+          im_block + ((y_qn >> SCALE_SUBPEL_BITS) * im_stride + col);
+      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+      assert(y_filter_idx < SUBPEL_SHIFTS);
+      const int16_t *const kernel_y =
+          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
+      int32_t sum = 1 << offset_bits;
+      for (int tap = 0; tap < filter_params_y->taps; ++tap) {
+        sum += kernel_y[tap] * column[tap * im_stride];
+      }
+      assert(filter_params_y->taps > 8 ||
+             (0 <= sum && sum < (1 << (offset_bits + 2))));
+      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+
+      // First compound prediction: keep the offset intermediate value.
+      if (conv_params->is_compound && !conv_params->do_average) {
+        comp_buf[row * comp_stride + col] = res;
+        continue;
+      }
+
+      int32_t value;
+      if (conv_params->is_compound) {
+        value = comp_buf[row * comp_stride + col];
+        if (conv_params->use_dist_wtd_comp_avg) {
+          value =
+              value * conv_params->fwd_offset + res * conv_params->bck_offset;
+          value = value >> DIST_PRECISION_BITS;
+        } else {
+          value = (value + res) >> 1;
+        }
+      } else {
+        value = res;
+      }
+      /* Subtract round offset and convolve round */
+      value = value - ((1 << (offset_bits - conv_params->round_1)) +
+                       (1 << (offset_bits - conv_params->round_1 - 1)));
+      dst[row * dst_stride + col] = clip_pixel(ROUND_POWER_OF_TWO(value, bits));
+    }
+  }
+}
+
+// Forwards to av1_convolve_2d_scale after checking that a compound call
+// supplies a compound buffer.
+static void convolve_2d_scale_wrapper(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+    ConvolveParams *conv_params) {
+  assert(!conv_params->is_compound || conv_params->dst != NULL);
+  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
+                        y_step_qn, conv_params);
+}
+
+// Chooses the compound kernel from which sub-pixel offsets are non-zero:
+// both -> 2D, x only -> horizontal, y only -> vertical, neither -> copy.
+static void convolve_2d_facade_compound(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
+  const bool need_x = subpel_x_qn != 0;
+  const bool need_y = subpel_y_qn != 0;
+  if (need_x && need_y) {
+    assert(need_y && need_x);
+    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
+                             filter_params_x, filter_params_y, subpel_x_qn,
+                             subpel_y_qn, conv_params);
+  } else if (need_x) {
+    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
+                            filter_params_x, subpel_x_qn, conv_params);
+  } else if (need_y) {
+    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
+                            filter_params_y, subpel_y_qn, conv_params);
+  } else {
+    av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
+                                  conv_params);
+  }
+}
+
+// Chooses the single-reference kernel from which sub-pixel offsets are
+// non-zero: both -> 2D, x only -> horizontal, y only -> vertical,
+// neither -> plain copy.
+static void convolve_2d_facade_single(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params) {
+  const bool need_x = subpel_x_qn != 0;
+  const bool need_y = subpel_y_qn != 0;
+  if (need_x && need_y) {
+    assert(need_x && need_y);
+    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
+  } else if (need_x) {
+    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                      subpel_x_qn, conv_params);
+  } else if (need_y) {
+    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
+                      subpel_y_qn);
+  } else {
+    aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
+  }
+}
+
+// Entry point for 8-bit convolution. interp_filters[0] is the horizontal
+// filter and interp_filters[1] the vertical one. 2-tap (IntraBC) filters with
+// a non-zero sub-pixel offset go to the dedicated C kernels; everything else
+// is dispatched on `scaled` and conv_params->is_compound.
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilterParams *interp_filters[2],
+                            const int subpel_x_qn, int x_step_q4,
+                            const int subpel_y_qn, int y_step_q4, int scaled,
+                            ConvolveParams *conv_params) {
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)dst;
+  (void)dst_stride;
+
+  const InterpFilterParams *const filter_params_x = interp_filters[0];
+  const InterpFilterParams *const filter_params_y = interp_filters[1];
+
+  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
+  // 2-tap filter indicates that it is for IntraBC.
+  const bool has_2tap_filter =
+      filter_params_x->taps == 2 || filter_params_y->taps == 2;
+  if (has_2tap_filter) {
+    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+    assert(!scaled);
+    if (subpel_x_qn && subpel_y_qn) {
+      av1_convolve_2d_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_x, filter_params_y,
+                                   subpel_x_qn, subpel_y_qn, conv_params);
+      return;
+    }
+    if (subpel_x_qn) {
+      av1_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
+                                  filter_params_x, subpel_x_qn, conv_params);
+      return;
+    }
+    if (subpel_y_qn) {
+      av1_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
+                                  filter_params_y, subpel_y_qn);
+      return;
+    }
+    // Both offsets are zero: continue with the regular dispatch below.
+  }
+
+  if (scaled) {
+    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
+                              filter_params_x, filter_params_y, subpel_x_qn,
+                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
+    return;
+  }
+  if (conv_params->is_compound) {
+    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
+                                filter_params_x, filter_params_y, subpel_x_qn,
+                                subpel_y_qn, conv_params);
+    return;
+  }
+  convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
+                            filter_params_x, filter_params_y, subpel_x_qn,
+                            subpel_y_qn, conv_params);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Horizontal-only single-reference convolution on 16-bit samples; the result
+// is clipped with clip_pixel_highbd(., bd).
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
+                                uint16_t *dst, int dst_stride, int w, int h,
+                                const InterpFilterParams *filter_params_x,
+                                const int subpel_x_qn,
+                                ConvolveParams *conv_params, int bd) {
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const uint16_t *const window = src + (row * src_stride + col - fo_horiz);
+      int32_t res = 0;
+      for (int tap = 0; tap < filter_params_x->taps; ++tap) {
+        res += kernel_x[tap] * window[tap];
+      }
+      // Two rounding stages: round_0 first, then the remaining `bits`.
+      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
+      dst[row * dst_stride + col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+    }
+  }
+}
+
+// Vertical-only single-reference convolution on 16-bit samples, rounded by
+// FILTER_BITS and clipped with clip_pixel_highbd(., bd).
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
+                                uint16_t *dst, int dst_stride, int w, int h,
+                                const InterpFilterParams *filter_params_y,
+                                const int subpel_y_qn, int bd) {
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const uint16_t *const column = src + ((row - fo_vert) * src_stride + col);
+      int32_t res = 0;
+      for (int tap = 0; tap < filter_params_y->taps; ++tap) {
+        res += kernel_y[tap] * column[tap * src_stride];
+      }
+      dst[row * dst_stride + col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
+    }
+  }
+}
+
+// Two-pass single-reference convolution on 16-bit samples: a horizontal pass
+// into an intermediate buffer, then a vertical pass whose offset is removed
+// before the final rounding and clipping.
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
+                                 uint16_t *dst, int dst_stride, int w, int h,
+                                 const InterpFilterParams *filter_params_x,
+                                 const InterpFilterParams *filter_params_y,
+                                 const int subpel_x_qn, const int subpel_y_qn,
+                                 ConvolveParams *conv_params, int bd) {
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  const int im_h = h + filter_params_y->taps - 1;
+  const int im_stride = w;
+  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  assert(bits >= 0);
+
+  // Horizontal pass, starting fo_vert rows above the block.
+  const uint16_t *const src_top = src - fo_vert * src_stride;
+  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  for (int row = 0; row < im_h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const uint16_t *const window =
+          src_top + (row * src_stride + col - fo_horiz);
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (int tap = 0; tap < filter_params_x->taps; ++tap) {
+        sum += kernel_x[tap] * window[tap];
+      }
+      assert(filter_params_x->taps > 8 ||
+             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
+      im_block[row * im_stride + col] =
+          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+    }
+  }
+
+  // Vertical pass. Output row `row` reads intermediate rows
+  // row .. row + taps - 1 (the fo_vert centring offset cancels out).
+  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const int16_t *const column = im_block + (row * im_stride + col);
+      int32_t sum = 1 << offset_bits;
+      for (int tap = 0; tap < filter_params_y->taps; ++tap) {
+        sum += kernel_y[tap] * column[tap * im_stride];
+      }
+      assert(filter_params_y->taps > 8 ||
+             (0 <= sum && sum < (1 << (offset_bits + 2))));
+      const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                          ((1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1)));
+      dst[row * dst_stride + col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+    }
+  }
+}
+
+// This function is exactly the same as av1_highbd_convolve_2d_sr_c, and is an
+// optimized version for intrabc. Use the following 2-tap filter:
+// DECLARE_ALIGNED(256, static const int16_t,
+//                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
+//   128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+//   64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// };
+void av1_highbd_convolve_2d_sr_intrabc_c(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  assert(bits >= 0);
+  assert(subpel_x_qn == 8);
+  assert(subpel_y_qn == 8);
+  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+  (void)conv_params;
+
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  const int im_h = h + 1;
+  const int im_stride = w;
+  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+
+  // Horizontal pass, written out for subpel_x_qn == 8: both taps are 64.
+  // One extra row (im_h = h + 1) is produced for the vertical pass.
+  for (int row = 0; row < im_h; ++row) {
+    const uint16_t *const in_row = src + row * src_stride;
+    int16_t *const im_row = im_block + row * im_stride;
+    for (int col = 0; col < w; ++col) {
+      int32_t sum =
+          (1 << (bd + FILTER_BITS - 1)) + 64 * (in_row[col] + in_row[col + 1]);
+      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+      sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+      im_row[col] = sum;
+    }
+  }
+
+  // Vertical pass, written out for subpel_y_qn == 8: combines each
+  // intermediate row with the row below it.
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  for (int row = 0; row < h; ++row) {
+    const int16_t *const upper = im_block + row * im_stride;
+    uint16_t *const out_row = dst + row * dst_stride;
+    for (int col = 0; col < w; ++col) {
+      const int32_t sum =
+          (1 << offset_bits) + 64 * (upper[col] + upper[im_stride + col]);
+      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+      const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                          ((1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1)));
+
+      out_row[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+    }
+  }
+}
+
+// This function is exactly the same as av1_highbd_convolve_y_sr_c, and is an
+// optimized version for intrabc.
+void av1_highbd_convolve_y_sr_intrabc_c(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
+    int bd) {
+  assert(subpel_y_qn == 8);
+  assert(filter_params_y->taps == 2);
+  (void)filter_params_y;
+  (void)subpel_y_qn;
+
+  // Vertical pass, written out for subpel_y_qn == 8: add each sample to the
+  // one directly below it and apply ROUND_POWER_OF_TWO(., 1).
+  for (int row = 0; row < h; ++row) {
+    const uint16_t *const in_row = src + row * src_stride;
+    uint16_t *const out_row = dst + row * dst_stride;
+    for (int col = 0; col < w; ++col) {
+      const int32_t pair_sum = in_row[col] + in_row[src_stride + col];
+      out_row[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(pair_sum, 1), bd);
+    }
+  }
+}
+
+// This function is exactly the same as av1_highbd_convolve_x_sr_c, and is an
+// optimized version for intrabc.
+void av1_highbd_convolve_x_sr_intrabc_c(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
+    ConvolveParams *conv_params, int bd) {
+  const int bits = FILTER_BITS - conv_params->round_0;
+  assert(bits >= 0);
+  assert(subpel_x_qn == 8);
+  assert(filter_params_x->taps == 2);
+  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+
+  // Horizontal pass, written out for subpel_x_qn == 8: both taps are 64, and
+  // the sum is rounded by round_0 and then by the remaining `bits`.
+  for (int row = 0; row < h; ++row) {
+    const uint16_t *const in_row = src + row * src_stride;
+    uint16_t *const out_row = dst + row * dst_stride;
+    for (int col = 0; col < w; ++col) {
+      int32_t res = 64 * (in_row[col] + in_row[col + 1]);
+      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
+      out_row[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+    }
+  }
+}
+
+// Two-pass (horizontal, then vertical) compound convolution on 16-bit
+// samples. When conv_params->do_average is clear, the offset intermediate
+// result is stored to conv_params->dst; otherwise it is combined with the
+// value already stored there and the final pixel is written to dst.
+void av1_highbd_dist_wtd_convolve_2d_c(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int im_h = h + filter_params_y->taps - 1;
+  const int im_stride = w;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  assert(round_bits >= 0);
+
+  // Horizontal pass, starting fo_vert rows above the block.
+  const uint16_t *const src_top = src - fo_vert * src_stride;
+  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  for (int row = 0; row < im_h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const uint16_t *const window =
+          src_top + (row * src_stride + col - fo_horiz);
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (int tap = 0; tap < filter_params_x->taps; ++tap) {
+        sum += kernel_x[tap] * window[tap];
+      }
+      assert(filter_params_x->taps > 8 ||
+             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
+      (void)bd;
+      im_block[row * im_stride + col] =
+          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+    }
+  }
+
+  // Vertical pass. Output row `row` reads intermediate rows
+  // row .. row + taps - 1 (the fo_vert centring offset cancels out).
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const int16_t *const column = im_block + (row * im_stride + col);
+      int32_t sum = 1 << offset_bits;
+      for (int tap = 0; tap < filter_params_y->taps; ++tap) {
+        sum += kernel_y[tap] * column[tap * im_stride];
+      }
+      assert(filter_params_y->taps > 8 ||
+             (0 <= sum && sum < (1 << (offset_bits + 2))));
+      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (!conv_params->do_average) {
+        comp_buf[row * comp_stride + col] = res;
+        continue;
+      }
+
+      int32_t blended = comp_buf[row * comp_stride + col];
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blended =
+            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
+        blended = blended >> DIST_PRECISION_BITS;
+      } else {
+        blended = (blended + res) >> 1;
+      }
+      // Remove the offset carried by the stored values before the last
+      // rounding step.
+      blended -= (1 << (offset_bits - conv_params->round_1)) +
+                 (1 << (offset_bits - conv_params->round_1 - 1));
+      dst[row * dst_stride + col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
+    }
+  }
+}
+
+// Horizontal-only compound convolution on 16-bit samples. Output handling
+// follows conv_params->do_average: store to conv_params->dst when clear,
+// otherwise combine with the stored value and write the pixel to dst.
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
+                                      uint16_t *dst, int dst_stride, int w,
+                                      int h,
+                                      const InterpFilterParams *filter_params_x,
+                                      const int subpel_x_qn,
+                                      ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bits = FILTER_BITS - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  assert(round_bits >= 0);
+  assert(bits >= 0);
+
+  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_qn & SUBPEL_MASK);
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      const uint16_t *const window = src + (row * src_stride + col - fo_horiz);
+      int32_t res = 0;
+      for (int tap = 0; tap < filter_params_x->taps; ++tap) {
+        res += kernel_x[tap] * window[tap];
+      }
+      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
+      res += round_offset;
+
+      if (!conv_params->do_average) {
+        comp_buf[row * comp_stride + col] = res;
+        continue;
+      }
+
+      int32_t blended = comp_buf[row * comp_stride + col];
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blended =
+            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
+        blended = blended >> DIST_PRECISION_BITS;
+      } else {
+        blended = (blended + res) >> 1;
+      }
+      blended -= round_offset;
+      dst[row * dst_stride + col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
+    }
+  }
+}
+
+// Vertical-only compound convolution on 16-bit samples. Output handling
+// follows conv_params->do_average: store to conv_params->dst when clear,
+// otherwise combine with the stored value and write the pixel to dst.
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
+                                      uint16_t *dst, int dst_stride, int w,
+                                      int h,
+                                      const InterpFilterParams *filter_params_y,
+                                      const int subpel_y_qn,
+                                      ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  assert(round_bits >= 0);
+  assert(bits >= 0);
+
+  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      // First source sample touched by the filter for this output position.
+      const uint16_t *const column = src + ((row - fo_vert) * src_stride + col);
+      int32_t res = 0;
+      for (int tap = 0; tap < filter_params_y->taps; ++tap) {
+        res += kernel_y[tap] * column[tap * src_stride];
+      }
+      res *= (1 << bits);
+      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
+
+      if (!conv_params->do_average) {
+        comp_buf[row * comp_stride + col] = res;
+        continue;
+      }
+
+      int32_t blended = comp_buf[row * comp_stride + col];
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blended =
+            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
+        blended = blended >> DIST_PRECISION_BITS;
+      } else {
+        blended = (blended + res) >> 1;
+      }
+      blended -= round_offset;
+      dst[row * dst_stride + col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
+    }
+  }
+}
+
+// Compound path with no filtering on 16-bit samples: each source sample is
+// shifted left by `bits` and offset, then either stored to conv_params->dst
+// or combined with the value already stored there, depending on
+// conv_params->do_average.
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
+                                            uint16_t *dst, int dst_stride,
+                                            int w, int h,
+                                            ConvolveParams *conv_params,
+                                            int bd) {
+  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  assert(bits >= 0);
+
+  for (int row = 0; row < h; ++row) {
+    const uint16_t *const in_row = src + row * src_stride;
+    for (int col = 0; col < w; ++col) {
+      CONV_BUF_TYPE res = in_row[col] << bits;
+      res += round_offset;
+      if (!conv_params->do_average) {
+        comp_buf[row * comp_stride + col] = res;
+        continue;
+      }
+
+      int32_t blended = comp_buf[row * comp_stride + col];
+      if (conv_params->use_dist_wtd_comp_avg) {
+        blended =
+            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
+        blended = blended >> DIST_PRECISION_BITS;
+      } else {
+        blended = (blended + res) >> 1;
+      }
+      blended -= round_offset;
+      dst[row * dst_stride + col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, bits), bd);
+    }
+  }
+}
+
+// Scaled 2D convolution on 16-bit samples. The source position advances by
+// x_step_qn / y_step_qn per output sample and the kernel is re-selected for
+// every sample from the fractional part of that position. Handles both the
+// compound (conv_params->is_compound) and the single-reference output paths.
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    const InterpFilterParams *filter_params_x,
+                                    const InterpFilterParams *filter_params_y,
+                                    const int subpel_x_qn, const int x_step_qn,
+                                    const int subpel_y_qn, const int y_step_qn,
+                                    ConvolveParams *conv_params, int bd) {
+  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+  const int im_h =
+      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+      filter_params_y->taps;
+  const int im_stride = w;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
+  const int comp_stride = conv_params->dst_stride;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  assert(bits >= 0);
+
+  // Horizontal pass, starting fo_vert rows above the block.
+  for (int row = 0; row < im_h; ++row) {
+    const uint16_t *const in_row = src + (row - fo_vert) * src_stride;
+    int x_qn = subpel_x_qn;
+    for (int col = 0; col < w; ++col, x_qn += x_step_qn) {
+      const uint16_t *const window =
+          in_row + ((x_qn >> SCALE_SUBPEL_BITS) - fo_horiz);
+      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+      assert(x_filter_idx < SUBPEL_SHIFTS);
+      const int16_t *const kernel_x =
+          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (int tap = 0; tap < filter_params_x->taps; ++tap) {
+        sum += kernel_x[tap] * window[tap];
+      }
+      assert(filter_params_x->taps > 8 ||
+             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
+      im_block[row * im_stride + col] =
+          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+    }
+  }
+
+  // Vertical pass, column by column. The filter for an output sample starts
+  // at intermediate row (y_qn >> SCALE_SUBPEL_BITS); the fo_vert centring
+  // offset cancels out.
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  for (int col = 0; col < w; ++col) {
+    int y_qn = subpel_y_qn;
+    for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
+      const int16_t *const column =
+          im_block + ((y_qn >> SCALE_SUBPEL_BITS) * im_stride + col);
+      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+      assert(y_filter_idx < SUBPEL_SHIFTS);
+      const int16_t *const kernel_y =
+          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
+      int32_t sum = 1 << offset_bits;
+      for (int tap = 0; tap < filter_params_y->taps; ++tap) {
+        sum += kernel_y[tap] * column[tap * im_stride];
+      }
+      assert(filter_params_y->taps > 8 ||
+             (0 <= sum && sum < (1 << (offset_bits + 2))));
+      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+
+      // First compound prediction: keep the offset intermediate value.
+      if (conv_params->is_compound && !conv_params->do_average) {
+        comp_buf[row * comp_stride + col] = res;
+        continue;
+      }
+
+      int32_t value;
+      if (conv_params->is_compound) {
+        value = comp_buf[row * comp_stride + col];
+        if (conv_params->use_dist_wtd_comp_avg) {
+          value =
+              value * conv_params->fwd_offset + res * conv_params->bck_offset;
+          value = value >> DIST_PRECISION_BITS;
+        } else {
+          value = (value + res) >> 1;
+        }
+      } else {
+        value = res;
+      }
+      /* Subtract round offset and convolve round */
+      value = value - ((1 << (offset_bits - conv_params->round_1)) +
+                       (1 << (offset_bits - conv_params->round_1 - 1)));
+      dst[row * dst_stride + col] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(value, bits), bd);
+    }
+  }
+}
+
+static void highbd_convolve_2d_facade_compound(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
+    const int w, const int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  const bool need_x = subpel_x_qn != 0;
+  const bool need_y = subpel_y_qn != 0;
+  if (!need_x && !need_y) {
+    av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
+                                         conv_params, bd);
+  } else if (need_x && !need_y) {
+    av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_x, subpel_x_qn,
conv_params, + bd); + } else if (!need_x && need_y) { + av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, conv_params, + bd); + } else { + assert(need_x && need_y); + av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_qn, subpel_y_qn, conv_params, bd); + } +} + +static void highbd_convolve_2d_facade_single( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + const int w, const int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + const bool need_x = subpel_x_qn != 0; + const bool need_y = subpel_y_qn != 0; + + if (!need_x && !need_y) { + aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h); + } else if (need_x && !need_y) { + av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params, bd); + } else if (!need_x && need_y) { + av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, bd); + } else { + assert(need_x && need_y); + av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params, bd); + } +} + +void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, + uint8_t *dst8, int dst_stride, int w, int h, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, + int scaled, ConvolveParams *conv_params, + int bd) { + (void)x_step_q4; + (void)y_step_q4; + (void)dst_stride; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + + const InterpFilterParams *filter_params_x = interp_filters[0]; + const InterpFilterParams *filter_params_y = interp_filters[1]; + + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + // 2-tap filter indicates that it is 
for IntraBC. + if (filter_params_x->taps == 2 || filter_params_y->taps == 2) { + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert(!scaled); + if (subpel_x_qn && subpel_y_qn) { + av1_highbd_convolve_2d_sr_intrabc_c( + src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); + return; + } else if (subpel_x_qn) { + av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, + conv_params, bd); + return; + } else if (subpel_y_qn) { + av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, bd); + return; + } + } + + if (scaled) { + if (conv_params->is_compound) { + assert(conv_params->dst != NULL); + } + av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_q4, subpel_y_qn, y_step_q4, conv_params, + bd); + } else if (conv_params->is_compound) { + highbd_convolve_2d_facade_compound( + src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); + } else { + highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_qn, subpel_y_qn, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Note: Fixed size intermediate buffers, place limits on parameters +// of some functions. 2d filtering proceeds in 2 steps: +// (1) Interpolate horizontally into an intermediate buffer, temp. +// (2) Interpolate temp vertically to derive the sub-pixel result. +// Deriving the maximum number of rows in the temp buffer (135): +// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). +// --Largest block size is 128x128 pixels. +// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the +// original frame (in 1/16th pixel units). 
+// --Must round-up because block may be located at sub-pixel position. +// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. +// --((128 - 1) * 32 + 15) >> 4 + 8 = 263. +#define WIENER_MAX_EXT_SIZE 263 + +static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int highbd_horz_scalar_product(const uint16_t *a, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; +} +#endif + +static INLINE int highbd_vert_scalar_product(const uint16_t *a, + ptrdiff_t a_stride, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; + return sum; +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. 
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, + int round0_bits) { + const int bd = 8; + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + WIENER_CLAMP_LIMIT(round0_bits, bd) - 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, + int round1_bits) { + const int bd = 8; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t 
dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const WienerConvolveParams *conv_params) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4, + x_step_q4, w, intermediate_height, + conv_params->round_0); + convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, conv_params->round_1); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_convolve_add_src_horiz_hip( + const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int round0_bits, int bd) { + const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + extraprec_clamp_limit - 
1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_add_src_vert_hip( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int round1_bits, int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void av1_highbd_wiener_convolve_add_src_c( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const WienerConvolveParams *conv_params, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + + highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, + x0_q4, x_step_q4, w, intermediate_height, + 
conv_params->round_0, bd); + highbd_convolve_add_src_vert_hip( + temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, + filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h new file mode 100644 index 0000000000..d6dd8763c3 --- /dev/null +++ b/third_party/aom/av1/common/convolve.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_CONVOLVE_H_ +#define AOM_AV1_COMMON_CONVOLVE_H_ +#include "av1/common/filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint16_t CONV_BUF_TYPE; +typedef struct ConvolveParams { + int do_average; + CONV_BUF_TYPE *dst; + int dst_stride; + int round_0; + int round_1; + int plane; + int is_compound; + int use_dist_wtd_comp_avg; + int fwd_offset; + int bck_offset; +} ConvolveParams; + +typedef struct WienerConvolveParams { + int round_0; + int round_1; +} WienerConvolveParams; + +#define ROUND0_BITS 3 +#define COMPOUND_ROUND1_BITS 7 +#define WIENER_ROUND0_BITS 3 + +#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0)) + +typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params); + +typedef void 
(*aom_highbd_convolve_fn_t)( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd); + +struct AV1Common; +struct scale_factors; + +void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, + ConvolveParams *conv_params); + +static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane, + CONV_BUF_TYPE *dst, + int dst_stride, + int is_compound, int bd) { + ConvolveParams conv_params; + assert(IMPLIES(cmp_index, is_compound)); + + conv_params.is_compound = is_compound; + conv_params.use_dist_wtd_comp_avg = 0; + conv_params.round_0 = ROUND0_BITS; + conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS + : 2 * FILTER_BITS - conv_params.round_0; +#if CONFIG_AV1_HIGHBITDEPTH + const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; + assert(IMPLIES(bd < 12, intbufrange <= 16)); + if (intbufrange > 16) { + conv_params.round_0 += intbufrange - 16; + if (!is_compound) conv_params.round_1 -= intbufrange - 16; + } +#else + (void)bd; +#endif // CONFIG_AV1_HIGHBITDEPTH + // TODO(yunqing): The following dst should only be valid while + // is_compound = 1; + conv_params.dst = dst; + conv_params.dst_stride = dst_stride; + conv_params.plane = plane; + + // By default, set do average to 1 if this is the second single prediction + // in a compound mode. 
+ conv_params.do_average = cmp_index; + return conv_params; +} + +static INLINE ConvolveParams get_conv_params(int do_average, int plane, + int bd) { + return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd); +} + +static INLINE WienerConvolveParams get_conv_params_wiener(int bd) { + WienerConvolveParams conv_params; + conv_params.round_0 = WIENER_ROUND0_BITS; + conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0; + const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; + assert(IMPLIES(bd < 12, intbufrange <= 16)); + if (intbufrange > 16) { + conv_params.round_0 += intbufrange - 16; + conv_params.round_1 -= intbufrange - 16; + } + return conv_params; +} + +void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, + int scaled, ConvolveParams *conv_params, + int bd); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_CONVOLVE_H_ diff --git a/third_party/aom/av1/common/debugmodes.c b/third_party/aom/av1/common/debugmodes.c new file mode 100644 index 0000000000..7e6160f9a5 --- /dev/null +++ b/third_party/aom/av1/common/debugmodes.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) { + fprintf(f, "%s", str); + fprintf(f, "(Frame %u, Show:%d, Q:%d): \n", cm->current_frame.frame_number, + cm->show_frame, cm->quant_params.base_qindex); +} +/* This function dereferences a pointer to the mbmi structure + * and uses the passed in member offset to print out the value of an integer + * for each mbmi member value in the mi structure. + */ +static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor, + size_t member_offset) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MB_MODE_INFO **mi = mi_params->mi_grid_base; + int rows = mi_params->mi_rows; + int cols = mi_params->mi_cols; + char prefix = descriptor[0]; + + log_frame_info(cm, descriptor, file); + for (int mi_row = 0; mi_row < rows; mi_row++) { + fprintf(file, "%c ", prefix); + for (int mi_col = 0; mi_col < cols; mi_col++) { + fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset))); + mi++; + } + fprintf(file, "\n"); + mi += mi_params->mi_stride - cols; + } + fprintf(file, "\n"); +} + +void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { + CommonModeInfoParams *mi_params = &cm->mi_params; + FILE *mvs = fopen(file, "a"); + MB_MODE_INFO **mi = mi_params->mi_grid_base; + const int rows = mi_params->mi_rows; + const int cols = mi_params->mi_cols; + + print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, bsize)); + print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); + print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); + print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); + print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); + + // output skip infomation. 
+ log_frame_info(cm, "Skips:", mvs); + for (int mi_row = 0; mi_row < rows; mi_row++) { + fprintf(mvs, "S "); + for (int mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%2d ", mi[0]->skip_txfm); + mi++; + } + fprintf(mvs, "\n"); + mi += mi_params->mi_stride - cols; + } + fprintf(mvs, "\n"); + + // output motion vectors. + log_frame_info(cm, "Vectors ", mvs); + mi = mi_params->mi_grid_base; + for (int mi_row = 0; mi_row < rows; mi_row++) { + fprintf(mvs, "V "); + for (int mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col); + mi++; + } + fprintf(mvs, "\n"); + mi += mi_params->mi_stride - cols; + } + fprintf(mvs, "\n"); + + fclose(mvs); +} + +void av1_print_uncompressed_frame_header(const uint8_t *data, int size, + const char *filename) { + FILE *hdrFile = fopen(filename, "w"); + fwrite(data, size, sizeof(uint8_t), hdrFile); + + // Reset order hints(7bit + a previous bit) to 0, so that all camera frame + // headers are identical in large scale coding. + uint8_t zero = 0; + fseek(hdrFile, 1, SEEK_SET); + // Reset second byte. + fwrite(&zero, 1, sizeof(uint8_t), hdrFile); + fclose(hdrFile); +} + +void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename) { + FILE *fcFile = fopen(filename, "w"); + const uint16_t *fcp = (uint16_t *)fc; + const unsigned int n_contexts = sizeof(FRAME_CONTEXT) / sizeof(uint16_t); + unsigned int i; + + for (i = 0; i < n_contexts; ++i) fprintf(fcFile, "%d ", *fcp++); + fclose(fcFile); +} diff --git a/third_party/aom/av1/common/entropy.c b/third_party/aom/av1/common/entropy.c new file mode 100644 index 0000000000..97d95ea394 --- /dev/null +++ b/third_party/aom/av1/common/entropy.c @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/scan.h" +#include "av1/common/token_cdfs.h" +#include "av1/common/txb_common.h" + +static int get_q_ctx(int q) { + if (q <= 20) return 0; + if (q <= 60) return 1; + if (q <= 120) return 2; + return 3; +} + +void av1_default_coef_probs(AV1_COMMON *cm) { + const int index = get_q_ctx(cm->quant_params.base_qindex); +#if CONFIG_ENTROPY_STATS + cm->coef_cdf_category = index; +#endif + + av1_copy(cm->fc->txb_skip_cdf, av1_default_txb_skip_cdfs[index]); + av1_copy(cm->fc->eob_extra_cdf, av1_default_eob_extra_cdfs[index]); + av1_copy(cm->fc->dc_sign_cdf, av1_default_dc_sign_cdfs[index]); + av1_copy(cm->fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[index]); + av1_copy(cm->fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[index]); + av1_copy(cm->fc->coeff_base_eob_cdf, + av1_default_coeff_base_eob_multi_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]); +} + +static AOM_INLINE void 
reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, + int num_cdfs, int cdf_stride, + int nsymbs) { + for (int i = 0; i < num_cdfs; i++) { + cdf_ptr[i * cdf_stride + nsymbs] = 0; + } +} + +#define RESET_CDF_COUNTER(cname, nsymbs) \ + RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs)) + +#define RESET_CDF_COUNTER_STRIDE(cname, nsymbs, cdf_stride) \ + do { \ + aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname; \ + int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob); \ + int num_cdfs = array_size / cdf_stride; \ + reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \ + } while (0) + +static AOM_INLINE void reset_nmv_counter(nmv_context *nmv) { + RESET_CDF_COUNTER(nmv->joints_cdf, 4); + for (int i = 0; i < 2; i++) { + RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES); + RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2); + } +} + +void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) { + RESET_CDF_COUNTER(fc->txb_skip_cdf, 2); + RESET_CDF_COUNTER(fc->eob_extra_cdf, 2); + RESET_CDF_COUNTER(fc->dc_sign_cdf, 2); + RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5); + RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6); + RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7); + RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8); + RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9); + RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10); + RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11); + RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3); + RESET_CDF_COUNTER(fc->coeff_base_cdf, 4); + RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE); + RESET_CDF_COUNTER(fc->newmv_cdf, 2); + RESET_CDF_COUNTER(fc->zeromv_cdf, 2); + RESET_CDF_COUNTER(fc->refmv_cdf, 2); + RESET_CDF_COUNTER(fc->drl_cdf, 2); + 
RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES); + RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES); + RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16); + RESET_CDF_COUNTER(fc->interintra_cdf, 2); + RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2); + RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES); + RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES); + RESET_CDF_COUNTER(fc->obmc_cdf, 2); + RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES); + RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES); + for (int j = 0; j < PALETTE_SIZES; j++) { + int nsymbs = j + PALETTE_MIN_SIZE; + RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + } + RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2); + RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2); + RESET_CDF_COUNTER(fc->comp_inter_cdf, 2); + RESET_CDF_COUNTER(fc->single_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2); + RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2); + RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2); + RESET_CDF_COUNTER(fc->compound_index_cdf, 2); + RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2); + RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2); + RESET_CDF_COUNTER(fc->skip_txfm_cdfs, 2); + RESET_CDF_COUNTER(fc->intra_inter_cdf, 2); + reset_nmv_counter(&fc->nmvc); + reset_nmv_counter(&fc->ndvc); + RESET_CDF_COUNTER(fc->intrabc_cdf, 2); + RESET_CDF_COUNTER(fc->seg.pred_cdf, 2); + RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); + RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2); + RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES); + RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES); + RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2); + RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2); + 
RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES); + RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1, + CDF_SIZE(UV_INTRA_MODES)); + RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES); + for (int i = 0; i < PARTITION_CONTEXTS; i++) { + if (i < 4) { + RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10)); + } else if (i < 16) { + RESET_CDF_COUNTER(fc->partition_cdf[i], 10); + } else { + RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10)); + } + } + RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS); + RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES); + RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1); + RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH, + CDF_SIZE(MAX_TX_DEPTH + 1)); + RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1); + RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1); + for (int i = 0; i < FRAME_LF_COUNT; i++) { + RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1); + } + RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS); + RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE); +} diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h new file mode 100644 index 0000000000..53ef3b1c89 --- /dev/null +++ b/third_party/aom/av1/common/entropy.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ENTROPY_H_ +#define AOM_AV1_COMMON_ENTROPY_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/prob.h" + +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define TOKEN_CDF_Q_CTXS 4 + +#define TXB_SKIP_CONTEXTS 13 + +#define EOB_COEF_CONTEXTS 9 + +#define SIG_COEF_CONTEXTS_2D 26 +#define SIG_COEF_CONTEXTS_1D 16 +#define SIG_COEF_CONTEXTS_EOB 4 +#define SIG_COEF_CONTEXTS (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D) + +#define COEFF_BASE_CONTEXTS (SIG_COEF_CONTEXTS) +#define DC_SIGN_CONTEXTS 3 + +#define BR_TMP_OFFSET 12 +#define BR_REF_CAT 4 +#define LEVEL_CONTEXTS 21 + +#define NUM_BASE_LEVELS 2 + +#define BR_CDF_SIZE (4) +#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1)) + +#define COEFF_CONTEXT_BITS 3 +#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1) +#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1) + +#define BASE_CONTEXT_POSITION_NUM 12 + +enum { + TX_CLASS_2D = 0, + TX_CLASS_HORIZ = 1, + TX_CLASS_VERT = 2, + TX_CLASSES = 3, +} UENUM1BYTE(TX_CLASS); + +#define DCT_MAX_VALUE 16384 +#define DCT_MAX_VALUE_HIGH10 65536 +#define DCT_MAX_VALUE_HIGH12 262144 + +/* Coefficients are predicted via a 3-dimensional probability table indexed on + * REF_TYPES, COEF_BANDS and COEF_CONTEXTS. 
*/ +#define REF_TYPES 2 // intra=0, inter=1 + +struct AV1Common; +struct frame_contexts; +void av1_reset_cdf_symbol_counters(struct frame_contexts *fc); +void av1_default_coef_probs(struct AV1Common *cm); +void av1_init_mode_probs(struct frame_contexts *fc); + +struct frame_contexts; + +typedef char ENTROPY_CONTEXT; + +static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, + ENTROPY_CONTEXT b) { + return (a != 0) + (b != 0); +} + +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { + ENTROPY_CONTEXT above_ec = 0, left_ec = 0; + + switch (tx_size) { + case TX_4X4: + above_ec = a[0] != 0; + left_ec = l[0] != 0; + break; + case TX_4X8: + above_ec = a[0] != 0; + left_ec = !!*(const uint16_t *)l; + break; + case TX_8X4: + above_ec = !!*(const uint16_t *)a; + left_ec = l[0] != 0; + break; + case TX_8X16: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_16X8: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X32: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_32X16: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_8X8: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X16: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_32X32: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_64X64: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_32X64: + above_ec = !!*(const uint64_t *)a; + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_64X32: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!*(const uint64_t *)l; + break; + case TX_4X16: + above_ec = 
a[0] != 0; + left_ec = !!*(const uint32_t *)l; + break; + case TX_16X4: + above_ec = !!*(const uint32_t *)a; + left_ec = l[0] != 0; + break; + case TX_8X32: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_32X8: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X64: + above_ec = !!*(const uint32_t *)a; + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_64X16: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!*(const uint32_t *)l; + break; + default: assert(0 && "Invalid transform size."); break; + } + return combine_entropy_contexts(above_ec, left_ec); +} + +static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) { + return (TX_SIZE)((txsize_sqr_map[txsize] + txsize_sqr_up_map[txsize] + 1) >> + 1); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPY_H_ diff --git a/third_party/aom/av1/common/entropymode.c b/third_party/aom/av1/common/entropymode.c new file mode 100644 index 0000000000..8381c1fdd0 --- /dev/null +++ b/third_party/aom/av1/common/entropymode.c @@ -0,0 +1,1094 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_mem/aom_mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/reconinter.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" +#include "av1/common/txb_common.h" + +static const aom_cdf_prob + default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE( + INTRA_MODES)] = { + { { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244, + 24189, 28165, 29093, 30466) }, + { AOM_CDF13(12016, 18066, 19516, 20303, 20719, 21444, 21888, 23032, + 24434, 28658, 30172, 31409) }, + { AOM_CDF13(10052, 10771, 22296, 22788, 23055, 23239, 24133, 25620, + 26160, 29336, 29929, 31567) }, + { AOM_CDF13(14091, 15406, 16442, 18808, 19136, 19546, 19998, 22096, + 24746, 29585, 30958, 32462) }, + { AOM_CDF13(12122, 13265, 15603, 16501, 18609, 20033, 22391, 25583, + 26437, 30261, 31073, 32475) } }, + { { AOM_CDF13(10023, 19585, 20848, 21440, 21832, 22760, 23089, 24023, + 25381, 29014, 30482, 31436) }, + { AOM_CDF13(5983, 24099, 24560, 24886, 25066, 25795, 25913, 26423, + 27610, 29905, 31276, 31794) }, + { AOM_CDF13(7444, 12781, 20177, 20728, 21077, 21607, 22170, 23405, + 24469, 27915, 29090, 30492) }, + { AOM_CDF13(8537, 14689, 15432, 17087, 17408, 18172, 18408, 19825, + 24649, 29153, 31096, 32210) }, + { AOM_CDF13(7543, 14231, 15496, 16195, 17905, 20717, 21984, 24516, + 26001, 29675, 30981, 31994) } }, + { { AOM_CDF13(12613, 13591, 21383, 22004, 22312, 22577, 23401, 25055, + 25729, 29538, 30305, 32077) }, + { AOM_CDF13(9687, 13470, 18506, 19230, 19604, 20147, 20695, 22062, + 23219, 27743, 29211, 30907) }, + { AOM_CDF13(6183, 6505, 26024, 26252, 26366, 26434, 27082, 28354, 28555, + 30467, 30794, 32086) }, + { AOM_CDF13(10718, 11734, 14954, 17224, 17565, 17924, 18561, 21523, + 23878, 28975, 30287, 32252) }, + { AOM_CDF13(9194, 9858, 16501, 17263, 18424, 19171, 21563, 25961, 26561, + 30072, 30737, 32463) } }, + { { AOM_CDF13(12602, 14399, 15488, 18381, 18778, 19315, 19724, 21419, + 25060, 29696, 30917, 32409) }, + { 
AOM_CDF13(8203, 13821, 14524, 17105, 17439, 18131, 18404, 19468, + 25225, 29485, 31158, 32342) }, + { AOM_CDF13(8451, 9731, 15004, 17643, 18012, 18425, 19070, 21538, 24605, + 29118, 30078, 32018) }, + { AOM_CDF13(7714, 9048, 9516, 16667, 16817, 16994, 17153, 18767, 26743, + 30389, 31536, 32528) }, + { AOM_CDF13(8843, 10280, 11496, 15317, 16652, 17943, 19108, 22718, + 25769, 29953, 30983, 32485) } }, + { { AOM_CDF13(12578, 13671, 15979, 16834, 19075, 20913, 22989, 25449, + 26219, 30214, 31150, 32477) }, + { AOM_CDF13(9563, 13626, 15080, 15892, 17756, 20863, 22207, 24236, + 25380, 29653, 31143, 32277) }, + { AOM_CDF13(8356, 8901, 17616, 18256, 19350, 20106, 22598, 25947, 26466, + 29900, 30523, 32261) }, + { AOM_CDF13(10835, 11815, 13124, 16042, 17018, 18039, 18947, 22753, + 24615, 29489, 30883, 32482) }, + { AOM_CDF13(7618, 8288, 9859, 10509, 15386, 18657, 22903, 28776, 29180, + 31355, 31802, 32593) } } + }; + +static const aom_cdf_prob default_angle_delta_cdf[DIRECTIONAL_MODES][CDF_SIZE( + 2 * MAX_ANGLE_DELTA + 1)] = { + { AOM_CDF7(2180, 5032, 7567, 22776, 26989, 30217) }, + { AOM_CDF7(2301, 5608, 8801, 23487, 26974, 30330) }, + { AOM_CDF7(3780, 11018, 13699, 19354, 23083, 31286) }, + { AOM_CDF7(4581, 11226, 15147, 17138, 21834, 28397) }, + { AOM_CDF7(1737, 10927, 14509, 19588, 22745, 28823) }, + { AOM_CDF7(2664, 10176, 12485, 17650, 21600, 30495) }, + { AOM_CDF7(2240, 11096, 15453, 20341, 22561, 28917) }, + { AOM_CDF7(3605, 10428, 12459, 17676, 21244, 30655) } +}; + +static const aom_cdf_prob default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + INTRA_MODES)] = { { AOM_CDF13(22801, 23489, 24293, 24756, 25601, 26123, + 26606, 27418, 27945, 29228, 29685, 30349) }, + { AOM_CDF13(18673, 19845, 22631, 23318, 23950, 24649, + 25527, 27364, 28152, 29701, 29984, 30852) }, + { AOM_CDF13(19770, 20979, 23396, 23939, 24241, 24654, + 25136, 27073, 27830, 29360, 29730, 30659) }, + { AOM_CDF13(20155, 21301, 22838, 23178, 23261, 23533, + 23703, 24804, 25352, 26575, 27016, 28049) } 
}; + +static const aom_cdf_prob + default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES][CDF_SIZE( + UV_INTRA_MODES)] = { + { { AOM_CDF13(22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923, + 28244, 30059, 30941, 31961) }, + { AOM_CDF13(9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824, + 28359, 29505, 29800, 31796) }, + { AOM_CDF13(9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854, + 30764, 31777, 32029) }, + { AOM_CDF13(13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148, + 28577, 30612, 31355, 32493) }, + { AOM_CDF13(9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243, + 31101, 31744, 32363) }, + { AOM_CDF13(11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458, + 29711, 31161, 31441, 32550) }, + { AOM_CDF13(14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200, + 30245, 31837, 32342, 32667) }, + { AOM_CDF13(12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128, + 29267, 30643, 31961, 32461) }, + { AOM_CDF13(12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273, + 28443, 30388, 30767, 32416) }, + { AOM_CDF13(19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719, + 23174, 28861, 30379, 32175) }, + { AOM_CDF13(18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119, + 23527, 27053, 31397, 32148) }, + { AOM_CDF13(17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907, + 22482, 25896, 26541, 31819) }, + { AOM_CDF13(12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166, + 15255, 15753, 16039, 16606) } }, + { { AOM_CDF14(10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656, + 15986, 20086, 20995, 22455, 24212) }, + { AOM_CDF14(4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451, + 22099, 24228, 24693, 27032, 29472) }, + { AOM_CDF14(5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774, + 23138, 24256, 24703, 26679) }, + { AOM_CDF14(6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371, + 21520, 22206, 23389, 24182) }, + { AOM_CDF14(4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411, + 24911, 25380, 26027, 26376) }, + 
{ AOM_CDF14(5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981, + 24780, 25386, 26517, 27176) }, + { AOM_CDF14(4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803, + 23188, 23763, 24455, 24940) }, + { AOM_CDF14(6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059, + 22336, 23204, 23964, 24793) }, + { AOM_CDF14(5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898, + 22494, 23139, 24764, 25989) }, + { AOM_CDF14(10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004, + 15534, 20714, 21789, 23443, 24861) }, + { AOM_CDF14(10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235, + 15902, 20102, 22696, 23774, 25838) }, + { AOM_CDF14(10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163, + 15636, 19676, 20474, 23519, 25208) }, + { AOM_CDF14(3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248, + 9875, 10521, 29048) } } + }; + +static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE( + EXT_PARTITION_TYPES)] = { + { AOM_CDF4(19132, 25510, 30392) }, + { AOM_CDF4(13928, 19855, 28540) }, + { AOM_CDF4(12522, 23679, 28629) }, + { AOM_CDF4(9896, 18783, 25853) }, + { AOM_CDF10(15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902) }, + { AOM_CDF10(7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834) }, + { AOM_CDF10(5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117) }, + { AOM_CDF10(2662, 6362, 8614, 20860, 23053, 24778, 26436, 27829, 31171) }, + { AOM_CDF10(18462, 20920, 23124, 27647, 28227, 29049, 29519, 30178, 31544) }, + { AOM_CDF10(7689, 9060, 12056, 24992, 25660, 26182, 26951, 28041, 29052) }, + { AOM_CDF10(6015, 9009, 10062, 24544, 25409, 26545, 27071, 27526, 32047) }, + { AOM_CDF10(1394, 2208, 2796, 28614, 29061, 29466, 29840, 30185, 31899) }, + { AOM_CDF10(20137, 21547, 23078, 29566, 29837, 30261, 30524, 30892, 31724) }, + { AOM_CDF10(6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104) }, + { AOM_CDF10(5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238) }, + { AOM_CDF10(870, 1212, 1487, 
31198, 31394, 31574, 31743, 31881, 32332) }, + { AOM_CDF8(27899, 28219, 28529, 32484, 32539, 32619, 32639) }, + { AOM_CDF8(6607, 6990, 8268, 32060, 32219, 32338, 32371) }, + { AOM_CDF8(5429, 6676, 7122, 32027, 32227, 32531, 32582) }, + { AOM_CDF8(711, 966, 1172, 32448, 32538, 32617, 32664) }, +}; + +static const aom_cdf_prob default_intra_ext_tx_cdf + [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = { + { + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + }, + { + { + { AOM_CDF7(1535, 8035, 9461, 12751, 23467, 27825) }, + { AOM_CDF7(564, 3335, 9709, 10870, 18143, 28094) }, + { AOM_CDF7(672, 3247, 3676, 11982, 19415, 23127) }, + { AOM_CDF7(5279, 13885, 15487, 18044, 23527, 30252) }, + { AOM_CDF7(4423, 6074, 7985, 10416, 25693, 29298) }, + { AOM_CDF7(1486, 4241, 9460, 10662, 16456, 27694) }, + { AOM_CDF7(439, 2838, 3522, 6737, 18058, 23754) }, + { AOM_CDF7(1190, 4233, 4855, 11670, 20281, 24377) }, + { AOM_CDF7(1045, 4312, 8647, 10159, 18644, 29335) }, + { AOM_CDF7(202, 3734, 4747, 7298, 17127, 24016) }, + { AOM_CDF7(447, 4312, 6819, 8884, 16010, 23858) }, + { AOM_CDF7(277, 4369, 5255, 8905, 16465, 22271) }, + { AOM_CDF7(3409, 5436, 10599, 15599, 19687, 24040) }, + }, + { + { AOM_CDF7(1870, 13742, 14530, 16498, 23770, 27698) }, + { AOM_CDF7(326, 8796, 14632, 15079, 19272, 27486) }, + { AOM_CDF7(484, 7576, 7712, 14443, 19159, 22591) }, + { AOM_CDF7(1126, 15340, 15895, 17023, 20896, 30279) }, + { AOM_CDF7(655, 4854, 5249, 5913, 22099, 27138) }, + { AOM_CDF7(1299, 6458, 8885, 9290, 14851, 25497) }, + { 
AOM_CDF7(311, 5295, 5552, 6885, 16107, 22672) }, + { AOM_CDF7(883, 8059, 8270, 11258, 17289, 21549) }, + { AOM_CDF7(741, 7580, 9318, 10345, 16688, 29046) }, + { AOM_CDF7(110, 7406, 7915, 9195, 16041, 23329) }, + { AOM_CDF7(363, 7974, 9357, 10673, 15629, 24474) }, + { AOM_CDF7(153, 7647, 8112, 9936, 15307, 19996) }, + { AOM_CDF7(3511, 6332, 11165, 15335, 19323, 23594) }, + }, + { + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + }, + { + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + }, + }, + { + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { 
AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + { + { AOM_CDF5(1127, 12814, 22772, 27483) }, + { AOM_CDF5(145, 6761, 11980, 26667) }, + { AOM_CDF5(362, 5887, 11678, 16725) }, + { AOM_CDF5(385, 15213, 18587, 30693) }, + { AOM_CDF5(25, 2914, 23134, 27903) }, + { AOM_CDF5(60, 4470, 11749, 23991) }, + { AOM_CDF5(37, 3332, 14511, 21448) }, + { AOM_CDF5(157, 6320, 13036, 17439) }, + { AOM_CDF5(119, 6719, 12906, 29396) }, + { AOM_CDF5(47, 5537, 12576, 21499) }, + { AOM_CDF5(269, 6076, 11258, 23115) }, + { AOM_CDF5(83, 5615, 12001, 17228) }, + { AOM_CDF5(1968, 5556, 12023, 18547) }, + }, + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 
19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + }, + }; + +static const aom_cdf_prob + default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE( + TX_TYPES)] = { + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { AOM_CDF16(4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504, + 22848, 23934, 25474, 27727, 28915, 30631) }, + { AOM_CDF16(1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674, + 20408, 22517, 25010, 27116, 28856, 30749) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + }, + { + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + { AOM_CDF12(770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595, + 28526, 30529) }, + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(4167) }, + { AOM_CDF2(1998) }, + { AOM_CDF2(748) }, + }, + }; + +static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = { + AOM_CDF8(1418, 2123, 13340, 18405, 26972, 28343, 32294) +}; + +static const aom_cdf_prob + default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = { + { AOM_CDF16(7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700, + 32704, 32708, 32712, 32716, 32720, 32724) }, + { AOM_CDF16(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, 32620, + 32647, 32668, 32672, 32676, 32680, 32684) }, + { AOM_CDF16(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, 32673, + 32677, 32681, 32685, 32689, 32693, 32697) }, + { AOM_CDF16(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, 32708, + 32712, 32716, 32720, 32724, 32728, 32732) }, + { 
AOM_CDF16(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, 32394, + 32464, 32516, 32560, 32576, 32593, 32622) }, + { AOM_CDF16(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144, + 32413, 32520, 32594, 32622, 32656, 32660) } + }; + +static const aom_cdf_prob + default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE( + SWITCHABLE_FILTERS)] = { + { AOM_CDF3(31935, 32720) }, { AOM_CDF3(5568, 32719) }, + { AOM_CDF3(422, 2938) }, { AOM_CDF3(28244, 32608) }, + { AOM_CDF3(31206, 31953) }, { AOM_CDF3(4862, 32121) }, + { AOM_CDF3(770, 1152) }, { AOM_CDF3(20889, 25637) }, + { AOM_CDF3(31910, 32724) }, { AOM_CDF3(4120, 32712) }, + { AOM_CDF3(305, 2247) }, { AOM_CDF3(27403, 32636) }, + { AOM_CDF3(31022, 32009) }, { AOM_CDF3(2963, 32093) }, + { AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) } + }; + +static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) }, + { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } }; + +static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } }; + +static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) }, + { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } }; + +static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) } +}; + +static const aom_cdf_prob + default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE( + INTER_COMPOUND_MODES)] = { + { AOM_CDF8(7760, 13823, 15808, 17641, 19156, 20666, 26891) }, + { AOM_CDF8(10730, 19452, 21145, 22749, 24039, 25131, 28724) }, + { AOM_CDF8(10664, 20221, 21588, 22906, 24295, 25387, 28436) }, + { AOM_CDF8(13298, 16984, 20471, 24182, 25067, 25736, 26422) }, + { AOM_CDF8(18904, 23325, 25242, 27432, 27898, 28258, 30758) }, + { 
AOM_CDF8(10725, 17454, 20124, 22820, 24195, 25168, 26046) }, + { AOM_CDF8(17125, 24273, 25814, 27492, 28214, 28704, 30592) }, + { AOM_CDF8(13046, 23214, 24505, 25942, 27435, 28442, 29330) } + }; + +static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + 2)] = { { AOM_CDF2(16384) }, + { AOM_CDF2(26887) }, + { AOM_CDF2(27597) }, + { AOM_CDF2(30237) } }; + +static const aom_cdf_prob + default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + INTERINTRA_MODES)] = { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(1875, 11082, 27332) }, + { AOM_CDF4(2473, 9996, 26388) }, + { AOM_CDF4(4238, 11537, 25926) } }; + +static const aom_cdf_prob + default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(20036) }, { AOM_CDF2(24957) }, { AOM_CDF2(26704) }, + { AOM_CDF2(27530) }, { AOM_CDF2(29564) }, { AOM_CDF2(29444) }, + { AOM_CDF2(26872) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } + }; + +static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MASKED_COMPOUND_TYPES)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, + { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, + { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } +}; + +static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + 16)] = { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 
30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, + 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, + { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, + 17367, 18452, 19422, 22839, 26127, 29629) }, + { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, + 21332, 24520, 27470, 29456, 30529, 31656) }, + { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, + 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, + { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, + 16730, 18114, 19313, 22521, 26012, 29550) }, + { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, + 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, + { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, + 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 
22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, + 24284, 24985, 25684, 27259, 28883, 30911) }, + { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, + 25057, 27251, 29173, 30089, 30960, 31933) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) } }; + +static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, { AOM_CDF3(7651, 24760) }, + { AOM_CDF3(4738, 24765) }, { AOM_CDF3(5391, 25528) }, + { AOM_CDF3(19419, 26810) }, { AOM_CDF3(5123, 23606) }, + { AOM_CDF3(11606, 24308) }, { AOM_CDF3(26260, 29116) }, + { AOM_CDF3(20360, 28062) }, { AOM_CDF3(21679, 26830) }, + { AOM_CDF3(29516, 30701) }, { AOM_CDF3(28898, 30397) }, + { AOM_CDF3(30878, 31335) }, { AOM_CDF3(32507, 32558) }, + { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(28799, 31390) }, { AOM_CDF3(26431, 30774) }, + { AOM_CDF3(28973, 31594) }, { AOM_CDF3(29742, 31203) } }; + +static const aom_cdf_prob default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(10437) }, { AOM_CDF2(9371) }, { AOM_CDF2(9301) }, + { AOM_CDF2(17432) }, { AOM_CDF2(14423) }, { AOM_CDF2(15142) }, + { AOM_CDF2(25817) }, { AOM_CDF2(22823) }, { AOM_CDF2(22083) }, + { AOM_CDF2(30128) }, { AOM_CDF2(31014) }, { AOM_CDF2(31560) }, + { AOM_CDF2(32638) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23664) }, { AOM_CDF2(20901) }, { AOM_CDF2(24008) }, + { AOM_CDF2(26879) } +}; + +static const aom_cdf_prob default_intra_inter_cdf[INTRA_INTER_CONTEXTS] + [CDF_SIZE(2)] = { + { AOM_CDF2(806) }, + { AOM_CDF2(16662) }, + { AOM_CDF2(20186) }, + { AOM_CDF2(26538) } + }; + 
+static const aom_cdf_prob default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(26828) }, + { AOM_CDF2(24035) }, + { AOM_CDF2(12031) }, + { AOM_CDF2(10640) }, + { AOM_CDF2(2901) } }; + +static const aom_cdf_prob default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS] + [CDF_SIZE(2)] = { + { AOM_CDF2(1198) }, + { AOM_CDF2(2070) }, + { AOM_CDF2(9166) }, + { AOM_CDF2(7499) }, + { AOM_CDF2(22475) } + }; + +static const aom_cdf_prob + default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - + 1][CDF_SIZE(2)] = { + { { AOM_CDF2(5284) }, { AOM_CDF2(3865) }, { AOM_CDF2(3128) } }, + { { AOM_CDF2(23152) }, { AOM_CDF2(14173) }, { AOM_CDF2(15270) } }, + { { AOM_CDF2(31774) }, { AOM_CDF2(25120) }, { AOM_CDF2(26710) } } + }; + +static const aom_cdf_prob default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1] + [CDF_SIZE(2)] = { + { { AOM_CDF2(4897) }, + { AOM_CDF2(1555) }, + { AOM_CDF2(4236) }, + { AOM_CDF2(8650) }, + { AOM_CDF2(904) }, + { AOM_CDF2(1444) } }, + { { AOM_CDF2(16973) }, + { AOM_CDF2(16751) }, + { AOM_CDF2(19647) }, + { AOM_CDF2(24773) }, + { AOM_CDF2(11014) }, + { AOM_CDF2(15087) } }, + { { AOM_CDF2(29744) }, + { AOM_CDF2(30279) }, + { AOM_CDF2(31194) }, + { AOM_CDF2(31895) }, + { AOM_CDF2(26875) }, + { AOM_CDF2(30304) } } + }; + +static const aom_cdf_prob + default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)] = { + { { AOM_CDF2(4946) }, { AOM_CDF2(9468) }, { AOM_CDF2(1503) } }, + { { AOM_CDF2(19891) }, { AOM_CDF2(22441) }, { AOM_CDF2(15160) } }, + { { AOM_CDF2(30731) }, { AOM_CDF2(31059) }, { AOM_CDF2(27544) } } + }; + +static const aom_cdf_prob + default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)] = { + { { AOM_CDF2(2235) }, { AOM_CDF2(1423) } }, + { { AOM_CDF2(17182) }, { AOM_CDF2(15175) } }, + { { AOM_CDF2(30606) }, { AOM_CDF2(30489) } } + }; + +static const aom_cdf_prob + default_palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { + { AOM_CDF7(7952, 13000, 18149, 21478, 25527, 29241) }, + { 
AOM_CDF7(7139, 11421, 16195, 19544, 23666, 28073) }, + { AOM_CDF7(7788, 12741, 17325, 20500, 24315, 28530) }, + { AOM_CDF7(8271, 14064, 18246, 21564, 25071, 28533) }, + { AOM_CDF7(12725, 19180, 21863, 24839, 27535, 30120) }, + { AOM_CDF7(9711, 14888, 16923, 21052, 25661, 27875) }, + { AOM_CDF7(14940, 20797, 21678, 24186, 27033, 28999) } + }; + +static const aom_cdf_prob + default_palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { + { AOM_CDF7(8713, 19979, 27128, 29609, 31331, 32272) }, + { AOM_CDF7(5839, 15573, 23581, 26947, 29848, 31700) }, + { AOM_CDF7(4426, 11260, 17999, 21483, 25863, 29430) }, + { AOM_CDF7(3228, 9464, 14993, 18089, 22523, 27420) }, + { AOM_CDF7(3768, 8886, 13091, 17852, 22495, 27207) }, + { AOM_CDF7(2464, 8451, 12861, 21632, 25525, 28555) }, + { AOM_CDF7(1269, 5435, 10433, 18963, 21700, 25865) } + }; + +static const aom_cdf_prob default_palette_y_mode_cdf + [PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][CDF_SIZE(2)] = { + { { AOM_CDF2(31676) }, { AOM_CDF2(3419) }, { AOM_CDF2(1261) } }, + { { AOM_CDF2(31912) }, { AOM_CDF2(2859) }, { AOM_CDF2(980) } }, + { { AOM_CDF2(31823) }, { AOM_CDF2(3400) }, { AOM_CDF2(781) } }, + { { AOM_CDF2(32030) }, { AOM_CDF2(3561) }, { AOM_CDF2(904) } }, + { { AOM_CDF2(32309) }, { AOM_CDF2(7337) }, { AOM_CDF2(1462) } }, + { { AOM_CDF2(32265) }, { AOM_CDF2(4015) }, { AOM_CDF2(1521) } }, + { { AOM_CDF2(32450) }, { AOM_CDF2(7946) }, { AOM_CDF2(129) } } + }; + +static const aom_cdf_prob + default_palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(32461) }, { AOM_CDF2(21488) } + }; + +static const aom_cdf_prob default_palette_y_color_index_cdf + [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { + { + { AOM_CDF2(28710) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(10553) }, + { AOM_CDF2(27036) }, + { AOM_CDF2(31603) }, + }, + { + { AOM_CDF3(27877, 30490) }, + { AOM_CDF3(11532, 25697) }, + { AOM_CDF3(6544, 30234) }, + { AOM_CDF3(23018, 28072) }, + { AOM_CDF3(31915, 
32385) }, + }, + { + { AOM_CDF4(25572, 28046, 30045) }, + { AOM_CDF4(9478, 21590, 27256) }, + { AOM_CDF4(7248, 26837, 29824) }, + { AOM_CDF4(19167, 24486, 28349) }, + { AOM_CDF4(31400, 31825, 32250) }, + }, + { + { AOM_CDF5(24779, 26955, 28576, 30282) }, + { AOM_CDF5(8669, 20364, 24073, 28093) }, + { AOM_CDF5(4255, 27565, 29377, 31067) }, + { AOM_CDF5(19864, 23674, 26716, 29530) }, + { AOM_CDF5(31646, 31893, 32147, 32426) }, + }, + { + { AOM_CDF6(23132, 25407, 26970, 28435, 30073) }, + { AOM_CDF6(7443, 17242, 20717, 24762, 27982) }, + { AOM_CDF6(6300, 24862, 26944, 28784, 30671) }, + { AOM_CDF6(18916, 22895, 25267, 27435, 29652) }, + { AOM_CDF6(31270, 31550, 31808, 32059, 32353) }, + }, + { + { AOM_CDF7(23105, 25199, 26464, 27684, 28931, 30318) }, + { AOM_CDF7(6950, 15447, 18952, 22681, 25567, 28563) }, + { AOM_CDF7(7560, 23474, 25490, 27203, 28921, 30708) }, + { AOM_CDF7(18544, 22373, 24457, 26195, 28119, 30045) }, + { AOM_CDF7(31198, 31451, 31670, 31882, 32123, 32391) }, + }, + { + { AOM_CDF8(21689, 23883, 25163, 26352, 27506, 28827, 30195) }, + { AOM_CDF8(6892, 15385, 17840, 21606, 24287, 26753, 29204) }, + { AOM_CDF8(5651, 23182, 25042, 26518, 27982, 29392, 30900) }, + { AOM_CDF8(19349, 22578, 24418, 25994, 27524, 29031, 30448) }, + { AOM_CDF8(31028, 31270, 31504, 31705, 31927, 32153, 32392) }, + }, + }; + +static const aom_cdf_prob default_palette_uv_color_index_cdf + [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { + { + { AOM_CDF2(29089) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(8713) }, + { AOM_CDF2(29257) }, + { AOM_CDF2(31610) }, + }, + { + { AOM_CDF3(25257, 29145) }, + { AOM_CDF3(12287, 27293) }, + { AOM_CDF3(7033, 27960) }, + { AOM_CDF3(20145, 25405) }, + { AOM_CDF3(30608, 31639) }, + }, + { + { AOM_CDF4(24210, 27175, 29903) }, + { AOM_CDF4(9888, 22386, 27214) }, + { AOM_CDF4(5901, 26053, 29293) }, + { AOM_CDF4(18318, 22152, 28333) }, + { AOM_CDF4(30459, 31136, 31926) }, + }, + { + { AOM_CDF5(22980, 25479, 27781, 29986) }, + { 
AOM_CDF5(8413, 21408, 24859, 28874) }, + { AOM_CDF5(2257, 29449, 30594, 31598) }, + { AOM_CDF5(19189, 21202, 25915, 28620) }, + { AOM_CDF5(31844, 32044, 32281, 32518) }, + }, + { + { AOM_CDF6(22217, 24567, 26637, 28683, 30548) }, + { AOM_CDF6(7307, 16406, 19636, 24632, 28424) }, + { AOM_CDF6(4441, 25064, 26879, 28942, 30919) }, + { AOM_CDF6(17210, 20528, 23319, 26750, 29582) }, + { AOM_CDF6(30674, 30953, 31396, 31735, 32207) }, + }, + { + { AOM_CDF7(21239, 23168, 25044, 26962, 28705, 30506) }, + { AOM_CDF7(6545, 15012, 18004, 21817, 25503, 28701) }, + { AOM_CDF7(3448, 26295, 27437, 28704, 30126, 31442) }, + { AOM_CDF7(15889, 18323, 21704, 24698, 26976, 29690) }, + { AOM_CDF7(30988, 31204, 31479, 31734, 31983, 32325) }, + }, + { + { AOM_CDF8(21442, 23288, 24758, 26246, 27649, 28980, 30563) }, + { AOM_CDF8(5863, 14933, 17552, 20668, 23683, 26411, 29273) }, + { AOM_CDF8(3415, 25810, 26877, 27990, 29223, 30394, 31618) }, + { AOM_CDF8(17965, 20084, 22232, 23974, 26274, 28402, 30390) }, + { AOM_CDF8(31190, 31329, 31516, 31679, 31825, 32026, 32322) }, + }, + }; + +static const aom_cdf_prob + default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(28581) }, { AOM_CDF2(23846) }, { AOM_CDF2(20847) }, + { AOM_CDF2(24315) }, { AOM_CDF2(18196) }, { AOM_CDF2(12133) }, + { AOM_CDF2(18791) }, { AOM_CDF2(10887) }, { AOM_CDF2(11005) }, + { AOM_CDF2(27179) }, { AOM_CDF2(20004) }, { AOM_CDF2(11281) }, + { AOM_CDF2(26549) }, { AOM_CDF2(19308) }, { AOM_CDF2(14224) }, + { AOM_CDF2(28015) }, { AOM_CDF2(21546) }, { AOM_CDF2(14400) }, + { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) } + }; + +static const aom_cdf_prob default_skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) } +}; + +static const aom_cdf_prob default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(32621) }, { AOM_CDF2(20708) }, { AOM_CDF2(8127) } }; + +static const aom_cdf_prob + 
default_compound_idx_cdfs[COMP_INDEX_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(18244) }, { AOM_CDF2(12865) }, { AOM_CDF2(7053) }, + { AOM_CDF2(13259) }, { AOM_CDF2(9334) }, { AOM_CDF2(4644) } + }; + +static const aom_cdf_prob + default_comp_group_idx_cdfs[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(26607) }, { AOM_CDF2(22891) }, { AOM_CDF2(18840) }, + { AOM_CDF2(24594) }, { AOM_CDF2(19934) }, { AOM_CDF2(22674) } + }; + +static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 30531) }; + +static const aom_cdf_prob default_filter_intra_mode_cdf[CDF_SIZE( + FILTER_INTRA_MODES)] = { AOM_CDF5(8949, 12776, 17211, 29558) }; + +static const aom_cdf_prob default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE( + 2)] = { { AOM_CDF2(4621) }, { AOM_CDF2(6743) }, { AOM_CDF2(5893) }, + { AOM_CDF2(7866) }, { AOM_CDF2(12551) }, { AOM_CDF2(9394) }, + { AOM_CDF2(12408) }, { AOM_CDF2(14301) }, { AOM_CDF2(12756) }, + { AOM_CDF2(22343) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(12770) }, { AOM_CDF2(10368) }, + { AOM_CDF2(20229) }, { AOM_CDF2(18101) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }; + +static const aom_cdf_prob default_switchable_restore_cdf[CDF_SIZE( + RESTORE_SWITCHABLE_TYPES)] = { AOM_CDF3(9413, 22581) }; + +static const aom_cdf_prob default_wiener_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 11570) }; + +static const aom_cdf_prob default_sgrproj_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 16855) }; + +static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = { + AOM_CDF4(28160, 32120, 32677) +}; + +static const aom_cdf_prob default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE( + DELTA_LF_PROBS + 1)] = { { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) } }; +static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = { + 
AOM_CDF4(28160, 32120, 32677) +}; + +static const aom_cdf_prob + default_segment_pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)] = { + { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) } + }; + +static const aom_cdf_prob + default_spatial_pred_seg_tree_cdf[SPATIAL_PREDICTION_PROBS][CDF_SIZE( + MAX_SEGMENTS)] = { + { + AOM_CDF8(5622, 7893, 16093, 18233, 27809, 28373, 32533), + }, + { + AOM_CDF8(14274, 18230, 22557, 24935, 29980, 30851, 32344), + }, + { + AOM_CDF8(27527, 28487, 28723, 28890, 32397, 32647, 32679), + }, + }; + +static const aom_cdf_prob default_tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] + [CDF_SIZE(MAX_TX_DEPTH + 1)] = { + { { AOM_CDF2(19968) }, + { AOM_CDF2(19968) }, + { AOM_CDF2(24320) } }, + { { AOM_CDF3(12272, 30172) }, + { AOM_CDF3(12272, 30172) }, + { AOM_CDF3(18677, 30848) } }, + { { AOM_CDF3(12986, 15180) }, + { AOM_CDF3(12986, 15180) }, + { AOM_CDF3(24302, 25602) } }, + { { AOM_CDF3(5782, 11475) }, + { AOM_CDF3(5782, 11475) }, + { AOM_CDF3(16803, 22759) } }, + }; + +// Negative values are invalid +const int av1_palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + 1] = { + -1, -1, 0, -1, -1, 4, 3, 2, 1 +}; + +int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, + int r, int c, int palette_size, + uint8_t *color_order, int *color_idx) { + assert(palette_size <= PALETTE_MAX_SIZE); + assert(r > 0 || c > 0); + + // Get color indices of neighbors. + int color_neighbors[NUM_PALETTE_NEIGHBORS]; + color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1; + color_neighbors[1] = + (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1; + color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1; + + // The +10 below should not be needed. 
But we get a warning "array subscript + // is above array bounds [-Werror=array-bounds]" without it, possibly due to + // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124 + int scores[PALETTE_MAX_SIZE + 10] = { 0 }; + int i; + static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 }; + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + if (color_neighbors[i] >= 0) { + scores[color_neighbors[i]] += weights[i]; + } + } + + int inverse_color_order[PALETTE_MAX_SIZE]; + for (i = 0; i < PALETTE_MAX_SIZE; ++i) { + color_order[i] = i; + inverse_color_order[i] = i; + } + + // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small). + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + int max = scores[i]; + int max_idx = i; + for (int j = i + 1; j < palette_size; ++j) { + if (scores[j] > max) { + max = scores[j]; + max_idx = j; + } + } + if (max_idx != i) { + // Move the score at index 'max_idx' to index 'i', and shift the scores + // from 'i' to 'max_idx - 1' by 1. + const int max_score = scores[max_idx]; + const uint8_t max_color_order = color_order[max_idx]; + for (int k = max_idx; k > i; --k) { + scores[k] = scores[k - 1]; + color_order[k] = color_order[k - 1]; + inverse_color_order[color_order[k]] = k; + } + scores[i] = max_score; + color_order[i] = max_color_order; + inverse_color_order[color_order[i]] = i; + } + } + + if (color_idx != NULL) + *color_idx = inverse_color_order[color_map[r * stride + c]]; + + // Get hash value of context. + int color_index_ctx_hash = 0; + static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 }; + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + color_index_ctx_hash += scores[i] * hash_multipliers[i]; + } + assert(color_index_ctx_hash > 0); + assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH); + + // Lookup context from hash. 
+ const int color_index_ctx = + av1_palette_color_index_context_lookup[color_index_ctx_hash]; + assert(color_index_ctx >= 0); + assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS); + return color_index_ctx; +} + +void av1_init_mode_probs(FRAME_CONTEXT *fc) { + av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf); + av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf); + av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf); + av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf); + av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf); + av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf); + av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf); + av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf); + av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf); + av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf); + av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf); + av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf); + av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf); + av1_copy(fc->single_ref_cdf, default_single_ref_cdf); + av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf); + av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs); + av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs); + av1_copy(fc->newmv_cdf, default_newmv_cdf); + av1_copy(fc->zeromv_cdf, default_zeromv_cdf); + av1_copy(fc->refmv_cdf, default_refmv_cdf); + av1_copy(fc->drl_cdf, default_drl_cdf); + av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf); + av1_copy(fc->obmc_cdf, default_obmc_cdf); + av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf); + av1_copy(fc->compound_type_cdf, default_compound_type_cdf); + av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf); + av1_copy(fc->interintra_cdf, default_interintra_cdf); + av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf); + av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf); + 
av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf); + av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs); + av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf); + av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf); + av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf); + av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf); + av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf); + av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf); + av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf); + av1_copy(fc->partition_cdf, default_partition_cdf); + av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf); + av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf); + av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs); + av1_copy(fc->skip_txfm_cdfs, default_skip_txfm_cdfs); + av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf); + for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++) + av1_copy(fc->seg.spatial_pred_seg_cdf[i], + default_spatial_pred_seg_tree_cdf[i]); + av1_copy(fc->tx_size_cdf, default_tx_size_cdf); + av1_copy(fc->delta_q_cdf, default_delta_q_cdf); + av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf); + av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf); + av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf); + av1_copy(fc->cfl_alpha_cdf, default_cfl_alpha_cdf); + av1_copy(fc->intrabc_cdf, default_intrabc_cdf); +} + +void av1_set_default_ref_deltas(int8_t *ref_deltas) { + assert(ref_deltas != NULL); + + ref_deltas[INTRA_FRAME] = 1; + ref_deltas[LAST_FRAME] = 0; + ref_deltas[LAST2_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[LAST3_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[BWDREF_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[GOLDEN_FRAME] = -1; + ref_deltas[ALTREF2_FRAME] = -1; + ref_deltas[ALTREF_FRAME] = -1; +} + +void av1_set_default_mode_deltas(int8_t *mode_deltas) { + assert(mode_deltas != NULL); + + mode_deltas[0] = 0; + mode_deltas[1] = 0; +} + +static void 
set_default_lf_deltas(struct loopfilter *lf) { + lf->mode_ref_delta_enabled = 1; + lf->mode_ref_delta_update = 1; + + av1_set_default_ref_deltas(lf->ref_deltas); + av1_set_default_mode_deltas(lf->mode_deltas); +} + +void av1_setup_frame_contexts(AV1_COMMON *cm) { + // Store the frame context into a special slot (not associated with any + // reference buffer), so that we can set up cm->pre_fc correctly later + // This function must ONLY be called when cm->fc has been initialized with + // default probs, either by av1_setup_past_independence or after manually + // initializing them + *cm->default_frame_context = *cm->fc; + // TODO(jack.haughton@argondesign.com): don't think this should be necessary, + // but could do with fuller testing + if (cm->tiles.large_scale) { + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + RefCntBuffer *const buf = get_ref_frame_buf(cm, i); + if (buf != NULL) buf->frame_context = *cm->fc; + } + for (int i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) + cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc; + } +} + +void av1_setup_past_independence(AV1_COMMON *cm) { + // Reset the segment feature data to the default stats: + // Features disabled, 0, with delta coding (Default state). 
+ av1_clearall_segfeatures(&cm->seg); + + if (cm->cur_frame->seg_map) { + memset(cm->cur_frame->seg_map, 0, + (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols)); + } + + // reset mode ref deltas + av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); + av1_set_default_mode_deltas(cm->cur_frame->mode_deltas); + set_default_lf_deltas(&cm->lf); + + av1_default_coef_probs(cm); + av1_init_mode_probs(cm->fc); + av1_init_mv_probs(cm); + cm->fc->initialized = 1; + av1_setup_frame_contexts(cm); +} diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h new file mode 100644 index 0000000000..09cd6bd1e9 --- /dev/null +++ b/third_party/aom/av1/common/entropymode.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_ +#define AOM_AV1_COMMON_ENTROPYMODE_H_ + +#include "av1/common/entropy.h" +#include "av1/common/entropymv.h" +#include "av1/common/filter.h" +#include "av1/common/seg_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_SIZE_GROUPS 4 + +#define TX_SIZE_CONTEXTS 3 + +#define INTER_OFFSET(mode) ((mode)-NEARESTMV) +#define INTER_COMPOUND_OFFSET(mode) (uint8_t)((mode)-NEAREST_NEARESTMV) + +// Number of possible contexts for a color index. +// As can be seen from av1_get_palette_color_index_context(), the possible +// contexts are (2,0,0), (2,2,1), (3,2,0), (4,1,0), (5,0,0). These are mapped to +// a value from 0 to 4 using 'av1_palette_color_index_context_lookup' table. 
+#define PALETTE_COLOR_INDEX_CONTEXTS 5 + +// Palette Y mode context for a block is determined by number of neighboring +// blocks (top and/or left) using a palette for Y plane. So, possible Y mode +// context values are: +// 0 if neither left nor top block uses palette for Y plane, +// 1 if exactly one of left or top block uses palette for Y plane, and +// 2 if both left and top blocks use palette for Y plane. +#define PALETTE_Y_MODE_CONTEXTS 3 + +// Palette UV mode context for a block is determined by whether this block uses +// palette for the Y plane. So, possible values are: +// 0 if this block doesn't use palette for Y plane. +// 1 if this block uses palette for Y plane (i.e. Y palette size > 0). +#define PALETTE_UV_MODE_CONTEXTS 2 + +// Map the number of pixels in a block size to a context +// 64(BLOCK_8X8, BLOCK_4X16, BLOCK_16X4) -> 0 +// 128(BLOCK_8X16, BLOCK_16X8) -> 1 +// ... +// 4096(BLOCK_64X64) -> 6 +#define PALATTE_BSIZE_CTXS 7 + +#define MAX_COLOR_CONTEXT_HASH 8 + +#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top. 
+ +#define KF_MODE_CONTEXTS 5 + +struct AV1Common; + +typedef struct { + const int16_t *scan; + const int16_t *iscan; +} SCAN_ORDER; + +typedef struct frame_contexts { + aom_cdf_prob txb_skip_cdf[TX_SIZES][TXB_SKIP_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob eob_extra_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] + [CDF_SIZE(2)]; + aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob eob_flag_cdf16[PLANE_TYPES][2][CDF_SIZE(5)]; + aom_cdf_prob eob_flag_cdf32[PLANE_TYPES][2][CDF_SIZE(6)]; + aom_cdf_prob eob_flag_cdf64[PLANE_TYPES][2][CDF_SIZE(7)]; + aom_cdf_prob eob_flag_cdf128[PLANE_TYPES][2][CDF_SIZE(8)]; + aom_cdf_prob eob_flag_cdf256[PLANE_TYPES][2][CDF_SIZE(9)]; + aom_cdf_prob eob_flag_cdf512[PLANE_TYPES][2][CDF_SIZE(10)]; + aom_cdf_prob eob_flag_cdf1024[PLANE_TYPES][2][CDF_SIZE(11)]; + aom_cdf_prob coeff_base_eob_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB] + [CDF_SIZE(3)]; + aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] + [CDF_SIZE(4)]; + aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] + [CDF_SIZE(BR_CDF_SIZE)]; + + aom_cdf_prob newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)]; + + aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS] + [CDF_SIZE(INTER_COMPOUND_MODES)]; + aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL] + [CDF_SIZE(MASKED_COMPOUND_TYPES)]; + aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]; + aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]; + aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob interintra_mode_cdf[BLOCK_SIZE_GROUPS] + [CDF_SIZE(INTERINTRA_MODES)]; + aom_cdf_prob motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)]; + aom_cdf_prob obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob 
palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; + aom_cdf_prob palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; + aom_cdf_prob palette_y_color_index_cdf[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; + aom_cdf_prob palette_uv_color_index_cdf[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; + aom_cdf_prob palette_y_mode_cdf[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS] + [CDF_SIZE(2)]; + aom_cdf_prob palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] + [CDF_SIZE(2)]; + aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)]; + nmv_context nmvc; + nmv_context ndvc; + aom_cdf_prob intrabc_cdf[CDF_SIZE(2)]; + struct segmentation_probs seg; + aom_cdf_prob filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)]; + aom_cdf_prob switchable_restore_cdf[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)]; + aom_cdf_prob wiener_restore_cdf[CDF_SIZE(2)]; + aom_cdf_prob sgrproj_restore_cdf[CDF_SIZE(2)]; + aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)]; + aom_cdf_prob uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES] + [CDF_SIZE(UV_INTRA_MODES)]; + aom_cdf_prob 
partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)]; + aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS] + [CDF_SIZE(SWITCHABLE_FILTERS)]; + /* kf_y_cdf is discarded after use, so does not require persistent storage. + However, we keep it with the other CDFs in this struct since it needs to + be copied to each tile to support parallelism just like the others. + */ + aom_cdf_prob kf_y_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS] + [CDF_SIZE(INTRA_MODES)]; + + aom_cdf_prob angle_delta_cdf[DIRECTIONAL_MODES] + [CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)]; + + aom_cdf_prob tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] + [CDF_SIZE(MAX_TX_DEPTH + 1)]; + aom_cdf_prob delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)]; + aom_cdf_prob delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)]; + aom_cdf_prob delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)]; + aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [CDF_SIZE(TX_TYPES)]; + aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES] + [CDF_SIZE(TX_TYPES)]; + aom_cdf_prob cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)]; + aom_cdf_prob cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)]; + int initialized; +} FRAME_CONTEXT; + +static const int av1_ext_tx_ind[EXT_TX_SET_TYPES][TX_TYPES] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0 }, + { 3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0 }, + { 7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6 }, +}; + +static const int av1_ext_tx_inv[EXT_TX_SET_TYPES][TX_TYPES] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0 }, + { 9, 10, 11, 12, 13, 14, 15, 
0, 1, 2, 4, 5, 3, 6, 7, 8 }, +}; + +void av1_set_default_ref_deltas(int8_t *ref_deltas); +void av1_set_default_mode_deltas(int8_t *mode_deltas); +void av1_setup_frame_contexts(struct AV1Common *cm); +void av1_setup_past_independence(struct AV1Common *cm); + +// Returns (int)ceil(log2(n)). +static INLINE int av1_ceil_log2(int n) { + if (n < 2) return 0; + int i = 1; + unsigned int p = 2; + while (p < (unsigned int)n) { + i++; + p = p << 1; + } + return i; +} + +// Returns the context for palette color index at row 'r' and column 'c', +// along with the 'color_order' of neighbors and the 'color_idx'. +// The 'color_map' is a 2D array with the given 'stride'. +int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, + int r, int c, int palette_size, + uint8_t *color_order, int *color_idx); + +extern const int + av1_palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + 1]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPYMODE_H_ diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c new file mode 100644 index 0000000000..e1e42f2f18 --- /dev/null +++ b/third_party/aom/av1/common/entropymv.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/entropymv.h" + +static const nmv_context default_nmv_context = { + { AOM_CDF4(4096, 11264, 19328) }, // joints_cdf + { { + // Vertical component + { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, + 32762, 32767) }, // classes_cdf + { { AOM_CDF4(16384, 24576, 26624) }, + { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf + { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf + { AOM_CDF2(128 * 128) }, // sign_cdf + { AOM_CDF2(160 * 128) }, // class0_hp_cdf + { AOM_CDF2(128 * 128) }, // hp_cdf + { AOM_CDF2(216 * 128) }, // class0_cdf + { { AOM_CDF2(128 * 136) }, + { AOM_CDF2(128 * 140) }, + { AOM_CDF2(128 * 148) }, + { AOM_CDF2(128 * 160) }, + { AOM_CDF2(128 * 176) }, + { AOM_CDF2(128 * 192) }, + { AOM_CDF2(128 * 224) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 240) } }, // bits_cdf + }, + { + // Horizontal component + { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, + 32762, 32767) }, // classes_cdf + { { AOM_CDF4(16384, 24576, 26624) }, + { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf + { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf + { AOM_CDF2(128 * 128) }, // sign_cdf + { AOM_CDF2(160 * 128) }, // class0_hp_cdf + { AOM_CDF2(128 * 128) }, // hp_cdf + { AOM_CDF2(216 * 128) }, // class0_cdf + { { AOM_CDF2(128 * 136) }, + { AOM_CDF2(128 * 140) }, + { AOM_CDF2(128 * 148) }, + { AOM_CDF2(128 * 160) }, + { AOM_CDF2(128 * 176) }, + { AOM_CDF2(128 * 192) }, + { AOM_CDF2(128 * 224) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 240) } }, // bits_cdf + } }, +}; + +void av1_init_mv_probs(AV1_COMMON *cm) { + // NB: this sets CDFs too + cm->fc->nmvc = default_nmv_context; + cm->fc->ndvc = default_nmv_context; +} diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h new file mode 100644 index 0000000000..cddc80768c --- /dev/null +++ 
b/third_party/aom/av1/common/entropymv.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ENTROPYMV_H_ +#define AOM_AV1_COMMON_ENTROPYMV_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/prob.h" + +#include "av1/common/mv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; + +void av1_init_mv_probs(struct AV1Common *cm); + +#define MV_UPDATE_PROB 252 + +/* Symbols for coding which components are zero jointly */ +#define MV_JOINTS 4 +enum { + MV_JOINT_ZERO = 0, /* Zero vector */ + MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ + MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ + MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ +} UENUM1BYTE(MV_JOINT_TYPE); + +static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) { + return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; +} + +static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { + return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ; +} + +/* Symbols for coding magnitude class of nonzero components */ +#define MV_CLASSES 11 +enum { + MV_CLASS_0 = 0, /* (0, 2] integer pel */ + MV_CLASS_1 = 1, /* (2, 4] integer pel */ + MV_CLASS_2 = 2, /* (4, 8] integer pel */ + MV_CLASS_3 = 3, /* (8, 16] integer pel */ + MV_CLASS_4 = 4, /* (16, 32] integer pel */ + MV_CLASS_5 = 5, /* (32, 64] integer pel */ + MV_CLASS_6 = 6, /* (64, 128] integer pel */ + MV_CLASS_7 = 7, /* (128, 256] integer pel */ + MV_CLASS_8 = 8, /* (256, 512] integer pel */ + MV_CLASS_9 = 9, /* 
(512, 1024] integer pel */ + MV_CLASS_10 = 10, /* (1024,2048] integer pel */ +} UENUM1BYTE(MV_CLASS_TYPE); + +#define CLASS0_BITS 1 /* bits at integer precision for class 0 */ +#define CLASS0_SIZE (1 << CLASS0_BITS) +#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) +#define MV_BITS_CONTEXTS 6 +#define MV_FP_SIZE 4 + +#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) +#define MV_MAX ((1 << MV_MAX_BITS) - 1) +#define MV_VALS ((MV_MAX << 1) + 1) + +#define MV_IN_USE_BITS 14 +#define MV_UPP (1 << MV_IN_USE_BITS) +#define MV_LOW (-(1 << MV_IN_USE_BITS)) + +typedef struct { + aom_cdf_prob classes_cdf[CDF_SIZE(MV_CLASSES)]; + aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][CDF_SIZE(MV_FP_SIZE)]; + aom_cdf_prob fp_cdf[CDF_SIZE(MV_FP_SIZE)]; + aom_cdf_prob sign_cdf[CDF_SIZE(2)]; + aom_cdf_prob class0_hp_cdf[CDF_SIZE(2)]; + aom_cdf_prob hp_cdf[CDF_SIZE(2)]; + aom_cdf_prob class0_cdf[CDF_SIZE(CLASS0_SIZE)]; + aom_cdf_prob bits_cdf[MV_OFFSET_BITS][CDF_SIZE(2)]; +} nmv_component; + +typedef struct { + aom_cdf_prob joints_cdf[CDF_SIZE(MV_JOINTS)]; + nmv_component comps[2]; +} nmv_context; + +enum { + MV_SUBPEL_NONE = -1, + MV_SUBPEL_LOW_PRECISION = 0, + MV_SUBPEL_HIGH_PRECISION, +} SENUM1BYTE(MvSubpelPrecision); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPYMV_H_ diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h new file mode 100644 index 0000000000..b99a138675 --- /dev/null +++ b/third_party/aom/av1/common/enums.h @@ -0,0 +1,651 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ENUMS_H_ +#define AOM_AV1_COMMON_ENUMS_H_ + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! @file */ + +/*!\cond */ + +// Max superblock size +#define MAX_SB_SIZE_LOG2 7 +#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2) +#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE) + +// Min superblock size +#define MIN_SB_SIZE_LOG2 6 + +// Pixels per Mode Info (MI) unit +#define MI_SIZE_LOG2 2 +#define MI_SIZE (1 << MI_SIZE_LOG2) + +// MI-units per max superblock (MI Block - MIB) +#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2) +#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2) + +// MI-units per min superblock +#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2) + +// Mask to extract MI offset within max MIB +#define MAX_MIB_MASK (MAX_MIB_SIZE - 1) + +// Maximum number of tile rows and tile columns +#define MAX_TILE_ROWS 64 +#define MAX_TILE_COLS 64 + +#define MAX_VARTX_DEPTH 2 + +#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2) +#define MI_SIZE_128X128 (128 >> MI_SIZE_LOG2) + +#define MAX_PALETTE_SQUARE (64 * 64) +// Maximum number of colors in a palette. +#define PALETTE_MAX_SIZE 8 +// Minimum number of colors in a palette. +#define PALETTE_MIN_SIZE 2 + +#define FRAME_OFFSET_BITS 5 +#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1) + +// 4 frame filter levels: y plane vertical, y plane horizontal, +// u plane, and v plane +#define FRAME_LF_COUNT 4 +#define DEFAULT_DELTA_LF_MULTI 0 +#define MAX_MODE_LF_DELTAS 2 + +#define DIST_PRECISION_BITS 4 +#define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16 + +#define PROFILE_BITS 3 +// The following three profiles are currently defined. +// Profile 0. 
8-bit and 10-bit 4:2:0 and 4:0:0 only. +// Profile 1. 8-bit and 10-bit 4:4:4 +// Profile 2. 8-bit and 10-bit 4:2:2 +// 12-bit 4:0:0, 4:2:2 and 4:4:4 +// Since we have three bits for the profiles, it can be extended later. +enum { + PROFILE_0, + PROFILE_1, + PROFILE_2, + MAX_PROFILES, +} SENUM1BYTE(BITSTREAM_PROFILE); + +#define OP_POINTS_CNT_MINUS_1_BITS 5 +#define OP_POINTS_IDC_BITS 12 + +// Note: Some enums use the attribute 'packed' to use smallest possible integer +// type, so that we can save memory when they are used in structs/arrays. + +typedef enum ATTRIBUTE_PACKED { + BLOCK_4X4, + BLOCK_4X8, + BLOCK_8X4, + BLOCK_8X8, + BLOCK_8X16, + BLOCK_16X8, + BLOCK_16X16, + BLOCK_16X32, + BLOCK_32X16, + BLOCK_32X32, + BLOCK_32X64, + BLOCK_64X32, + BLOCK_64X64, + BLOCK_64X128, + BLOCK_128X64, + BLOCK_128X128, + BLOCK_4X16, + BLOCK_16X4, + BLOCK_8X32, + BLOCK_32X8, + BLOCK_16X64, + BLOCK_64X16, + BLOCK_SIZES_ALL, + BLOCK_SIZES = BLOCK_4X16, + BLOCK_INVALID = 255, + BLOCK_LARGEST = (BLOCK_SIZES - 1) +} BLOCK_SIZE; + +// 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 +#define SQR_BLOCK_SIZES 6 + +// Partition types. 
R: Recursive +// +// NONE HORZ VERT SPLIT +// +-------+ +-------+ +---+---+ +---+---+ +// | | | | | | | | R | R | +// | | +-------+ | | | +---+---+ +// | | | | | | | | R | R | +// +-------+ +-------+ +---+---+ +---+---+ +// +// HORZ_A HORZ_B VERT_A VERT_B +// +---+---+ +-------+ +---+---+ +---+---+ +// | | | | | | | | | | | +// +---+---+ +---+---+ +---+ | | +---+ +// | | | | | | | | | | | +// +-------+ +---+---+ +---+---+ +---+---+ +// +// HORZ_4 VERT_4 +// +-----+ +-+-+-+ +// +-----+ | | | | +// +-----+ | | | | +// +-----+ +-+-+-+ +enum { + PARTITION_NONE, + PARTITION_HORZ, + PARTITION_VERT, + PARTITION_SPLIT, + PARTITION_HORZ_A, // HORZ split and the top partition is split again + PARTITION_HORZ_B, // HORZ split and the bottom partition is split again + PARTITION_VERT_A, // VERT split and the left partition is split again + PARTITION_VERT_B, // VERT split and the right partition is split again + PARTITION_HORZ_4, // 4:1 horizontal partition + PARTITION_VERT_4, // 4:1 vertical partition + EXT_PARTITION_TYPES, + PARTITION_TYPES = PARTITION_SPLIT + 1, + PARTITION_INVALID = 255 +} UENUM1BYTE(PARTITION_TYPE); + +typedef char PARTITION_CONTEXT; +#define PARTITION_PLOFFSET 4 // number of probability models per block size +#define PARTITION_BLOCK_SIZES 5 +#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET) + +#define TX_SIZE_LUMA_MIN (TX_4X4) +/* We don't need to code a transform size unless the allowed size is at least + one more than the minimum. */ +#define TX_SIZE_CTX_MIN (TX_SIZE_LUMA_MIN + 1) + +// Maximum tx_size categories +#define MAX_TX_CATS (TX_SIZES - TX_SIZE_CTX_MIN) +#define MAX_TX_DEPTH 2 + +#define MAX_TX_SIZE_LOG2 (6) +#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2) +#define MIN_TX_SIZE_LOG2 2 +#define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2) +#define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE) + +// Pad 4 extra columns to remove horizontal availability check. 
+#define TX_PAD_HOR_LOG2 2
+#define TX_PAD_HOR 4
+// Pad 4 extra rows (0 on top and 4 on bottom) to remove vertical availability
+// check.
+#define TX_PAD_TOP 0
+#define TX_PAD_BOTTOM 4
+#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
+// Pad 16 extra bytes to avoid reading overflow in SIMD optimization.
+#define TX_PAD_END 16
+// Size of a padded 32x32 block: (32 + TX_PAD_HOR) columns by
+// (32 + TX_PAD_VER) rows, followed by the TX_PAD_END tail.
+#define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END)
+
+// Number of maximum size transform blocks in the maximum size superblock
+#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
+#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
+
+// frame transform mode
+enum {
+  ONLY_4X4,          // use only 4x4 transform
+  TX_MODE_LARGEST,   // transform size is the largest possible for pu size
+  TX_MODE_SELECT,    // transform specified for each block
+  TX_MODES,
+} UENUM1BYTE(TX_MODE);
+
+// 1D tx types
+enum {
+  DCT_1D,
+  ADST_1D,
+  FLIPADST_1D,
+  IDTX_1D,
+  TX_TYPES_1D,
+} UENUM1BYTE(TX_TYPE_1D);
+
+// Ordered pairs drawn from { REG, SMOOTH, SHARP }. The enumerator value is
+// 3 * first + second with REG = 0, SMOOTH = 1 and SHARP = 2.
+enum {
+  REG_REG,
+  REG_SMOOTH,
+  REG_SHARP,
+  SMOOTH_REG,
+  SMOOTH_SMOOTH,
+  SMOOTH_SHARP,
+  SHARP_REG,
+  SHARP_SMOOTH,
+  SHARP_SHARP,
+} UENUM1BYTE(DUAL_FILTER_TYPE);
+
+#define EXT_TX_SIZES 4       // number of sizes that use extended transforms
+#define EXT_TX_SETS_INTER 4  // Sets of transform selections for INTER
+#define EXT_TX_SETS_INTRA 3  // Sets of transform selections for INTRA
+
+// One flag bit per reference; AOM_REFFRAME_ALL has all seven flag bits set.
+enum {
+  AOM_LAST_FLAG = 1 << 0,
+  AOM_LAST2_FLAG = 1 << 1,
+  AOM_LAST3_FLAG = 1 << 2,
+  AOM_GOLD_FLAG = 1 << 3,
+  AOM_BWD_FLAG = 1 << 4,
+  AOM_ALT2_FLAG = 1 << 5,
+  AOM_ALT_FLAG = 1 << 6,
+  AOM_REFFRAME_ALL = (1 << 7) - 1
+} UENUM1BYTE(AOM_REFFRAME);
+
+enum {
+  UNIDIR_COMP_REFERENCE,
+  BIDIR_COMP_REFERENCE,
+  COMP_REFERENCE_TYPES,
+} UENUM1BYTE(COMP_REFERENCE_TYPE);
+
+enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE);
+
+#define CFL_ALPHABET_SIZE_LOG2 4
+#define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
+#define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1)
+#define CFL_INDEX_ZERO
CFL_ALPHABET_SIZE
+// Split a joint CfL alpha index: the U part is the bits above
+// CFL_ALPHABET_SIZE_LOG2, the V part is the low CFL_ALPHABET_SIZE_LOG2 bits.
+// The argument is parenthesized so that any expression can be passed safely.
+#define CFL_IDX_U(idx) ((idx) >> CFL_ALPHABET_SIZE_LOG2)
+#define CFL_IDX_V(idx) ((idx) & (CFL_ALPHABET_SIZE - 1))
+
+enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE);
+
+enum {
+  CFL_SIGN_ZERO,
+  CFL_SIGN_NEG,
+  CFL_SIGN_POS,
+  CFL_SIGNS
+} UENUM1BYTE(CFL_SIGN_TYPE);
+
+enum {
+  CFL_DISALLOWED,
+  CFL_ALLOWED,
+  CFL_ALLOWED_TYPES
+} UENUM1BYTE(CFL_ALLOWED_TYPE);
+
+// CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid
+#define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1)
+// CFL_SIGN_U is equivalent to (js + 1) / 3 for js in 0 to 8
+#define CFL_SIGN_U(js) ((((js) + 1) * 11) >> 5)
+// CFL_SIGN_V is equivalent to (js + 1) % 3 for js in 0 to 8
+#define CFL_SIGN_V(js) (((js) + 1) - CFL_SIGNS * CFL_SIGN_U(js))
+
+// There is no context when the alpha for a given plane is zero.
+// So there are 2 fewer contexts than joint signs.
+#define CFL_ALPHA_CONTEXTS (CFL_JOINT_SIGNS + 1 - CFL_SIGNS)
+#define CFL_CONTEXT_U(js) ((js) + 1 - CFL_SIGNS)
+// Also, the contexts are symmetric under swapping the planes.
+#define CFL_CONTEXT_V(js) \
+  (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
+
+enum {
+  PALETTE_MAP,
+  COLOR_MAP_TYPES,
+} UENUM1BYTE(COLOR_MAP_TYPE);
+
+// Palette size classes; TWO_COLORS is 0, so the value is (color count - 2).
+enum {
+  TWO_COLORS,
+  THREE_COLORS,
+  FOUR_COLORS,
+  FIVE_COLORS,
+  SIX_COLORS,
+  SEVEN_COLORS,
+  EIGHT_COLORS,
+  PALETTE_SIZES
+} UENUM1BYTE(PALETTE_SIZE);
+
+enum {
+  PALETTE_COLOR_ONE,
+  PALETTE_COLOR_TWO,
+  PALETTE_COLOR_THREE,
+  PALETTE_COLOR_FOUR,
+  PALETTE_COLOR_FIVE,
+  PALETTE_COLOR_SIX,
+  PALETTE_COLOR_SEVEN,
+  PALETTE_COLOR_EIGHT,
+  PALETTE_COLORS
+} UENUM1BYTE(PALETTE_COLOR);
+
+// Note: All directional predictors must be between V_PRED and D67_PRED (both
+// inclusive).
+enum { + DC_PRED, // Average of above and left pixels + V_PRED, // Vertical + H_PRED, // Horizontal + D45_PRED, // Directional 45 degree + D135_PRED, // Directional 135 degree + D113_PRED, // Directional 113 degree + D157_PRED, // Directional 157 degree + D203_PRED, // Directional 203 degree + D67_PRED, // Directional 67 degree + SMOOTH_PRED, // Combination of horizontal and vertical interpolation + SMOOTH_V_PRED, // Vertical interpolation + SMOOTH_H_PRED, // Horizontal interpolation + PAETH_PRED, // Predict from the direction of smallest gradient + NEARESTMV, + NEARMV, + GLOBALMV, + NEWMV, + // Compound ref compound modes + NEAREST_NEARESTMV, + NEAR_NEARMV, + NEAREST_NEWMV, + NEW_NEARESTMV, + NEAR_NEWMV, + NEW_NEARMV, + GLOBAL_GLOBALMV, + NEW_NEWMV, + MB_MODE_COUNT, + PRED_MODE_INVALID = MB_MODE_COUNT, + INTRA_MODE_START = DC_PRED, + INTRA_MODE_END = NEARESTMV, + DIR_MODE_START = V_PRED, + DIR_MODE_END = D67_PRED + 1, + INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START, + SINGLE_INTER_MODE_START = NEARESTMV, + SINGLE_INTER_MODE_END = NEAREST_NEARESTMV, + SINGLE_INTER_MODE_NUM = SINGLE_INTER_MODE_END - SINGLE_INTER_MODE_START, + COMP_INTER_MODE_START = NEAREST_NEARESTMV, + COMP_INTER_MODE_END = MB_MODE_COUNT, + COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START, + INTER_MODE_START = NEARESTMV, + INTER_MODE_END = MB_MODE_COUNT, + INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode. + INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks +} UENUM1BYTE(PREDICTION_MODE); + +// TODO(ltrudeau) Do we really want to pack this? +// TODO(ltrudeau) Do we match with PREDICTION_MODE? 
+enum { + UV_DC_PRED, // Average of above and left pixels + UV_V_PRED, // Vertical + UV_H_PRED, // Horizontal + UV_D45_PRED, // Directional 45 degree + UV_D135_PRED, // Directional 135 degree + UV_D113_PRED, // Directional 113 degree + UV_D157_PRED, // Directional 157 degree + UV_D203_PRED, // Directional 203 degree + UV_D67_PRED, // Directional 67 degree + UV_SMOOTH_PRED, // Combination of horizontal and vertical interpolation + UV_SMOOTH_V_PRED, // Vertical interpolation + UV_SMOOTH_H_PRED, // Horizontal interpolation + UV_PAETH_PRED, // Predict from the direction of smallest gradient + UV_CFL_PRED, // Chroma-from-Luma + UV_INTRA_MODES, + UV_MODE_INVALID, // For uv_mode in inter blocks +} UENUM1BYTE(UV_PREDICTION_MODE); + +// Number of top model rd to store for pruning y modes in intra mode decision +#define TOP_INTRA_MODEL_COUNT 4 +// Total number of luma intra prediction modes (include both directional and +// non-directional modes) +// Because there are 8 directional modes, each has additional 6 delta angles. 
+#define LUMA_MODE_COUNT (PAETH_PRED - DC_PRED + 1 + 6 * 8) + +enum { + SIMPLE_TRANSLATION, + OBMC_CAUSAL, // 2-sided OBMC + WARPED_CAUSAL, // 2-sided WARPED + MOTION_MODES +} UENUM1BYTE(MOTION_MODE); + +enum { + II_DC_PRED, + II_V_PRED, + II_H_PRED, + II_SMOOTH_PRED, + INTERINTRA_MODES +} UENUM1BYTE(INTERINTRA_MODE); + +enum { + COMPOUND_AVERAGE, + COMPOUND_DISTWTD, + COMPOUND_WEDGE, + COMPOUND_DIFFWTD, + COMPOUND_TYPES, + MASKED_COMPOUND_TYPES = 2, +} UENUM1BYTE(COMPOUND_TYPE); + +enum { + FILTER_DC_PRED, + FILTER_V_PRED, + FILTER_H_PRED, + FILTER_D157_PRED, + FILTER_PAETH_PRED, + FILTER_INTRA_MODES, +} UENUM1BYTE(FILTER_INTRA_MODE); + +enum { + SEQ_LEVEL_2_0, + SEQ_LEVEL_2_1, + SEQ_LEVEL_2_2, + SEQ_LEVEL_2_3, + SEQ_LEVEL_3_0, + SEQ_LEVEL_3_1, + SEQ_LEVEL_3_2, + SEQ_LEVEL_3_3, + SEQ_LEVEL_4_0, + SEQ_LEVEL_4_1, + SEQ_LEVEL_4_2, + SEQ_LEVEL_4_3, + SEQ_LEVEL_5_0, + SEQ_LEVEL_5_1, + SEQ_LEVEL_5_2, + SEQ_LEVEL_5_3, + SEQ_LEVEL_6_0, + SEQ_LEVEL_6_1, + SEQ_LEVEL_6_2, + SEQ_LEVEL_6_3, + SEQ_LEVEL_7_0, + SEQ_LEVEL_7_1, + SEQ_LEVEL_7_2, + SEQ_LEVEL_7_3, + SEQ_LEVEL_8_0, + SEQ_LEVEL_8_1, + SEQ_LEVEL_8_2, + SEQ_LEVEL_8_3, + SEQ_LEVELS, + SEQ_LEVEL_MAX = 31, + SEQ_LEVEL_KEEP_STATS = 32, +} UENUM1BYTE(AV1_LEVEL); + +#define LEVEL_BITS 5 + +#define DIRECTIONAL_MODES 8 +#define MAX_ANGLE_DELTA 3 +#define ANGLE_STEP 3 + +#define INTER_MODES (1 + NEWMV - NEARESTMV) + +#define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV) + +#define SKIP_CONTEXTS 3 +#define SKIP_MODE_CONTEXTS 3 + +#define COMP_INDEX_CONTEXTS 6 +#define COMP_GROUP_IDX_CONTEXTS 6 + +#define NMV_CONTEXTS 3 + +#define NEWMV_MODE_CONTEXTS 6 +#define GLOBALMV_MODE_CONTEXTS 2 +#define REFMV_MODE_CONTEXTS 6 +#define DRL_MODE_CONTEXTS 3 + +#define GLOBALMV_OFFSET 3 +#define REFMV_OFFSET 4 + +#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1) +#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1) +#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1) + +#define COMP_NEWMV_CTXS 5 +#define 
INTER_MODE_CONTEXTS 8 + +#define DELTA_Q_SMALL 3 +#define DELTA_Q_PROBS (DELTA_Q_SMALL) +#define DEFAULT_DELTA_Q_RES_PERCEPTUAL 4 +#define DEFAULT_DELTA_Q_RES_OBJECTIVE 4 +#define DEFAULT_DELTA_Q_RES_DUCKY_ENCODE 4 + +#define DELTA_LF_SMALL 3 +#define DELTA_LF_PROBS (DELTA_LF_SMALL) +#define DEFAULT_DELTA_LF_RES 2 + +/* Segment Feature Masks */ +#define MAX_MV_REF_CANDIDATES 2 + +#define MAX_REF_MV_STACK_SIZE 8 +#define USABLE_REF_MV_STACK_SIZE 4 +#define REF_CAT_LEVEL 640 + +#define INTRA_INTER_CONTEXTS 4 +#define COMP_INTER_CONTEXTS 5 +#define REF_CONTEXTS 3 + +#define COMP_REF_TYPE_CONTEXTS 5 +#define UNI_COMP_REF_CONTEXTS 3 + +#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3) +typedef uint8_t TXFM_CONTEXT; + +// An enum for single reference types (and some derived values). +enum { + NONE_FRAME = -1, + INTRA_FRAME, + LAST_FRAME, + LAST2_FRAME, + LAST3_FRAME, + GOLDEN_FRAME, + BWDREF_FRAME, + ALTREF2_FRAME, + ALTREF_FRAME, + REF_FRAMES, + + // Extra/scratch reference frame. It may be: + // - used to update the ALTREF2_FRAME ref (see lshift_bwd_ref_frames()), or + // - updated from ALTREF2_FRAME ref (see rshift_bwd_ref_frames()). + EXTREF_FRAME = REF_FRAMES, + + // Number of inter (non-intra) reference types. + INTER_REFS_PER_FRAME = ALTREF_FRAME - LAST_FRAME + 1, + + // Number of forward (aka past) reference types. + FWD_REFS = GOLDEN_FRAME - LAST_FRAME + 1, + + // Number of backward (aka future) reference types. + BWD_REFS = ALTREF_FRAME - BWDREF_FRAME + 1, + + SINGLE_REFS = FWD_REFS + BWD_REFS, +}; + +#define REF_FRAMES_LOG2 3 + +// REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new +// frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the +// encoder in the cpi->scaled_ref_buf array. +// The encoder uses FRAME_BUFFERS only in GOOD and REALTIME encoding modes. +// The decoder also uses FRAME_BUFFERS. 
+#define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME)
+
+// During allintra encoding, one reference frame buffer is free to be used again
+// only after another frame buffer is stored as the reference frame. Hence, it
+// is necessary and sufficient to maintain only two reference frame buffers in
+// this case.
+#define FRAME_BUFFERS_ALLINTRA 2
+
+// Zero-based offset of |ref| from LAST_FRAME / BWDREF_FRAME respectively.
+// The argument is parenthesized so that any expression can be passed safely.
+#define FWD_RF_OFFSET(ref) ((ref) - LAST_FRAME)
+#define BWD_RF_OFFSET(ref) ((ref) - BWDREF_FRAME)
+
+// Select all the decoded frame buffer slots
+#define SELECT_ALL_BUF_SLOTS 0xFF
+
+enum {
+  LAST_LAST2_FRAMES,      // { LAST_FRAME, LAST2_FRAME }
+  LAST_LAST3_FRAMES,      // { LAST_FRAME, LAST3_FRAME }
+  LAST_GOLDEN_FRAMES,     // { LAST_FRAME, GOLDEN_FRAME }
+  BWDREF_ALTREF_FRAMES,   // { BWDREF_FRAME, ALTREF_FRAME }
+  LAST2_LAST3_FRAMES,     // { LAST2_FRAME, LAST3_FRAME }
+  LAST2_GOLDEN_FRAMES,    // { LAST2_FRAME, GOLDEN_FRAME }
+  LAST3_GOLDEN_FRAMES,    // { LAST3_FRAME, GOLDEN_FRAME }
+  BWDREF_ALTREF2_FRAMES,  // { BWDREF_FRAME, ALTREF2_FRAME }
+  ALTREF2_ALTREF_FRAMES,  // { ALTREF2_FRAME, ALTREF_FRAME }
+  TOTAL_UNIDIR_COMP_REFS,
+  // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs
+  //       that are explicitly signaled.
+  UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1,
+} UENUM1BYTE(UNIDIR_COMP_REF);
+
+#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS)
+
+#define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS)
+
+// NOTE: A limited number of unidirectional reference pairs can be signalled for
+//       compound prediction. The use of skip mode, on the other hand, makes it
+//       possible to have a reference pair not listed for explicit signaling.
+#define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS)
+
+// Note: It includes single and compound references. So, it can take values from
+// NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not defined as an enum.
+typedef int8_t MV_REFERENCE_FRAME; + +/*!\endcond */ + +/*!\enum RestorationType + * \brief This enumeration defines various restoration types supported + */ +typedef enum { + RESTORE_NONE, /**< No restoration */ + RESTORE_WIENER, /**< Separable Wiener restoration */ + RESTORE_SGRPROJ, /**< Selfguided restoration */ + RESTORE_SWITCHABLE, /**< Switchable restoration */ + RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE, /**< Num Switchable types */ + RESTORE_TYPES = 4, /**< Num Restore types */ +} RestorationType; + +/*!\cond */ +// Picture prediction structures (0-13 are predefined) in scalability metadata. +enum { + SCALABILITY_L1T2 = 0, + SCALABILITY_L1T3 = 1, + SCALABILITY_L2T1 = 2, + SCALABILITY_L2T2 = 3, + SCALABILITY_L2T3 = 4, + SCALABILITY_S2T1 = 5, + SCALABILITY_S2T2 = 6, + SCALABILITY_S2T3 = 7, + SCALABILITY_L2T1h = 8, + SCALABILITY_L2T2h = 9, + SCALABILITY_L2T3h = 10, + SCALABILITY_S2T1h = 11, + SCALABILITY_S2T2h = 12, + SCALABILITY_S2T3h = 13, + SCALABILITY_SS = 14 +} UENUM1BYTE(SCALABILITY_STRUCTURES); + +#define SUPERRES_SCALE_BITS 3 +#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1) + +// In large_scale_tile coding, external references are used. +#define MAX_EXTERNAL_REFERENCES 128 +#define MAX_TILES 512 + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENUMS_H_ diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h new file mode 100644 index 0000000000..4344aea916 --- /dev/null +++ b/third_party/aom/av1/common/filter.h @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_FILTER_H_
+#define AOM_AV1_COMMON_FILTER_H_
+
+// Required by the assert() calls in the inline helpers of this header.
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_FILTER_TAP 12
+
+typedef enum ATTRIBUTE_PACKED {
+  EIGHTTAP_REGULAR,
+  EIGHTTAP_SMOOTH,
+  MULTITAP_SHARP,
+  BILINEAR,
+  // Encoder side only filters
+  MULTITAP_SHARP2,
+
+  INTERP_FILTERS_ALL,
+  SWITCHABLE_FILTERS = BILINEAR,
+  SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
+  EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
+  INTERP_INVALID = 0xff,
+} InterpFilter;
+
+enum {
+  USE_2_TAPS_ORIG = 0,  // This is used in temporal filtering.
+  USE_2_TAPS,
+  USE_4_TAPS,
+  USE_8_TAPS,
+} UENUM1BYTE(SUBPEL_SEARCH_TYPE);
+
+enum {
+  INTERP_EVAL_LUMA_EVAL_CHROMA = 0,
+  INTERP_SKIP_LUMA_EVAL_CHROMA,
+  INTERP_EVAL_INVALID,
+  INTERP_SKIP_LUMA_SKIP_CHROMA,
+} UENUM1BYTE(INTERP_EVAL_PLANE);
+
+enum {
+  INTERP_HORZ_NEQ_VERT_NEQ = 0,
+  INTERP_HORZ_EQ_VERT_NEQ,
+  INTERP_HORZ_NEQ_VERT_EQ,
+  INTERP_HORZ_EQ_VERT_EQ,
+  INTERP_PRED_TYPE_ALL,
+} UENUM1BYTE(INTERP_PRED_TYPE);
+// Pack two InterpFilter's into a uint32_t: since there are at most 10 filters,
+// we can use 16 bits for each and have more than enough space. This reduces
+// argument passing and unifies the operation of setting a (pair of) filters.
+typedef struct InterpFilters {
+  uint16_t y_filter;
+  uint16_t x_filter;
+} InterpFilters;
+
+// |as_int| overlays both 16-bit fields of |as_filters|.
+typedef union int_interpfilters {
+  uint32_t as_int;
+  InterpFilters as_filters;
+} int_interpfilters;
+
+// Returns the x filter of |filters| when |dir| is non-zero, otherwise the y
+// filter.
+static INLINE InterpFilter av1_extract_interp_filter(int_interpfilters filters,
+                                                     int dir) {
+  return (InterpFilter)((dir) ? filters.as_filters.x_filter
+                              : filters.as_filters.y_filter);
+}
+
+// Builds a filter pair that uses |filter| for both x and y.
+static INLINE int_interpfilters
+av1_broadcast_interp_filter(InterpFilter filter) {
+  int_interpfilters filters;
+  filters.as_filters.x_filter = filter;
+  filters.as_filters.y_filter = filter;
+  return filters;
+}
+
+// Maps SWITCHABLE to EIGHTTAP_REGULAR and returns every other value unchanged.
+// NOTE(review): SWITCHABLE and MULTITAP_SHARP2 share the value 4 in
+// InterpFilter, so MULTITAP_SHARP2 is mapped to EIGHTTAP_REGULAR as well;
+// confirm that callers never pass it here.
+static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
+  return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter;
+}
+
+/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#define LOG_SWITCHABLE_FILTERS 2
+
+#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
+#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
+#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
+#define ALLOW_ALL_INTERP_FILT_MASK (0x01ff)
+
+// Describes one filter: |filter_ptr| points at the kernel table, |taps| is
+// the number of coefficients per kernel.
+typedef struct InterpFilterParams {
+  const int16_t *filter_ptr;
+  uint16_t taps;
+  InterpFilter interp_filter;
+} InterpFilterParams;
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_bilinear_filters[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 },
+  { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
+  { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 },
+  { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 },
+  { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 },
+  { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 },
+  { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 },
+  { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
+  { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
+  { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
+  { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
+  { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
+  { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
+ { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 }, + { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 }, + { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 }, + { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 }, + { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 }, + { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 }, + { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 }, + { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 }, + { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 }, + { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, + { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, + { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 }, + { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 }, + { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, + { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, + { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 } +}; + +DECLARE_ALIGNED(256, static const int16_t, + av1_sub_pel_filters_12sharp[SUBPEL_SHIFTS][12]) = { + { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }, + { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 }, + { -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1 }, + { -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1 }, + { -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1 }, + { -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2 }, + { -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2 }, + { -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2 }, + { -2, 4, -7, 
13, -25, 81, 81, -25, 13, -7, 4, -2 }, + { -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2 }, + { -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2 }, + { -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2 }, + { -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1 }, + { -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1 }, + { -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1 }, + { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 } +}; + +static const InterpFilterParams + av1_interp_filter_params_list[INTERP_FILTERS_ALL] = { + { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, EIGHTTAP_REGULAR }, + { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS, + EIGHTTAP_SMOOTH }, + { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, + MULTITAP_SHARP }, + { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR }, + + // The following filters are for encoder only, and now they are used in + // temporal filtering. The predictor block size >= 16 in temporal filter. + { (const int16_t *)av1_sub_pel_filters_12sharp, 12, MULTITAP_SHARP2 }, + }; + +// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel +// MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV. 
+DECLARE_ALIGNED(256, static const int16_t, + av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { + 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const InterpFilterParams av1_intrabc_filter_params = { + av1_intrabc_bilinear_filter, 2, BILINEAR +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, + { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 }, + { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 }, + { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, + { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, + { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 }, + { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 }, + { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } +}; +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 }, + { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, + { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, + { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 }, + { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 }, + { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, + { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, + { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 } +}; + +static const uint16_t + av1_interp_dual_filt_mask[INTERP_PRED_TYPE_ALL - 2][SWITCHABLE_FILTERS] = { + { (1 << REG_REG) | (1 << SMOOTH_REG) | (1 << SHARP_REG), + (1 << REG_SMOOTH) | (1 << SMOOTH_SMOOTH) | (1 << SHARP_SMOOTH), + (1 << REG_SHARP) | (1 << SMOOTH_SHARP) | (1 << SHARP_SHARP) }, + { (1 << REG_REG) | (1 << REG_SMOOTH) | (1 << REG_SHARP), + (1 << SMOOTH_REG) | (1 << SMOOTH_SMOOTH) | (1 << 
SMOOTH_SHARP),
+        (1 << SHARP_REG) | (1 << SHARP_SMOOTH) | (1 << SHARP_SHARP) }
+    };
+
+// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
+static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
+  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR },
+  { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS,
+    EIGHTTAP_SMOOTH },
+  { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR },
+  { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR },
+};
+
+// Looks up the parameters for |interp_filter|. Blocks of width |w| <= 4 use
+// the av1_interp_4tap table unless the filter is MULTITAP_SHARP2; everything
+// else uses av1_interp_filter_params_list.
+static INLINE const InterpFilterParams *
+av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
+                                             const int w) {
+  const int use_4tap_table = (w <= 4) && (interp_filter != MULTITAP_SHARP2);
+  return use_4tap_table ? &av1_interp_4tap[interp_filter]
+                        : &av1_interp_filter_params_list[interp_filter];
+}
+
+// Returns the kernel table to use for |subpel_search|: the bilinear table for
+// USE_2_TAPS, the 4-tap variant of |interp_filter| for USE_4_TAPS, and the
+// regular table of |interp_filter| otherwise.
+static INLINE const int16_t *av1_get_interp_filter_kernel(
+    const InterpFilter interp_filter, int subpel_search) {
+  assert(subpel_search >= USE_2_TAPS);
+  if (subpel_search == USE_2_TAPS) {
+    return av1_interp_4tap[BILINEAR].filter_ptr;
+  }
+  if (subpel_search == USE_4_TAPS) {
+    return av1_interp_4tap[interp_filter].filter_ptr;
+  }
+  return av1_interp_filter_params_list[interp_filter].filter_ptr;
+}
+
+// Returns a pointer to the kernel for sub-pixel position |subpel|, i.e.
+// |taps| * |subpel| coefficients into the filter table.
+static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
+    const InterpFilterParams *const filter_params, const int subpel) {
+  const int coeff_offset = filter_params->taps * subpel;
+  return filter_params->filter_ptr + coeff_offset;
+}
+
+// Returns the filter parameters used for the given sub-pel search type, or
+// NULL (after asserting) for an unsupported value.
+static INLINE const InterpFilterParams *av1_get_filter(int subpel_search) {
+  assert(subpel_search >= USE_2_TAPS);
+
+  if (subpel_search == USE_2_TAPS) return &av1_interp_4tap[BILINEAR];
+  if (subpel_search == USE_4_TAPS) return &av1_interp_4tap[EIGHTTAP_REGULAR];
+  if (subpel_search == USE_8_TAPS)
+    return &av1_interp_filter_params_list[EIGHTTAP_REGULAR];
+
+  assert(0);
+  return NULL;
+}
+
+// Clears the bit for |filt_type| in |*allow_interp_mask| and drops any bit
+// outside ALLOW_ALL_INTERP_FILT_MASK.
+static INLINE void reset_interp_filter_allowed_mask(
+    uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+  const uint16_t all_but_type = (~(1 << filt_type)) & 0xffff;
+  const uint16_t keep_bits = all_but_type & ALLOW_ALL_INTERP_FILT_MASK;
+  *allow_interp_mask &= keep_bits;
+}
+
+// Sets the bit for |filt_type| in |*allow_interp_mask|.
+static INLINE void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask,
+                                                  DUAL_FILTER_TYPE filt_type) {
+  const int type_bit = 1 << filt_type;
+  *allow_interp_mask |= type_bit;
+}
+
+// Returns 1 when the bit for |filt_type| is set in |allow_interp_mask|,
+// otherwise 0.
+static INLINE uint8_t get_interp_filter_allowed_mask(
+    uint16_t allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+  const int shifted = allow_interp_mask >> filt_type;
+  return shifted & 1;
+}
+
+// Returns the effective tap count of the kernel selected by |subpel_qn|:
+// 12 for 12-tap filters, otherwise 8, 6, 4 or 2 depending on the outermost
+// pair of non-zero coefficients in the 8-entry kernel.
+static AOM_INLINE int get_filter_tap(
+    const InterpFilterParams *const filter_params, int subpel_qn) {
+  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_qn & SUBPEL_MASK);
+  if (filter_params->taps == 12) return 12;
+  // Walk inwards from the outer coefficient pair (0,7), then (1,6), (2,5).
+  for (int outer = 0; outer < 3; ++outer) {
+    if (kernel[outer] | kernel[7 - outer]) return 8 - 2 * outer;
+  }
+  return 2;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_FILTER_H_
diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c
new file mode 100644
index 0000000000..f10ccd5942
--- /dev/null
+++ b/third_party/aom/av1/common/frame_buffers.c
@@ -0,0 +1,98 @@
+/*
+ *
Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>  // assert()
+#include <string.h>  // memset()
+
+#include "av1/common/frame_buffers.h"
+#include "aom_mem/aom_mem.h"
+
+// Releases whatever |list| currently holds, then allocates a zero-initialized
+// array of AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS buffer slots.
+// Returns 0 on success and 1 if the array cannot be allocated, in which case
+// the list is left empty.
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+  assert(list != NULL);
+  av1_free_internal_frame_buffers(list);
+
+  list->num_internal_frame_buffers =
+      AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+  list->int_fb = (InternalFrameBuffer *)aom_calloc(
+      list->num_internal_frame_buffers, sizeof(*list->int_fb));
+  if (list->int_fb == NULL) {
+    list->num_internal_frame_buffers = 0;
+    return 1;
+  }
+  return 0;
+}
+
+// Frees the data of every slot and the slot array itself, and resets |list|
+// to the empty state.
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
+  int i;
+
+  assert(list != NULL);
+
+  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+    aom_free(list->int_fb[i].data);
+    list->int_fb[i].data = NULL;
+  }
+  aom_free(list->int_fb);
+  list->int_fb = NULL;
+  list->num_internal_frame_buffers = 0;
+}
+
+// Zero-fills the data of every slot that has a buffer and is not in use.
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
+  int i;
+
+  assert(list != NULL);
+
+  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+    if (list->int_fb[i].data && !list->int_fb[i].in_use)
+      memset(list->int_fb[i].data, 0, list->int_fb[i].size);
+  }
+}
+
+// Hands out the first slot that is not in use, growing its buffer to at least
+// |min_size| bytes when it is too small. On success fills |fb| (data, size,
+// priv), marks the slot as in use and returns 0. Returns -1 if |cb_priv| or
+// |fb| is NULL, if every slot is in use, or if the allocation fails.
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+                         aom_codec_frame_buffer_t *fb) {
+  int i;
+  InternalFrameBufferList *const int_fb_list =
+      (InternalFrameBufferList *)cb_priv;
+  // Reject invalid callback arguments instead of dereferencing them.
+  if (int_fb_list == NULL || fb == NULL) return -1;
+
+  // Find a free frame buffer.
+  for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
+    if (!int_fb_list->int_fb[i].in_use) break;
+  }
+
+  if (i == int_fb_list->num_internal_frame_buffers) return -1;
+
+  if (int_fb_list->int_fb[i].size < min_size) {
+    aom_free(int_fb_list->int_fb[i].data);
+    // The data must be zeroed to fix a valgrind error from the C loop filter
+    // due to access uninitialized memory in frame border. It could be
+    // skipped if border were totally removed.
+    int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size);
+    if (!int_fb_list->int_fb[i].data) {
+      // The old buffer is already freed; record that the slot is now empty.
+      int_fb_list->int_fb[i].size = 0;
+      return -1;
+    }
+    int_fb_list->int_fb[i].size = min_size;
+  }
+
+  fb->data = int_fb_list->int_fb[i].data;
+  fb->size = int_fb_list->int_fb[i].size;
+  int_fb_list->int_fb[i].in_use = 1;
+
+  // Set the frame buffer's private data to point at the internal frame buffer.
+  fb->priv = &int_fb_list->int_fb[i];
+  return 0;
+}
+
+// Marks the slot referenced by |fb->priv| as no longer in use; the buffer
+// itself is kept for reuse. |cb_priv| is unused. Returns 0 on success, or -1
+// if |fb| is NULL.
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) {
+  (void)cb_priv;
+  if (fb == NULL) return -1;
+  InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
+  if (int_fb) int_fb->in_use = 0;
+  return 0;
+}
diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h
new file mode 100644
index 0000000000..16188e51c7
--- /dev/null
+++ b/third_party/aom/av1/common/frame_buffers.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_ +#define AOM_AV1_COMMON_FRAME_BUFFERS_H_ + +#include "aom/aom_frame_buffer.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct InternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +} InternalFrameBuffer; + +typedef struct InternalFrameBufferList { + int num_internal_frame_buffers; + InternalFrameBuffer *int_fb; +} InternalFrameBufferList; + +// Initializes |list|. Returns 0 on success. +int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list); + +// Free any data allocated to the frame buffers. +void av1_free_internal_frame_buffers(InternalFrameBufferList *list); + +// Zeros all unused internal frame buffers. In particular, this zeros the +// frame borders. Call this function after a sequence header change to +// re-initialize the frame borders for the different width, height, or bit +// depth. +void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list); + +// Callback used by libaom to request an external frame buffer. |cb_priv| +// Callback private data, which points to an InternalFrameBufferList. +// |min_size| is the minimum size in bytes needed to decode the next frame. +// |fb| pointer to the frame buffer. +int av1_get_frame_buffer(void *cb_priv, size_t min_size, + aom_codec_frame_buffer_t *fb); + +// Callback used by libaom when there are no references to the frame buffer. +// |cb_priv| is not used. |fb| pointer to the frame buffer. +int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_FRAME_BUFFERS_H_ diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c new file mode 100644 index 0000000000..bff438f3c6 --- /dev/null +++ b/third_party/aom/av1/common/idct.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_ports/mem.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" +#include "av1/common/idct.h" + +int av1_get_tx_scale(const TX_SIZE tx_size) { + const int pels = tx_size_2d[tx_size]; + // Largest possible pels is 4096 (64x64). + return (pels > 256) + (pels > 1024); +} + +// NOTE: The implementation of all inverses need to be aware of the fact +// that input and output could be the same buffer. 
+ +// idct +void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd) { + if (eob > 1) + av1_highbd_iwht4x4_16_add(input, dest, stride, bd); + else + av1_highbd_iwht4x4_1_add(input, dest, stride, bd); +} + +void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + int eob = txfm_param->eob; + int bd = txfm_param->bd; + int lossless = txfm_param->lossless; + const int32_t *src = cast_to_int32(input); + const TX_TYPE tx_type = txfm_param->tx_type; + if (lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); + return; + } + + av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); +} + +void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + 
av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_64x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x64_c(src, 
CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_64x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); +} + +void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t 
*dest, + int stride, const TxfmParam *txfm_param) { + const int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + assert(tx_type == DCT_DCT); + av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, + TX_TYPE tx_type, int eob, int reduced_tx_set, + TxfmParam *txfm_param) { + (void)plane; + txfm_param->tx_type = tx_type; + txfm_param->tx_size = tx_size; + txfm_param->eob = eob; + txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id]; + txfm_param->bd = xd->bd; + txfm_param->is_hbd = is_cur_buf_hbd(xd); + txfm_param->tx_set_type = av1_get_ext_tx_set_type( + txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); +} + +void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_32X32: + av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param); + break; + case TX_16X16: + av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param); + break; + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_c(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_c(input, dest, stride, txfm_param); + break; + case TX_8X16: + av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param); + break; + case TX_16X8: + av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param); + break; + case TX_16X32: + av1_highbd_inv_txfm_add_16x32_c(input, dest, stride, txfm_param); + break; + case TX_32X16: + av1_highbd_inv_txfm_add_32x16_c(input, dest, stride, txfm_param); + break; + case TX_64X64: + av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param); + break; + 
case TX_32X64: + av1_highbd_inv_txfm_add_32x64_c(input, dest, stride, txfm_param); + break; + case TX_64X32: + av1_highbd_inv_txfm_add_64x32_c(input, dest, stride, txfm_param); + break; + case TX_16X64: + av1_highbd_inv_txfm_add_16x64_c(input, dest, stride, txfm_param); + break; + case TX_64X16: + av1_highbd_inv_txfm_add_64x16_c(input, dest, stride, txfm_param); + break; + case TX_4X4: + // this is like av1_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. + av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param); + break; + case TX_8X32: + av1_highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param); + break; + case TX_32X8: + av1_highbd_inv_txfm_add_32x8_c(input, dest, stride, txfm_param); + break; + default: assert(0 && "Invalid transform size"); break; + } +} + +void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_SIZE tx_size = txfm_param->tx_size; + DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]); + int tmp_stride = MAX_TX_SIZE; + int w = tx_size_wide[tx_size]; + int h = tx_size_high[tx_size]; + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { + tmp[r * tmp_stride + c] = dst[r * stride + c]; + } + } + + av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, + txfm_param); + + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { + dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c]; + } + } +} + +void av1_inverse_transform_block(const MACROBLOCKD *xd, + const tran_low_t *dqcoeff, int plane, + TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst, + int stride, int eob, int reduced_tx_set) { + if (!eob) return; + + assert(eob <= av1_get_max_eob(tx_size)); + + TxfmParam txfm_param; + 
init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set, + &txfm_param); + assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]); + + if (txfm_param.is_hbd) { + av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); + } else { + av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); + } +} diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h new file mode 100644 index 0000000000..004d25d49a --- /dev/null +++ b/third_party/aom/av1/common/idct.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_IDCT_H_ +#define AOM_AV1_COMMON_IDCT_H_ + +#include "config/aom_config.h" + +#include "av1/common/blockd.h" +#include "av1/common/common.h" +#include "av1/common/enums.h" +#include "aom_dsp/txfm_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*transform_1d)(const tran_low_t *, tran_low_t *); + +typedef struct { + transform_1d cols, rows; // vertical and horizontal +} transform_2d; + +#define MAX_TX_SCALE 1 +int av1_get_tx_scale(const TX_SIZE tx_size); + +void av1_inverse_transform_block(const MACROBLOCKD *xd, + const tran_low_t *dqcoeff, int plane, + TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst, + int stride, int eob, int reduced_tx_set); +void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); + +static INLINE const int32_t *cast_to_int32(const tran_low_t *input) { + assert(sizeof(int32_t) == sizeof(tran_low_t)); + return (const int32_t *)input; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_IDCT_H_ diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h new file mode 100644 index 0000000000..6828834e05 --- /dev/null +++ b/third_party/aom/av1/common/mv.h @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_MV_H_ +#define AOM_AV1_COMMON_MV_H_ + +#include + +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/flow_estimation/flow_estimation.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define INVALID_MV 0x80008000 +#define INVALID_MV_ROW_COL -32768 +#define GET_MV_RAWPEL(x) (((x) + 3 + ((x) >= 0)) >> 3) +#define GET_MV_SUBPEL(x) ((x)*8) + +#define MARK_MV_INVALID(mv) \ + do { \ + ((int_mv *)(mv))->as_int = INVALID_MV; \ + } while (0) +#define CHECK_MV_EQUAL(x, y) (((x).row == (y).row) && ((x).col == (y).col)) + +// The motion vector in units of full pixel +typedef struct fullpel_mv { + int16_t row; + int16_t col; +} FULLPEL_MV; + +// The motion vector in units of 1/8-pel +typedef struct mv { + int16_t row; + int16_t col; +} MV; + +static const MV kZeroMv = { 0, 0 }; +static const FULLPEL_MV kZeroFullMv = { 0, 0 }; + +typedef union int_mv { + uint32_t as_int; + MV as_mv; + FULLPEL_MV as_fullmv; +} int_mv; /* facilitates faster equality tests and copies */ + +typedef struct mv32 { + int32_t row; + int32_t col; +} MV32; + +// The mv limit for fullpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} FullMvLimits; + +// The mv limit for subpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} SubpelMvLimits; + +static AOM_INLINE FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) { + const FULLPEL_MV full_mv = { (int16_t)GET_MV_RAWPEL(subpel_mv->row), + (int16_t)GET_MV_RAWPEL(subpel_mv->col) }; + return full_mv; +} + +static AOM_INLINE MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) { + const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row), + (int16_t)GET_MV_SUBPEL(full_mv->col) }; + return subpel_mv; +} + +static AOM_INLINE void convert_fullmv_to_mv(int_mv *mv) { + mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv); +} + +// Bits of precision used for the model +#define WARPEDMODEL_PREC_BITS 16 + 
+#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS) +#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3)) + +// Bits of subpel precision for warped interpolation +#define WARPEDPIXEL_PREC_BITS 6 +#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS) + +#define WARP_PARAM_REDUCE_BITS 6 + +#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS) + +typedef struct { + int global_warp_allowed; + int local_warp_allowed; +} WarpTypesAllowed; + +// The order of values in the wmmat matrix below is best described +// by the affine transformation: +// [x' (m2 m3 m0 [x +// z . y' = m4 m5 m1 * y +// 1] 0 0 1) 1] +typedef struct { + int32_t wmmat[MAX_PARAMDIM]; + int16_t alpha, beta, gamma, delta; + TransformationType wmtype; + int8_t invalid; +} WarpedMotionParams; + +/* clang-format off */ +static const WarpedMotionParams default_warp_params = { + { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS) }, + 0, 0, 0, 0, + IDENTITY, + 0, +}; +/* clang-format on */ + +// The following constants describe the various precisions +// of different parameters in the global motion experiment. +// +// Given the general homography: +// [x' (a b c [x +// z . y' = d e f * y +// 1] g h i) 1] +// +// Constants using the name ALPHA here are related to parameters +// a, b, d, e. Constants using the name TRANS are related +// to parameters c and f. +// +// Anything ending in PREC_BITS is the number of bits of precision +// to maintain when converting from double to integer. +// +// The ABS parameters are used to create an upper and lower bound +// for each parameter. In other words, after a parameter is integerized +// it is clamped between -(1 << ABS_XXX_BITS) and (1 << ABS_XXX_BITS). +// +// XXX_PREC_DIFF and XXX_DECODE_FACTOR +// are computed once here to prevent repetitive +// computation on the decoder side. 
These are +// to allow the global motion parameters to be encoded in a lower +// precision than the warped model precision. This means that they +// need to be changed to warped precision when they are decoded. +// +// XX_MIN, XX_MAX are also computed to avoid repeated computation + +#define SUBEXPFIN_K 3 +#define GM_TRANS_PREC_BITS 6 +#define GM_ABS_TRANS_BITS 12 +#define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3) +#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS) +#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3) +#define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF) +#define GM_TRANS_ONLY_DECODE_FACTOR (1 << GM_TRANS_ONLY_PREC_DIFF) + +#define GM_ALPHA_PREC_BITS 15 +#define GM_ABS_ALPHA_BITS 12 +#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS) +#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF) + +#define GM_TRANS_MAX (1 << GM_ABS_TRANS_BITS) +#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS) + +#define GM_TRANS_MIN -GM_TRANS_MAX +#define GM_ALPHA_MIN -GM_ALPHA_MAX + +static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) { + const int bw = block_size_wide[bs]; + return mi_col * MI_SIZE + bw / 2 - 1; +} + +static INLINE int block_center_y(int mi_row, BLOCK_SIZE bs) { + const int bh = block_size_high[bs]; + return mi_row * MI_SIZE + bh / 2 - 1; +} + +static INLINE int convert_to_trans_prec(int allow_hp, int coor) { + if (allow_hp) + return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3); + else + return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2; +} +static INLINE void integer_mv_precision(MV *mv) { + int mod = (mv->row % 8); + if (mod != 0) { + mv->row -= mod; + if (abs(mod) > 4) { + if (mod > 0) { + mv->row += 8; + } else { + mv->row -= 8; + } + } + } + + mod = (mv->col % 8); + if (mod != 0) { + mv->col -= mod; + if (abs(mod) > 4) { + if (mod > 0) { + mv->col += 8; + } else { + mv->col -= 8; + } + } + } +} +// Convert a global motion vector 
into a motion vector at the centre of the +// given block. +// +// The resulting motion vector will have three fractional bits of precision. If +// allow_hp is zero, the bottom bit will always be zero. If CONFIG_AMVR and +// is_integer is true, the bottom three bits will be zero (so the motion vector +// represents an integer) +static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, + int allow_hp, BLOCK_SIZE bsize, + int mi_col, int mi_row, + int is_integer) { + int_mv res; + + if (gm->wmtype == IDENTITY) { + res.as_int = 0; + return res; + } + + const int32_t *mat = gm->wmmat; + int x, y, tx, ty; + + if (gm->wmtype == TRANSLATION) { + // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16) + // bits of fractional precision. The offset for a translation is stored in + // entries 0 and 1. For translations, all but the top three (two if + // cm->features.allow_high_precision_mv is false) fractional bits are always + // zero. + // + // After the right shifts, there are 3 fractional bits of precision. If + // allow_hp is false, the bottom bit is always zero (so we don't need a + // call to convert_to_trans_prec here) + // + // Note: There is an AV1 specification bug here: + // + // gm->wmmat[0] is supposed to be the horizontal translation, and so should + // go into res.as_mv.col, and gm->wmmat[1] is supposed to be the vertical + // translation and so should go into res.as_mv.row + // + // However, in the spec, these assignments are accidentally reversed, and so + // we must keep this incorrect logic to match the spec. 
+ // + // See also: https://crbug.com/aomedia/3328 + res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF; + res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF; + assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp)); + if (is_integer) { + integer_mv_precision(&res.as_mv); + } + return res; + } + + x = block_center_x(mi_col, bsize); + y = block_center_y(mi_row, bsize); + + if (gm->wmtype == ROTZOOM) { + assert(gm->wmmat[5] == gm->wmmat[2]); + assert(gm->wmmat[4] == -gm->wmmat[3]); + } + + const int xc = + (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0]; + const int yc = + mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1]; + tx = convert_to_trans_prec(allow_hp, xc); + ty = convert_to_trans_prec(allow_hp, yc); + + res.as_mv.row = ty; + res.as_mv.col = tx; + + if (is_integer) { + integer_mv_precision(&res.as_mv); + } + return res; +} + +static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) { + if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] && + gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) { + return ((!gm->wmmat[1] && !gm->wmmat[0]) ? 
IDENTITY : TRANSLATION); + } + if (gm->wmmat[2] == gm->wmmat[5] && gm->wmmat[3] == -gm->wmmat[4]) + return ROTZOOM; + else + return AFFINE; +} + +typedef struct candidate_mv { + int_mv this_mv; + int_mv comp_mv; +} CANDIDATE_MV; + +static INLINE int is_zero_mv(const MV *mv) { + return *((const uint32_t *)mv) == 0; +} + +static INLINE int is_equal_mv(const MV *a, const MV *b) { + return *((const uint32_t *)a) == *((const uint32_t *)b); +} + +static INLINE void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); +} + +static INLINE void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_MV_H_ diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c new file mode 100644 index 0000000000..d8889f3eb3 --- /dev/null +++ b/third_party/aom/av1/common/mvref_common.c @@ -0,0 +1,1501 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/mvref_common.h" +#include "av1/common/warped_motion.h" + +// Although we assign 32 bit integers, all the values are strictly under 14 +// bits. 
+static int div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, + 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092, + 1024, 963, 910, 862, 819, 780, 744, 712, + 682, 655, 630, 606, 585, 564, 546, 528 }; + +// TODO(jingning): Consider the use of lookup table for (num / den) +// altogether. +static AOM_INLINE void get_mv_projection(MV *output, MV ref, int num, int den) { + den = AOMMIN(den, MAX_FRAME_DISTANCE); + num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE) + : AOMMAX(num, -MAX_FRAME_DISTANCE); + const int mv_row = + ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14); + const int mv_col = + ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14); + const int clamp_max = MV_UPP - 1; + const int clamp_min = MV_LOW + 1; + output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max); + output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max); +} + +void av1_copy_frame_mvs(const AV1_COMMON *const cm, + const MB_MODE_INFO *const mi, int mi_row, int mi_col, + int x_mis, int y_mis) { + const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1); + MV_REF *frame_mvs = + cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); + x_mis = ROUND_POWER_OF_TWO(x_mis, 1); + y_mis = ROUND_POWER_OF_TWO(y_mis, 1); + int w, h; + + for (h = 0; h < y_mis; h++) { + MV_REF *mv = frame_mvs; + for (w = 0; w < x_mis; w++) { + mv->ref_frame = NONE_FRAME; + mv->mv.as_int = 0; + + for (int idx = 0; idx < 2; ++idx) { + MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx]; + if (ref_frame > INTRA_FRAME) { + int8_t ref_idx = cm->ref_frame_side[ref_frame]; + if (ref_idx) continue; + if ((abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT) || + (abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT)) + continue; + mv->ref_frame = ref_frame; + mv->mv.as_int = mi->mv[idx].as_int; + } + } + mv++; + } + frame_mvs += frame_mvs_stride; + } +} + +static AOM_INLINE void add_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2], + uint8_t 
*refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, + CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params, + uint16_t weight) { + if (!is_inter_block(candidate)) return; + assert(weight % 2 == 0); + int index, ref; + + if (rf[1] == NONE_FRAME) { + // single reference frame + for (ref = 0; ref < 2; ++ref) { + if (candidate->ref_frame[ref] == rf[0]) { + const int is_gm_block = + is_global_mv_block(candidate, gm_params[rf[0]].wmtype); + const int_mv this_refmv = + is_gm_block ? gm_mv_candidates[0] : get_block_mv(candidate, ref); + for (index = 0; index < *refmv_count; ++index) { + if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) { + ref_mv_weight[index] += weight; + break; + } + } + + // Add a new item to the list. + if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[index].this_mv = this_refmv; + ref_mv_weight[index] = weight; + ++(*refmv_count); + } + if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; + ++*ref_match_count; + } + } + } else { + // compound reference frame + if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) { + int_mv this_refmv[2]; + + for (ref = 0; ref < 2; ++ref) { + if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype)) + this_refmv[ref] = gm_mv_candidates[ref]; + else + this_refmv[ref] = get_block_mv(candidate, ref); + } + + for (index = 0; index < *refmv_count; ++index) { + if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) && + (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) { + ref_mv_weight[index] += weight; + break; + } + } + + // Add a new item to the list. 
+ if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[index].this_mv = this_refmv[0]; + ref_mv_stack[index].comp_mv = this_refmv[1]; + ref_mv_weight[index] = weight; + ++(*refmv_count); + } + if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; + ++*ref_match_count; + } + } +} + +static AOM_INLINE void scan_row_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col, + const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset, + int *processed_rows) { + int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); + end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]); + const int width_8x8 = mi_size_wide[BLOCK_8X8]; + const int width_16x16 = mi_size_wide[BLOCK_16X16]; + int col_offset = 0; + // TODO(jingning): Revisit this part after cb4x4 is stable. + if (abs(row_offset) > 1) { + col_offset = 1; + if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset; + } + const int use_step_16 = (xd->width >= 16); + MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride; + + for (int i = 0; i < end_mi;) { + const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i]; + const int candidate_bsize = candidate->bsize; + const int n4_w = mi_size_wide[candidate_bsize]; + int len = AOMMIN(xd->width, n4_w); + if (use_step_16) + len = AOMMAX(width_16x16, len); + else if (abs(row_offset) > 1) + len = AOMMAX(len, width_8x8); + + uint16_t weight = 2; + if (xd->width >= width_8x8 && xd->width <= n4_w) { + uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1, + mi_size_high[candidate_bsize]); + // Obtain range used in weight calculation. + weight = AOMMAX(weight, inc); + // Update processed rows. 
+ *processed_rows = inc - row_offset - 1; + } + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); + + i += len; + } +} + +static AOM_INLINE void scan_col_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, + const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset, + int *processed_cols) { + int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); + end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]); + const int n8_h_8 = mi_size_high[BLOCK_8X8]; + const int n8_h_16 = mi_size_high[BLOCK_16X16]; + int i; + int row_offset = 0; + if (abs(col_offset) > 1) { + row_offset = 1; + if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset; + } + const int use_step_16 = (xd->height >= 16); + + for (i = 0; i < end_mi;) { + const MB_MODE_INFO *const candidate = + xd->mi[(row_offset + i) * xd->mi_stride + col_offset]; + const int candidate_bsize = candidate->bsize; + const int n4_h = mi_size_high[candidate_bsize]; + int len = AOMMIN(xd->height, n4_h); + if (use_step_16) + len = AOMMAX(n8_h_16, len); + else if (abs(col_offset) > 1) + len = AOMMAX(len, n8_h_8); + + int weight = 2; + if (xd->height >= n8_h_8 && xd->height <= n4_h) { + int inc = AOMMIN(-max_col_offset + col_offset + 1, + mi_size_wide[candidate_bsize]); + // Obtain range used in weight calculation. + weight = AOMMAX(weight, inc); + // Update processed cols. 
+ *processed_cols = inc - col_offset - 1; + } + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); + + i += len; + } +} + +static AOM_INLINE void scan_blk_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const int mi_row, + const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset, + int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, + uint8_t *refmv_count) { + const TileInfo *const tile = &xd->tile; + POSITION mi_pos; + + mi_pos.row = row_offset; + mi_pos.col = col_offset; + + if (is_inside(tile, mi_col, mi_row, &mi_pos)) { + const MB_MODE_INFO *const candidate = + xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col]; + const int len = mi_size_wide[BLOCK_8X8]; + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, 2 * len); + } // Analyze a single 8x8 block motion information. 
+} + +static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int mi_row, int mi_col, int bs) { + const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; + const int mask_row = mi_row & (sb_mi_size - 1); + const int mask_col = mi_col & (sb_mi_size - 1); + + if (bs > mi_size_wide[BLOCK_64X64]) return 0; + + // In a split partition all apart from the bottom right has a top right + int has_tr = !((mask_row & bs) && (mask_col & bs)); + + // bs > 0 and bs is a power of 2 + assert(bs > 0 && !(bs & (bs - 1))); + + // For each 4x4 group of blocks, when the bottom right is decoded the blocks + // to the right have not been decoded therefore the bottom right does + // not have a top right + while (bs < sb_mi_size) { + if (mask_col & bs) { + if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) { + has_tr = 0; + break; + } + } else { + break; + } + bs <<= 1; + } + + // In a VERTICAL or VERTICAL_4 partition, all partition before the last one + // always have a top right (as the block above will have been decoded). + if (xd->width < xd->height) { + if (!xd->is_last_vertical_rect) has_tr = 1; + } + + // In a HORIZONTAL or HORIZONTAL_4 partition, partitions after the first one + // never have a top right (as the block to the right won't have been decoded). 
+ if (xd->width > xd->height) { + if (!xd->is_first_horizontal_rect) has_tr = 0; + } + + // The bottom left square of a Vertical A (in the old format) does + // not have a top right as it is decoded before the right hand + // rectangle of the partition + if (xd->mi[0]->partition == PARTITION_VERT_A) { + if (xd->width == xd->height) + if (mask_row & bs) has_tr = 0; + } + + return has_tr; +} + +static int check_sb_border(const int mi_row, const int mi_col, + const int row_offset, const int col_offset) { + const int sb_mi_size = mi_size_wide[BLOCK_64X64]; + const int row = mi_row & (sb_mi_size - 1); + const int col = mi_col & (sb_mi_size - 1); + + if (row + row_offset < 0 || row + row_offset >= sb_mi_size || + col + col_offset < 0 || col + col_offset >= sb_mi_size) + return 0; + + return 1; +} + +static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame, + int blk_row, int blk_col, int_mv *gm_mv_candidates, + uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], + int16_t *mode_context) { + POSITION mi_pos; + mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1; + mi_pos.col = (mi_col & 0x01) ? 
blk_col : blk_col + 1; + + if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0; + + const TPL_MV_REF *prev_frame_mvs = + cm->tpl_mvs + + ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) + + ((mi_col + mi_pos.col) >> 1); + if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0; + + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + + const uint16_t weight_unit = 1; // mi_size_wide[BLOCK_8X8]; + const int cur_frame_index = cm->cur_frame->order_hint; + const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]); + const int frame0_index = buf_0->order_hint; + const int cur_offset_0 = get_relative_dist(&cm->seq_params->order_hint_info, + cur_frame_index, frame0_index); + int idx; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + + int_mv this_refmv; + get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_0, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + + if (rf[1] == NONE_FRAME) { + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; + + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; + + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); + } + } else { + // Process compound inter mode + const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]); + const int frame1_index = buf_1->order_hint; + const int cur_offset_1 = get_relative_dist(&cm->seq_params->order_hint_info, + cur_frame_index, 
frame1_index); + int_mv comp_refmv; + get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_1, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 || + abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 || + abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) { + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int && + comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int) + break; + } + + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; + + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); + } + } + + return 1; +} + +static AOM_INLINE void process_compound_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2], + int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; + + for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { + if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { + ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; + ++ref_id_count[cmp_idx]; + } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[can_rf] != + cm->ref_frame_sign_bias[rf[cmp_idx]]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + 
ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; + ++ref_diff_count[cmp_idx]; + } + } + } +} + +static AOM_INLINE void process_single_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != + cm->ref_frame_sign_bias[ref_frame]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + int stack_idx; + for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) { + const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv; + if (this_mv.as_int == stack_mv.as_int) break; + } + + if (stack_idx == *refmv_count) { + ref_mv_stack[stack_idx].this_mv = this_mv; + + // TODO(jingning): Set an arbitrary small number here. The weight + // doesn't matter as long as it is properly initialized. 
+ ref_mv_weight[stack_idx] = 2; + ++(*refmv_count); + } + } + } +} + +static AOM_INLINE void setup_ref_mv_list( + const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, + uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, + int mi_row, int mi_col, int16_t *mode_context) { + const int bs = AOMMAX(xd->width, xd->height); + const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs); + MV_REFERENCE_FRAME rf[2]; + + const TileInfo *const tile = &xd->tile; + int max_row_offset = 0, max_col_offset = 0; + const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); + const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); + int processed_rows = 0; + int processed_cols = 0; + + av1_set_ref_frame(rf, ref_frame); + mode_context[ref_frame] = 0; + *refmv_count = 0; + + // Find valid maximum row/col offset. + if (xd->up_available) { + max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj; + + if (xd->height < mi_size_high[BLOCK_8X8]) + max_row_offset = -(2 << 1) + row_adj; + + max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset); + } + + if (xd->left_available) { + max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj; + + if (xd->width < mi_size_wide[BLOCK_8X8]) + max_col_offset = -(2 << 1) + col_adj; + + max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset); + } + + uint8_t col_match_count = 0; + uint8_t row_match_count = 0; + uint8_t newmv_count = 0; + + // Scan the first above row mode info. row_offset = -1; + if (abs(max_row_offset) >= 1) + scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &newmv_count, gm_mv_candidates, + max_row_offset, &processed_rows); + // Scan the first left column mode info. 
col_offset = -1; + if (abs(max_col_offset) >= 1) + scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &newmv_count, gm_mv_candidates, + max_col_offset, &processed_cols); + // Check top-right boundary + if (has_tr) + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack, + ref_mv_weight, &row_match_count, &newmv_count, + gm_mv_candidates, refmv_count); + + const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); + const uint8_t nearest_refmv_count = *refmv_count; + + // TODO(yunqing): for comp_search, do it for all 3 cases. + for (int idx = 0; idx < nearest_refmv_count; ++idx) + ref_mv_weight[idx] += REF_CAT_LEVEL; + + if (cm->features.allow_ref_frame_mvs) { + int is_available = 0; + const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height); + const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width); + const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]); + const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]); + + const int tpl_sample_pos[3][2] = { + { voffset, -2 }, + { voffset, hoffset }, + { voffset - 2, hoffset }, + }; + const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) && + (xd->height < mi_size_high[BLOCK_64X64]) && + (xd->width >= mi_size_wide[BLOCK_8X8]) && + (xd->width < mi_size_wide[BLOCK_64X64]); + + const int step_h = (xd->height >= mi_size_high[BLOCK_64X64]) + ? mi_size_high[BLOCK_16X16] + : mi_size_high[BLOCK_8X8]; + const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64]) + ? 
mi_size_wide[BLOCK_16X16] + : mi_size_wide[BLOCK_8X8]; + + for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) { + for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) { + int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, + blk_col, gm_mv_candidates, refmv_count, + ref_mv_stack, ref_mv_weight, mode_context); + if (blk_row == 0 && blk_col == 0) is_available = ret; + } + } + + if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + + for (int i = 0; i < 3 && allow_extension; ++i) { + const int blk_row = tpl_sample_pos[i][0]; + const int blk_col = tpl_sample_pos[i][1]; + + if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue; + add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col, + gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight, + mode_context); + } + } + + uint8_t dummy_newmv_count = 0; + + // Scan the second outer area. + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight, + &row_match_count, &dummy_newmv_count, gm_mv_candidates, + refmv_count); + + for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) { + const int row_offset = -(idx << 1) + 1 + row_adj; + const int col_offset = -(idx << 1) + 1 + col_adj; + + if (abs(row_offset) <= abs(max_row_offset) && + abs(row_offset) > processed_rows) + scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &dummy_newmv_count, + gm_mv_candidates, max_row_offset, &processed_rows); + + if (abs(col_offset) <= abs(max_col_offset) && + abs(col_offset) > processed_cols) + scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &dummy_newmv_count, + gm_mv_candidates, max_col_offset, &processed_cols); + } + + const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); + + switch (nearest_match) { + case 0: + if (ref_match_count >= 1) mode_context[ref_frame] |= 1; + if (ref_match_count == 1) + 
mode_context[ref_frame] |= (1 << REFMV_OFFSET); + else if (ref_match_count >= 2) + mode_context[ref_frame] |= (2 << REFMV_OFFSET); + break; + case 1: + mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3; + if (ref_match_count == 1) + mode_context[ref_frame] |= (3 << REFMV_OFFSET); + else if (ref_match_count >= 2) + mode_context[ref_frame] |= (4 << REFMV_OFFSET); + break; + case 2: + default: + if (newmv_count >= 1) + mode_context[ref_frame] |= 4; + else + mode_context[ref_frame] |= 5; + + mode_context[ref_frame] |= (5 << REFMV_OFFSET); + break; + } + + // Rank the likelihood and assign nearest and near mvs. + int len = nearest_refmv_count; + while (len > 0) { + int nr_len = 0; + for (int idx = 1; idx < len; ++idx) { + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; + nr_len = idx; + } + } + len = nr_len; + } + + len = *refmv_count; + while (len > nearest_refmv_count) { + int nr_len = nearest_refmv_count; + for (int idx = nearest_refmv_count + 1; idx < len; ++idx) { + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; + nr_len = idx; + } + } + len = nr_len; + } + + int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->width); + mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height); + mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row); + const int mi_size = AOMMIN(mi_width, mi_height); + if (rf[1] > NONE_FRAME) { + // TODO(jingning, 
yunqing): Refactor and consolidate the compound and + // single reference frame modes. Reduce unnecessary redundancy. + if (*refmv_count < MAX_MV_REF_CANDIDATES) { + int_mv ref_id[2][2], ref_diff[2][2]; + int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 }; + + for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) { + const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_wide[candidate->bsize]; + } + + for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) { + const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_high[candidate->bsize]; + } + + // Build up the compound mv predictor + int_mv comp_list[MAX_MV_REF_CANDIDATES][2]; + + for (int idx = 0; idx < 2; ++idx) { + int comp_idx = 0; + for (int list_idx = 0; + list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; + ++list_idx, ++comp_idx) + comp_list[comp_idx][idx] = ref_id[idx][list_idx]; + for (int list_idx = 0; + list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; + ++list_idx, ++comp_idx) + comp_list[comp_idx][idx] = ref_diff[idx][list_idx]; + for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx) + comp_list[comp_idx][idx] = gm_mv_candidates[idx]; + } + + if (*refmv_count) { + assert(*refmv_count == 1); + if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int && + comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) { + ref_mv_stack[*refmv_count].this_mv = comp_list[1][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1]; + } else { + ref_mv_stack[*refmv_count].this_mv = comp_list[0][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1]; + } + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; + } else { + for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) { + 
ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1]; + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; + } + } + } + + assert(*refmv_count >= 2); + + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + } + } else { + // Handle single reference frame extension + for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size && + *refmv_count < MAX_MV_REF_CANDIDATES;) { + const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack, ref_mv_weight); + idx += mi_size_wide[candidate->bsize]; + } + + for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size && + *refmv_count < MAX_MV_REF_CANDIDATES;) { + const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack, ref_mv_weight); + idx += mi_size_high[candidate->bsize]; + } + + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + } + + if (mv_ref_list != NULL) { + for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx) + mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int; + + for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count); + ++idx) { + mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int; + } + } + } +} + +void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, + MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], + CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE], + int_mv 
mv_ref_list[][MAX_MV_REF_CANDIDATES], + int_mv *global_mvs, int16_t *mode_context) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int_mv gm_mv[2]; + + if (ref_frame == INTRA_FRAME) { + gm_mv[0].as_int = gm_mv[1].as_int = 0; + if (global_mvs != NULL) { + global_mvs[ref_frame].as_int = INVALID_MV; + } + } else { + const BLOCK_SIZE bsize = mi->bsize; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + if (ref_frame < REF_FRAMES) { + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[ref_frame], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1].as_int = 0; + if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0]; + } else { + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[rf[0]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1] = gm_get_motion_vector(&cm->global_motion[rf[1]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + } + } + + setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame], + ref_mv_stack[ref_frame], ref_mv_weight[ref_frame], + mv_ref_list ? 
mv_ref_list[ref_frame] : NULL, gm_mv, mi_row, + mi_col, mode_context); +} + +void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, + int_mv *near_mv, int is_integer) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i].as_mv, allow_hp, is_integer); + } + *nearest_mv = mvlist[0]; + *near_mv = mvlist[1]; +} + +void av1_setup_frame_buf_refs(AV1_COMMON *cm) { + cm->cur_frame->order_hint = cm->current_frame.order_hint; + cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint; + cm->cur_frame->pyramid_level = cm->current_frame.pyramid_level; + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint; + cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME] = + buf->display_order_hint; + } + } +} + +void av1_setup_frame_sign_bias(AV1_COMMON *cm) { + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (cm->seq_params->order_hint_info.enable_order_hint && buf != NULL) { + const int ref_order_hint = buf->order_hint; + cm->ref_frame_sign_bias[ref_frame] = + (get_relative_dist(&cm->seq_params->order_hint_info, ref_order_hint, + (int)cm->current_frame.order_hint) <= 0) + ? 0 + : 1; + } else { + cm->ref_frame_sign_bias[ref_frame] = 0; + } + } +} + +#define MAX_OFFSET_WIDTH 64 +#define MAX_OFFSET_HEIGHT 0 + +static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, + int blk_col, MV mv, int sign_bias) { + const int base_blk_row = (blk_row >> 3) << 3; + const int base_blk_col = (blk_col >> 3) << 3; + + const int row_offset = (mv.row >= 0) ? 
(mv.row >> (4 + MI_SIZE_LOG2)) + : -((-mv.row) >> (4 + MI_SIZE_LOG2)); + + const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2)) + : -((-mv.col) >> (4 + MI_SIZE_LOG2)); + + const int row = + (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset; + const int col = + (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset; + + if (row < 0 || row >= (cm->mi_params.mi_rows >> 1) || col < 0 || + col >= (cm->mi_params.mi_cols >> 1)) + return 0; + + if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) || + row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) || + col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) || + col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3)) + return 0; + + *mi_r = row; + *mi_c = col; + + return 1; +} + +// Note: motion_filed_projection finds motion vectors of current frame's +// reference frame, and projects them to current frame. To make it clear, +// let's call current frame's reference frame as start frame. +// Call Start frame's reference frames as reference frames. +// Call ref_offset as frame distances between start frame and its reference +// frames. 
+static int motion_field_projection(AV1_COMMON *cm, + MV_REFERENCE_FRAME start_frame, int dir) { + TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; + int ref_offset[REF_FRAMES] = { 0 }; + + const RefCntBuffer *const start_frame_buf = + get_ref_frame_buf(cm, start_frame); + if (start_frame_buf == NULL) return 0; + + if (start_frame_buf->frame_type == KEY_FRAME || + start_frame_buf->frame_type == INTRA_ONLY_FRAME) + return 0; + + if (start_frame_buf->mi_rows != cm->mi_params.mi_rows || + start_frame_buf->mi_cols != cm->mi_params.mi_cols) + return 0; + + const int start_frame_order_hint = start_frame_buf->order_hint; + const unsigned int *const ref_order_hints = + &start_frame_buf->ref_order_hints[0]; + const int cur_order_hint = cm->cur_frame->order_hint; + int start_to_current_frame_offset = get_relative_dist( + &cm->seq_params->order_hint_info, start_frame_order_hint, cur_order_hint); + + for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) { + ref_offset[rf] = get_relative_dist(&cm->seq_params->order_hint_info, + start_frame_order_hint, + ref_order_hints[rf - LAST_FRAME]); + } + + if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset; + + MV_REF *mv_ref_base = start_frame_buf->mvs; + const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1; + const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1; + + for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) { + for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) { + MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col]; + MV fwd_mv = mv_ref->mv.as_mv; + + if (mv_ref->ref_frame > INTRA_FRAME) { + int_mv this_mv; + int mi_r, mi_c; + const int ref_frame_offset = ref_offset[mv_ref->ref_frame]; + + int pos_valid = + abs(ref_frame_offset) <= MAX_FRAME_DISTANCE && + ref_frame_offset > 0 && + abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE; + + if (pos_valid) { + get_mv_projection(&this_mv.as_mv, fwd_mv, + start_to_current_frame_offset, ref_frame_offset); + pos_valid = 
get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col, + this_mv.as_mv, dir >> 1); + } + + if (pos_valid) { + const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c; + + tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row; + tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col; + tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset; + } + } + } + } + + return 1; +} + +// cm->ref_frame_side is calculated here, and will be used in +// av1_copy_frame_mvs() to affect how mvs are copied. +void av1_calculate_ref_frame_side(AV1_COMMON *cm) { + const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; + + memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side)); + if (!order_hint_info->enable_order_hint) return; + + const int cur_order_hint = cm->cur_frame->order_hint; + + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + int order_hint = 0; + + if (buf != NULL) order_hint = buf->order_hint; + + if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0) + cm->ref_frame_side[ref_frame] = 1; + else if (order_hint == cur_order_hint) + cm->ref_frame_side[ref_frame] = -1; + } +} + +void av1_setup_motion_field(AV1_COMMON *cm) { + const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; + + if (!order_hint_info->enable_order_hint) return; + + TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; + int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) * + (cm->mi_params.mi_stride >> 1); + for (int idx = 0; idx < size; ++idx) { + tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV; + tpl_mvs_base[idx].ref_frame_offset = 0; + } + + const int cur_order_hint = cm->cur_frame->order_hint; + const RefCntBuffer *ref_buf[INTER_REFS_PER_FRAME]; + int ref_order_hint[INTER_REFS_PER_FRAME]; + + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + const int ref_idx = ref_frame - LAST_FRAME; + const RefCntBuffer *const 
buf = get_ref_frame_buf(cm, ref_frame); + int order_hint = 0; + + if (buf != NULL) order_hint = buf->order_hint; + + ref_buf[ref_idx] = buf; + ref_order_hint[ref_idx] = order_hint; + } + + int ref_stamp = MFMV_STACK_SIZE - 1; + + if (ref_buf[LAST_FRAME - LAST_FRAME] != NULL) { + const int alt_of_lst_order_hint = + ref_buf[LAST_FRAME - LAST_FRAME] + ->ref_order_hints[ALTREF_FRAME - LAST_FRAME]; + + const int is_lst_overlay = + (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]); + if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2); + --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[BWDREF_FRAME - LAST_FRAME], + cur_order_hint) > 0) { + if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[ALTREF2_FRAME - LAST_FRAME], + cur_order_hint) > 0) { + if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[ALTREF_FRAME - LAST_FRAME], + cur_order_hint) > 0 && + ref_stamp >= 0) + if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp; + + if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2); +} + +static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts, + int *pts_inref, int row_offset, int sign_r, + int col_offset, int sign_c) { + const int bw = block_size_wide[mbmi->bsize]; + const int bh = block_size_high[mbmi->bsize]; + const int x = col_offset * MI_SIZE + sign_c * bw / 2 - 1; + const int y = row_offset * MI_SIZE + sign_r * bh / 2 - 1; + + pts[0] = GET_MV_SUBPEL(x); + pts[1] = GET_MV_SUBPEL(y); + pts_inref[0] = pts[0] + mbmi->mv[0].as_mv.col; + pts_inref[1] = pts[1] + mbmi->mv[0].as_mv.row; +} + +// Select samples according to the motion vector difference. 
+// Filters the `len` candidate samples stored in pts / pts_inref (two ints per
+// sample) in place. For each sample the first component difference
+// (pts_inref - pts) is compared against mv->col and the second against
+// mv->row; the sample is kept when the sum of the two absolute deviations does
+// not exceed clamp(max(block width, block height), 16, 112). Kept samples are
+// compacted to the front of both arrays in their original order.
+// Returns the number of samples kept, but never less than 1: when every sample
+// is rejected nothing is copied and 1 is returned.
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
+                          BLOCK_SIZE bsize) {
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int thresh = clamp(AOMMAX(bw, bh), 16, 112);
+  uint8_t ret = 0;
+  assert(len <= LEAST_SQUARES_SAMPLES_MAX);
+
+  // Only keep the samples with MV differences within threshold.
+  for (int i = 0; i < len; ++i) {
+    const int diff = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) +
+                     abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row);
+    if (diff > thresh) continue;
+    // Skip the copy while no sample has been dropped yet (source == target).
+    if (ret != i) {
+      memcpy(pts + 2 * ret, pts + 2 * i, 2 * sizeof(pts[0]));
+      memcpy(pts_inref + 2 * ret, pts_inref + 2 * i, 2 * sizeof(pts_inref[0]));
+    }
+    ++ret;
+  }
+  // Keep at least 1 sample.
+  return AOMMAX(ret, 1);
+}
+
+// Note: Samples returned are at 1/8-pel precision
+// Samples are the neighbor block center point's coordinates relative to the
+// left-top pixel of current block.
+//
+// Scans, in this order, the row above, the column to the left, the top-left
+// neighbor and the top-right neighbor of the current block. A neighbor
+// contributes a sample only when it is single-reference
+// (ref_frame[1] == NONE_FRAME) and its ref_frame[0] matches the current
+// block's. Each sample is written through record_samples() and takes two ints
+// in pts and two in pts_inref.
+// Returns the number of samples written, capped at LEAST_SQUARES_SAMPLES_MAX
+// (the function returns as soon as the cap is reached).
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+                        int *pts_inref) {
+  const MB_MODE_INFO *const mbmi0 = xd->mi[0];
+  const int ref_frame = mbmi0->ref_frame[0];
+  const int up_available = xd->up_available;
+  const int left_available = xd->left_available;
+  uint8_t np = 0;
+  // do_tl / do_tr: whether the top-left / top-right neighbor still needs to be
+  // examined; cleared below when an above/left neighbor already extends past
+  // the corresponding corner.
+  int do_tl = 1;
+  int do_tr = 1;
+  const int mi_stride = xd->mi_stride;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  // scan the nearest above rows
+  if (up_available) {
+    const int mi_row_offset = -1;
+    const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride];
+    uint8_t superblock_width = mi_size_wide[mbmi->bsize];
+
+    if (xd->width <= superblock_width) {
+      // Handle "current block width <= above block width" case.
+      const int col_offset = -mi_col % superblock_width;
+
+      if (col_offset < 0) do_tl = 0;
+      if (col_offset + superblock_width > xd->width) do_tr = 0;
+
+      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+        record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
+        pts += 2;
+        pts_inref += 2;
+        if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+      }
+    } else {
+      // Handle "current block width > above block width" case.
+      // The step is the width of the neighbor just visited, so each above
+      // block is examined once.
+      for (int i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
+           i += superblock_width) {
+        mbmi = xd->mi[i + mi_row_offset * mi_stride];
+        superblock_width = mi_size_wide[mbmi->bsize];
+
+        if (mbmi->ref_frame[0] == ref_frame &&
+            mbmi->ref_frame[1] == NONE_FRAME) {
+          record_samples(mbmi, pts, pts_inref, 0, -1, i, 1);
+          pts += 2;
+          pts_inref += 2;
+          if (++np >= LEAST_SQUARES_SAMPLES_MAX)
+            return LEAST_SQUARES_SAMPLES_MAX;
+        }
+      }
+    }
+  }
+  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+  // scan the nearest left columns
+  if (left_available) {
+    const int mi_col_offset = -1;
+    const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
+    uint8_t superblock_height = mi_size_high[mbmi->bsize];
+
+    if (xd->height <= superblock_height) {
+      // Handle "current block height <= left block height" case.
+      const int row_offset = -mi_row % superblock_height;
+
+      if (row_offset < 0) do_tl = 0;
+
+      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+        record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1);
+        pts += 2;
+        pts_inref += 2;
+        np++;
+        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+      }
+    } else {
+      // Handle "current block height > left block height" case.
+      for (int i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row);
+           i += superblock_height) {
+        mbmi = xd->mi[mi_col_offset + i * mi_stride];
+        superblock_height = mi_size_high[mbmi->bsize];
+
+        if (mbmi->ref_frame[0] == ref_frame &&
+            mbmi->ref_frame[1] == NONE_FRAME) {
+          record_samples(mbmi, pts, pts_inref, i, 1, 0, -1);
+          pts += 2;
+          pts_inref += 2;
+          if (++np >= LEAST_SQUARES_SAMPLES_MAX)
+            return LEAST_SQUARES_SAMPLES_MAX;
+        }
+      }
+    }
+  }
+  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+  // Top-left block
+  if (do_tl && left_available && up_available) {
+    const int mi_row_offset = -1;
+    const int mi_col_offset = -1;
+    MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride];
+
+    if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+      record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1);
+      pts += 2;
+      pts_inref += 2;
+      if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+    }
+  }
+  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+  // Top-right block
+  if (do_tr &&
+      has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) {
+    const POSITION trb_pos = { -1, xd->width };
+    const TileInfo *const tile = &xd->tile;
+    if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
+      const int mi_row_offset = -1;
+      const int mi_col_offset = xd->width;
+      const MB_MODE_INFO *mbmi =
+          xd->mi[mi_col_offset + mi_row_offset * mi_stride];
+
+      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+        // Last candidate: the output pointers are not advanced any further.
+        record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1);
+        if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+      }
+    }
+  }
+  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+  return np;
+}
+
+// Decides whether skip mode may be used for the current frame and, if so,
+// which two references it uses. The result is stored in
+// cm->current_frame.skip_mode_info. Skip mode stays disabled when order hints
+// are off, the frame is intra-only, or the reference mode is
+// SINGLE_REFERENCE. Otherwise it is enabled with the nearest forward and
+// nearest backward reference, or, when there is no backward reference, with
+// the two nearest forward references (if two exist). The two indices are
+// always stored in ascending order.
+void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
+  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
+  SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
+
+  skip_mode_info->skip_mode_allowed = 0;
+  skip_mode_info->ref_frame_idx_0 = INVALID_IDX;
+  skip_mode_info->ref_frame_idx_1 = INVALID_IDX;
+
+  if (!order_hint_info->enable_order_hint || frame_is_intra_only(cm) ||
+      cm->current_frame.reference_mode == SINGLE_REFERENCE)
+    return;
+
+  const int cur_order_hint = cm->current_frame.order_hint;
+  // [0]: order hint of the best forward reference so far (-1 = none yet).
+  // [1]: order hint of the best backward reference so far (INT_MAX = none).
+  int ref_order_hints[2] = { -1, INT_MAX };
+  int ref_idx[2] = { INVALID_IDX, INVALID_IDX };
+
+  // Identify the nearest forward and backward references.
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
+    if (buf == NULL) continue;
+
+    const int ref_order_hint = buf->order_hint;
+    if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) <
+        0) {
+      // Forward reference
+      if (ref_order_hints[0] == -1 ||
+          get_relative_dist(order_hint_info, ref_order_hint,
+                            ref_order_hints[0]) > 0) {
+        ref_order_hints[0] = ref_order_hint;
+        ref_idx[0] = i;
+      }
+    } else if (get_relative_dist(order_hint_info, ref_order_hint,
+                                 cur_order_hint) > 0) {
+      // Backward reference
+      if (ref_order_hints[1] == INT_MAX ||
+          get_relative_dist(order_hint_info, ref_order_hint,
+                            ref_order_hints[1]) < 0) {
+        ref_order_hints[1] = ref_order_hint;
+        ref_idx[1] = i;
+      }
+    }
+  }
+
+  if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) {
+    // == Bi-directional prediction ==
+    skip_mode_info->skip_mode_allowed = 1;
+    skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+    skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+  } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) {
+    // == Forward prediction only ==
+    // Identify the second nearest forward reference.
+    // Slot [1] is reused here for that second forward reference (-1 = none).
+    ref_order_hints[1] = -1;
+    for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+      const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
+      if (buf == NULL) continue;
+
+      const int ref_order_hint = buf->order_hint;
+      if ((ref_order_hints[0] != -1 &&
+           get_relative_dist(order_hint_info, ref_order_hint,
+                             ref_order_hints[0]) < 0) &&
+          (ref_order_hints[1] == -1 ||
+           get_relative_dist(order_hint_info, ref_order_hint,
+                             ref_order_hints[1]) > 0)) {
+        // Second closest forward reference
+        ref_order_hints[1] = ref_order_hint;
+        ref_idx[1] = i;
+      }
+    }
+    if (ref_order_hints[1] != -1) {
+      skip_mode_info->skip_mode_allowed = 1;
+      skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+      skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+    }
+  }
+}
+
+// One entry per slot of cm->ref_frame_map, used by av1_set_frame_refs() to
+// order the reference buffers.
+typedef struct {
+  int map_idx;        // frame map index
+  RefCntBuffer *buf;  // frame buffer
+  int sort_idx;       // index based on the offset to be used for sorting
+} REF_FRAME_INFO;
+
+// Compares the sort_idx fields. If they are equal, then compares the map_idx
+// fields to break the tie. This ensures a stable sort.
+// qsort() comparator over REF_FRAME_INFO entries: orders by ascending
+// sort_idx and, for equal sort_idx, by ascending map_idx. Returns a negative,
+// zero or positive value in the usual qsort() convention.
+static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
+  // Keep the const qualifier of the qsort() arguments; the entries are only
+  // read here.
+  const REF_FRAME_INFO *info_a = (const REF_FRAME_INFO *)arg_a;
+  const REF_FRAME_INFO *info_b = (const REF_FRAME_INFO *)arg_b;
+
+  const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx;
+  if (sort_idx_diff != 0) return sort_idx_diff;
+  return info_a->map_idx - info_b->map_idx;
+}
+
+// Stores ref_info->map_idx in remapped_ref_idx[frame_idx]. frame_idx is a
+// reference index relative to LAST_FRAME and must lie in
+// [0, INTER_REFS_PER_FRAME).
+static AOM_INLINE void set_ref_frame_info(int *remapped_ref_idx, int frame_idx,
+                                          REF_FRAME_INFO *ref_info) {
+  assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
+
+  remapped_ref_idx[frame_idx] = ref_info->map_idx;
+}
+
+// Fills remapped_ref_idx[0..INTER_REFS_PER_FRAME-1] with frame-map indices for
+// all inter references, given the map indices of LAST_FRAME (lst_map_idx) and
+// GOLDEN_FRAME (gld_map_idx). Requires order hints to be enabled.
+//
+// Every buffer in cm->ref_frame_map gets a sort index derived from its order
+// hint relative to the current frame; the buffers are then sorted and split
+// into forward references (sort index below the current frame's) and backward
+// references. Backward references feed ALTREF, BWDREF and ALTREF2; the
+// remaining slots are filled from the forward references, latest first, and
+// any slot still empty falls back to the earliest forward reference.
+//
+// Reports AOM_CODEC_CORRUPT_FRAME through aom_internal_error() when LAST or
+// GOLDEN is missing or is not a forward reference.
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+                        int lst_map_idx, int gld_map_idx) {
+  int lst_frame_sort_idx = -1;
+  int gld_frame_sort_idx = -1;
+
+  assert(cm->seq_params->order_hint_info.enable_order_hint);
+  assert(cm->seq_params->order_hint_info.order_hint_bits_minus_1 >= 0);
+  const int cur_order_hint = (int)cm->current_frame.order_hint;
+  // Sort index assigned to the current frame; references are placed around it
+  // according to their relative distance.
+  const int cur_frame_sort_idx =
+      1 << cm->seq_params->order_hint_info.order_hint_bits_minus_1;
+
+  REF_FRAME_INFO ref_frame_info[REF_FRAMES];
+  // ref_flag_list[i] == 1 once reference (LAST_FRAME + i) has been assigned.
+  int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 };
+
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    const int map_idx = i;
+
+    ref_frame_info[i].map_idx = map_idx;
+    ref_frame_info[i].sort_idx = -1;
+
+    RefCntBuffer *const buf = cm->ref_frame_map[map_idx];
+    ref_frame_info[i].buf = buf;
+
+    // Empty slots keep sort_idx == -1 and end up at the front after sorting.
+    if (buf == NULL) continue;
+    // If this assertion fails, there is a reference leak.
+    assert(buf->ref_count > 0);
+
+    const int offset = (int)buf->order_hint;
+    ref_frame_info[i].sort_idx =
+        (offset == -1) ? -1
+                       : cur_frame_sort_idx +
+                             get_relative_dist(&cm->seq_params->order_hint_info,
+                                               offset, cur_order_hint);
+    assert(ref_frame_info[i].sort_idx >= -1);
+
+    if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx;
+    if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx;
+  }
+
+  // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference
+  // frames.
+  if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) {
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Inter frame requests a look-ahead frame as LAST");
+  }
+  if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) {
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Inter frame requests a look-ahead frame as GOLDEN");
+  }
+
+  // Sort ref frames based on their frame_offset values.
+  qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO),
+        compare_ref_frame_info);
+
+  // Identify forward and backward reference frames.
+  // Forward reference: offset < order_hint
+  // Backward reference: offset >= order_hint
+  int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1;
+
+  for (int i = 0; i < REF_FRAMES; i++) {
+    if (ref_frame_info[i].sort_idx == -1) {
+      fwd_start_idx++;
+      continue;
+    }
+
+    if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) {
+      fwd_end_idx = i - 1;
+      break;
+    }
+  }
+
+  int bwd_start_idx = fwd_end_idx + 1;
+  int bwd_end_idx = REF_FRAMES - 1;
+
+  // === Backward Reference Frames ===
+
+  // == ALTREF_FRAME ==
+  // Takes the backward reference with the largest sort index.
+  if (bwd_start_idx <= bwd_end_idx) {
+    set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME,
+                       &ref_frame_info[bwd_end_idx]);
+    ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1;
+    bwd_end_idx--;
+  }
+
+  // == BWDREF_FRAME ==
+  // Takes the backward reference with the smallest sort index.
+  if (bwd_start_idx <= bwd_end_idx) {
+    set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME,
+                       &ref_frame_info[bwd_start_idx]);
+    ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1;
+    bwd_start_idx++;
+  }
+
+  // == ALTREF2_FRAME ==
+  if (bwd_start_idx <= bwd_end_idx) {
+    set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME,
+                       &ref_frame_info[bwd_start_idx]);
+    ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1;
+  }
+
+  // === Forward Reference Frames ===
+
+  for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) {
+    // == LAST_FRAME ==
+    if (ref_frame_info[i].map_idx == lst_map_idx) {
+      set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME,
+                         &ref_frame_info[i]);
+      ref_flag_list[LAST_FRAME - LAST_FRAME] = 1;
+    }
+
+    // == GOLDEN_FRAME ==
+    if (ref_frame_info[i].map_idx == gld_map_idx) {
+      set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME,
+                         &ref_frame_info[i]);
+      ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1;
+    }
+  }
+
+  assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 &&
+         ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1);
+
+  // == LAST2_FRAME ==
+  // == LAST3_FRAME ==
+  // == BWDREF_FRAME ==
+  // == ALTREF2_FRAME ==
+  // == ALTREF_FRAME ==
+
+  // Set up the reference frames in the anti-chronological order.
+  static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = {
+    LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME
+  };
+
+  int ref_idx;
+  for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
+    const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
+
+    if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
+
+    // Skip the buffers already used for LAST and GOLDEN.
+    while (fwd_start_idx <= fwd_end_idx &&
+           (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx ||
+            ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) {
+      fwd_end_idx--;
+    }
+    if (fwd_start_idx > fwd_end_idx) break;
+
+    set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
+                       &ref_frame_info[fwd_end_idx]);
+    ref_flag_list[ref_frame - LAST_FRAME] = 1;
+
+    fwd_end_idx--;
+  }
+
+  // Assign all the remaining frame(s), if any, to the earliest reference
+  // frame.
+  for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
+    const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
+    if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
+    set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
+                       &ref_frame_info[fwd_start_idx]);
+    ref_flag_list[ref_frame - LAST_FRAME] = 1;
+  }
+
+  for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+    assert(ref_flag_list[i] == 1);
+  }
+}
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
new file mode 100644
index 0000000000..3ab784c1ed
--- /dev/null
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_
+#define AOM_AV1_COMMON_MVREF_COMMON_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MVREF_ROW_COLS 3
+
+// Set the upper limit of the motion vector component magnitude.
+// This would make a motion vector fit in 26 bits. Plus 3 bits for the
+// reference frame index. A tuple of motion vector can hence be stored within
+// 32 bit range for efficient load/store operations.
+#define REFMVS_LIMIT ((1 << 12) - 1)
+
+// (row, col) pair.
+typedef struct position {
+  int row;
+  int col;
+} POSITION;
+
+// clamp_mv_ref
+#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
+
+// Signed distance from order hint b to order hint a, computed modulo
+// 2^(order_hint_bits_minus_1 + 1) and sign-extended from that width.
+// Returns 0 when order hints are disabled.
+static INLINE int get_relative_dist(const OrderHintInfo *oh, int a, int b) {
+  if (!oh->enable_order_hint) return 0;
+
+  const int num_bits = oh->order_hint_bits_minus_1 + 1;
+
+  assert(num_bits >= 1);
+  assert(a >= 0 && a < (1 << num_bits));
+  assert(b >= 0 && b < (1 << num_bits));
+
+  const int sign_bit = 1 << (num_bits - 1);
+  const int raw = a - b;
+  // Low bits carry the magnitude; the sign bit contributes -sign_bit when set.
+  return (raw & (sign_bit - 1)) - (raw & sign_bit);
+}
+
+// Clamps *mv with clamp_mv() to the block-to-frame-edge distances stored in
+// xd, widened on each side by the block dimension (in sub-pel units) plus
+// MV_BORDER.
+static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
+  const int col_margin = GET_MV_SUBPEL(bw) + MV_BORDER;
+  const int row_margin = GET_MV_SUBPEL(bh) + MV_BORDER;
+  const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - col_margin,
+                                     xd->mb_to_right_edge + col_margin,
+                                     xd->mb_to_top_edge - row_margin,
+                                     xd->mb_to_bottom_edge + row_margin };
+  clamp_mv(mv, &mv_limits);
+}
+
+// Returns motion vector number which_mv of the candidate block.
+static INLINE int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) {
+  const int_mv selected = candidate->mv[which_mv];
+  return selected;
+}
+
+// Checks that the given mi_row, mi_col and search point
+// are inside the borders of the tile.
+// Returns 1 when (mi_row + mi_pos->row, mi_col + mi_pos->col) lies within
+// [mi_row_start, mi_row_end) x [mi_col_start, mi_col_end) of the tile, else 0.
+static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
+                            const POSITION *mi_pos) {
+  return !(mi_row + mi_pos->row < tile->mi_row_start ||
+           mi_col + mi_pos->col < tile->mi_col_start ||
+           mi_row + mi_pos->row >= tile->mi_row_end ||
+           mi_col + mi_pos->col >= tile->mi_col_end);
+}
+
+// Clamps row_offset so that mi_row + row_offset stays inside the tile rows.
+static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row,
+                                        int row_offset) {
+  return clamp(row_offset, tile->mi_row_start - mi_row,
+               tile->mi_row_end - mi_row - 1);
+}
+
+// Clamps col_offset so that mi_col + col_offset stays inside the tile columns.
+static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col,
+                                        int col_offset) {
+  return clamp(col_offset, tile->mi_col_start - mi_col,
+               tile->mi_col_end - mi_col - 1);
+}
+
+// Reduces the precision of *mv in place. With is_integer set the vector is
+// passed to integer_mv_precision(); otherwise, when allow_hp is 0, odd
+// components are moved one step towards zero so that they become even.
+static INLINE void lower_mv_precision(MV *mv, int allow_hp, int is_integer) {
+  if (is_integer) {
+    integer_mv_precision(mv);
+  } else {
+    if (!allow_hp) {
+      if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
+      if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
+    }
+  }
+}
+
+// Returns the index of the uni-directional compound reference pair matching
+// rf[0]/rf[1], or -1 when rf is single-reference, bi-directional, or not one
+// of the TOTAL_UNIDIR_COMP_REFS listed pairs.
+static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) {
+  // Single ref pred
+  if (rf[1] <= INTRA_FRAME) return -1;
+
+  // Bi-directional comp ref pred
+  if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1;
+
+  for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) {
+    if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx))
+      return ref_idx;
+  }
+  return -1;
+}
+
+// Maps a reference pair to a single reference-frame-type value. A single
+// reference maps to rf[0] itself; compound pairs map to values starting at
+// REF_FRAMES (bi-directional pairs first, then the uni-directional pairs
+// found by get_uni_comp_ref_idx()).
+static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
+  if (rf[1] > INTRA_FRAME) {
+    const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf);
+    if (uni_comp_ref_idx >= 0) {
+      assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx) <
+             MODE_CTX_REF_FRAMES);
+      return REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx;
+    } else {
+      return REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
+             BWD_RF_OFFSET(rf[1]) * FWD_REFS;
+    }
+  }
+
+  return rf[0];
+}
+
+// Reference pair for each compound reference-frame type, indexed by
+// (ref_frame_type - REF_FRAMES); see av1_set_ref_frame().
+// clang-format off
+static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = {
+  { LAST_FRAME, BWDREF_FRAME },  { LAST2_FRAME, BWDREF_FRAME },
+  { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME },
+
+  { LAST_FRAME, ALTREF2_FRAME },  { LAST2_FRAME, ALTREF2_FRAME },
+  { LAST3_FRAME, ALTREF2_FRAME }, { GOLDEN_FRAME, ALTREF2_FRAME },
+
+  { LAST_FRAME, ALTREF_FRAME },  { LAST2_FRAME, ALTREF_FRAME },
+  { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME },
+
+  { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME },
+  { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME },
+
+  // NOTE: Following reference frame pairs are not supported to be explicitly
+  //       signalled, but they are possibly chosen by the use of skip_mode,
+  //       which may use the most recent one-sided reference frame pair.
+  { LAST2_FRAME, LAST3_FRAME }, { LAST2_FRAME, GOLDEN_FRAME },
+  { LAST3_FRAME, GOLDEN_FRAME }, {BWDREF_FRAME, ALTREF2_FRAME},
+  { ALTREF2_FRAME, ALTREF_FRAME }
+};
+// clang-format on
+
+// Inverse of av1_ref_frame_type(): expands ref_frame_type into rf[0]/rf[1].
+// Values below REF_FRAMES are single references (rf[1] = NONE_FRAME); others
+// are looked up in ref_frame_map.
+static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf,
+                                     MV_REFERENCE_FRAME ref_frame_type) {
+  if (ref_frame_type >= REF_FRAMES) {
+    rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0];
+    rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1];
+  } else {
+    assert(ref_frame_type > NONE_FRAME);
+    rf[0] = ref_frame_type;
+    rf[1] = NONE_FRAME;
+  }
+}
+
+// Compound mode context, indexed by [refmv_ctx >> 1][min(newmv_ctx,
+// COMP_NEWMV_CTXS - 1)]; see av1_mode_context_analyzer().
+static uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = {
+  { 0, 1, 1, 1, 1 },
+  { 1, 2, 3, 4, 4 },
+  { 4, 4, 5, 6, 7 },
+};
+
+// Returns the mode context for the reference pair rf. For a single reference
+// this is the stored mode_context entry unchanged; for a compound pair the
+// NEWMV and REFMV sub-contexts are extracted from the entry and combined
+// through compound_mode_ctx_map.
+static INLINE int16_t av1_mode_context_analyzer(
+    const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) {
+  const int8_t ref_frame = av1_ref_frame_type(rf);
+
+  if (rf[1] <= INTRA_FRAME) return mode_context[ref_frame];
+
+  const int16_t newmv_ctx = mode_context[ref_frame] & NEWMV_CTX_MASK;
+  const int16_t refmv_ctx =
+      (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+  const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN(
+      newmv_ctx, COMP_NEWMV_CTXS - 1)];
+  return comp_ctx;
+}
+
+// Returns a context in {0, 1, 2} derived from whether the weights at ref_idx
+// and ref_idx + 1 reach REF_CAT_LEVEL: both reach it -> 0, only the first
+// -> 1, neither -> 2. The remaining combination (only the second reaches it)
+// falls through to 0.
+static INLINE uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) {
+  if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL &&
+      ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL)
+    return 0;
+
+  if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL &&
+      ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL)
+    return 1;
+
+  if (ref_mv_weight[ref_idx] < REF_CAT_LEVEL &&
+      ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL)
+    return 2;
+
+  return 0;
+}
+
+void av1_setup_frame_buf_refs(AV1_COMMON *cm);
+void av1_setup_frame_sign_bias(AV1_COMMON *cm);
+void av1_setup_skip_mode_allowed(AV1_COMMON *cm);
+void av1_calculate_ref_frame_side(AV1_COMMON *cm);
+void av1_setup_motion_field(AV1_COMMON *cm);
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+                        int lst_map_idx, int gld_map_idx);
+
+// Resets xd->neighbors_ref_counts and counts, per reference frame, how often
+// it is used by the above and left neighbors (both references of a compound
+// neighbor are counted). Neighbors that are unavailable or not inter-coded
+// are ignored.
+static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
+  av1_zero(xd->neighbors_ref_counts);
+
+  uint8_t *const ref_counts = xd->neighbors_ref_counts;
+
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Above neighbor
+  if (above_in_image && is_inter_block(above_mbmi)) {
+    ref_counts[above_mbmi->ref_frame[0]]++;
+    if (has_second_ref(above_mbmi)) {
+      ref_counts[above_mbmi->ref_frame[1]]++;
+    }
+  }
+
+  // Left neighbor
+  if (left_in_image && is_inter_block(left_mbmi)) {
+    ref_counts[left_mbmi->ref_frame[0]]++;
+    if (has_second_ref(left_mbmi)) {
+      ref_counts[left_mbmi->ref_frame[1]]++;
+    }
+  }
+}
+
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+                        const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+                        int x_mis, int y_mis);
+
+// The global_mvs output parameter points to an array of REF_FRAMES elements.
+// The caller may pass a null global_mvs if it does not need the global_mvs
+// output.
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                      MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+                      CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+                      uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE],
+                      int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+                      int_mv *global_mvs, int16_t *mode_context);
+
+// check a list of motion vectors by sad score using a number rows of pixels
+// above and a number cols of pixels in the left to select the one with best
+// score to use as ref motion vector
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+                           int_mv *near_mv, int is_integer);
+
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
+                          BLOCK_SIZE bsize);
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+                        int *pts_inref);
+
+#define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
+#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
+
+// Sets *ref_dv to the default intra block copy displacement: one superblock
+// (mib_size mi units) straight up when that row is still inside the tile,
+// otherwise one superblock plus INTRABC_DELAY_PIXELS to the left. The value
+// is produced in full-pel units and then converted with
+// convert_fullmv_to_mv().
+static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
+                                   int mib_size, int mi_row) {
+  if (mi_row - mib_size < tile->mi_row_start) {
+    ref_dv->as_fullmv.row = 0;
+    ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
+  } else {
+    ref_dv->as_fullmv.row = -MI_SIZE * mib_size;
+    ref_dv->as_fullmv.col = 0;
+  }
+  convert_fullmv_to_mv(ref_dv);
+}
+
+// Returns 1 when the displacement vector dv (1/8-pel units) is a legal intra
+// block copy source for the block of size bsize at (mi_row, mi_col), else 0.
+// The checks, in order: dv is whole-pel; the source block lies inside the
+// current tile; sub-8x8 chroma does not reach outside the tile; the source
+// lies far enough behind the current position in 64-pixel-unit order
+// (INTRABC_DELAY_SB64); and the wavefront constraint holds.
+static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
+                                  const MACROBLOCKD *xd, int mi_row, int mi_col,
+                                  BLOCK_SIZE bsize, int mib_size_log2) {
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int SCALE_PX_TO_MV = 8;
+  // Disallow subpixel for now
+  // SUBPEL_MASK is not the correct scale
+  if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1))))
+    return 0;
+
+  // All edges below are expressed in 1/8-pel units (pixels * SCALE_PX_TO_MV).
+  const TileInfo *const tile = &xd->tile;
+  // Is the source top-left inside the current tile?
+  const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row;
+  const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV;
+  if (src_top_edge < tile_top_edge) return 0;
+  const int src_left_edge = mi_col * MI_SIZE * SCALE_PX_TO_MV + dv.col;
+  const int tile_left_edge = tile->mi_col_start * MI_SIZE * SCALE_PX_TO_MV;
+  if (src_left_edge < tile_left_edge) return 0;
+  // Is the bottom right inside the current tile?
+  const int src_bottom_edge = (mi_row * MI_SIZE + bh) * SCALE_PX_TO_MV + dv.row;
+  const int tile_bottom_edge = tile->mi_row_end * MI_SIZE * SCALE_PX_TO_MV;
+  if (src_bottom_edge > tile_bottom_edge) return 0;
+  const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col;
+  const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV;
+  if (src_right_edge > tile_right_edge) return 0;
+
+  // Special case for sub 8x8 chroma cases, to prevent referring to chroma
+  // pixels outside current tile.
+  if (xd->is_chroma_ref && av1_num_planes(cm) > 1) {
+    const struct macroblockd_plane *const pd = &xd->plane[1];
+    if (bw < 8 && pd->subsampling_x)
+      if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0;
+    if (bh < 8 && pd->subsampling_y)
+      if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0;
+  }
+
+  // Is the bottom right within an already coded SB? Also consider additional
+  // constraints to facilitate HW decoder.
+  const int max_mib_size = 1 << mib_size_log2;
+  const int active_sb_row = mi_row >> mib_size_log2;
+  const int active_sb64_col = (mi_col * MI_SIZE) >> 6;
+  const int sb_size = max_mib_size * MI_SIZE;
+  // (edge >> 3) converts 1/8-pel back to pixels; -1 selects the last pixel
+  // inside the source block.
+  const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size;
+  const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6;
+  const int total_sb64_per_row =
+      ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1;
+  const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col;
+  const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col;
+  if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0;
+
+  // Wavefront constraint: use only top left area of frame for reference.
+  const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
+  const int wf_offset = gradient * (active_sb_row - src_sb_row);
+  if (src_sb_row > active_sb_row ||
+      src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
+    return 0;
+
+  return 1;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_MVREF_COMMON_H_
diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h
new file mode 100644
index 0000000000..b84034541e
--- /dev/null
+++ b/third_party/aom/av1/common/obmc.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_OBMC_H_
+#define AOM_AV1_COMMON_OBMC_H_
+
+// Callback invoked once per overlappable neighbor by the two iterators below.
+// rel_mi_row / rel_mi_col give the neighbor's position relative to the current
+// block in mi units, op_mi_size is the overlap length in mi units, dir is 0
+// for the above row and 1 for the left column, and fun_ctxt is the opaque
+// pointer handed to the iterator.
+typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_row,
+                                          int rel_mi_col, uint8_t op_mi_size,
+                                          int dir, MB_MODE_INFO *nb_mi,
+                                          void *fun_ctxt, const int num_planes);
+
+// Walks the neighbors in the row directly above the current block from left to
+// right and calls fun for each one accepted by is_neighbor_overlappable(),
+// stopping after nb_max calls or at the right edge of the block / frame.
+// Does nothing when xd->up_available is 0.
+static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
+                                                 MACROBLOCKD *xd, int nb_max,
+                                                 overlappable_nb_visitor_t fun,
+                                                 void *fun_ctxt) {
+  if (!xd->up_available) return;
+
+  const int num_planes = av1_num_planes(cm);
+  int nb_count = 0;
+  const int mi_col = xd->mi_col;
+  // prev_row_mi points into the mi array, starting at the beginning of the
+  // previous row.
+  MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
+  const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols);
+  uint8_t mi_step;
+  for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
+       above_mi_col += mi_step) {
+    MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
+    // Step by the neighbor's width, capped at the width of a 64x64 block.
+    mi_step =
+        AOMMIN(mi_size_wide[above_mi[0]->bsize], mi_size_wide[BLOCK_64X64]);
+    // If we're considering a block with width 4, it should be treated as
+    // half of a pair of blocks with chroma information in the second. Move
+    // above_mi_col back to the start of the pair if needed, set above_mbmi
+    // to point at the block with chroma information, and set mi_step to 2 to
+    // step over the entire pair at the end of the iteration.
+    if (mi_step == 1) {
+      above_mi_col &= ~1;
+      above_mi = prev_row_mi + above_mi_col + 1;
+      mi_step = 2;
+    }
+    if (is_neighbor_overlappable(*above_mi)) {
+      ++nb_count;
+      fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0,
+          *above_mi, fun_ctxt, num_planes);
+    }
+  }
+}
+
+// Same as foreach_overlappable_nb_above(), but walks the column directly to
+// the left of the current block from top to bottom (dir == 1). Does nothing
+// when xd->left_available is 0.
+static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
+                                                MACROBLOCKD *xd, int nb_max,
+                                                overlappable_nb_visitor_t fun,
+                                                void *fun_ctxt) {
+  if (!xd->left_available) return;
+
+  const int num_planes = av1_num_planes(cm);
+  int nb_count = 0;
+  // prev_col_mi points into the mi array, starting at the top of the
+  // previous column
+  const int mi_row = xd->mi_row;
+  MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
+  const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows);
+  uint8_t mi_step;
+  for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
+       left_mi_row += mi_step) {
+    MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
+    mi_step =
+        AOMMIN(mi_size_high[left_mi[0]->bsize], mi_size_high[BLOCK_64X64]);
+    // Height-4 neighbors are handled as a pair, mirroring the width-4 case in
+    // foreach_overlappable_nb_above().
+    if (mi_step == 1) {
+      left_mi_row &= ~1;
+      left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
+      mi_step = 2;
+    }
+    if (is_neighbor_overlappable(*left_mi)) {
+      ++nb_count;
+      fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi,
+          fun_ctxt, num_planes);
+    }
+  }
+}
+
+#endif  // AOM_AV1_COMMON_OBMC_H_
diff --git a/third_party/aom/av1/common/obu_util.c b/third_party/aom/av1/common/obu_util.c
new file mode 100644
index 0000000000..cfca03bb4d
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/common/obu_util.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+
+// Decodes the LEB128 size field at `data` (at most bytes_available bytes).
+// On success stores the decoded value in *obu_size and the number of bytes the
+// field occupied in *length_field_size. Returns AOM_CODEC_CORRUPT_FRAME when
+// decoding fails or the value does not fit in 32 bits.
+static aom_codec_err_t read_obu_size(const uint8_t *data,
+                                     size_t bytes_available,
+                                     size_t *const obu_size,
+                                     size_t *const length_field_size) {
+  uint64_t u_obu_size = 0;
+  if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
+      0) {
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
+  *obu_size = (size_t)u_obu_size;
+  return AOM_CODEC_OK;
+}
+
+// Parses OBU header and stores values in 'header'.
+// header->size is set to 1, or 2 when the extension byte is present.
+// Returns AOM_CODEC_INVALID_PARAM for null arguments,
+// AOM_CODEC_CORRUPT_FRAME when the buffer is too short or the forbidden bit is
+// set, and AOM_CODEC_UNSUP_BITSTREAM when a non-Annex-B stream lacks the size
+// field.
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
+                                       int is_annexb, ObuHeader *header) {
+  if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
+
+  const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
+  if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->size = 1;
+
+  if (aom_rb_read_bit(rb) != 0) {
+    // Forbidden bit. Must not be set.
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
+  header->has_extension = aom_rb_read_bit(rb);
+  header->has_size_field = aom_rb_read_bit(rb);
+
+  if (!header->has_size_field && !is_annexb) {
+    // section 5 obu streams must have obu_size field set.
+    return AOM_CODEC_UNSUP_BITSTREAM;
+  }
+
+  // obu_reserved_1bit must be set to 0. The value is ignored by a decoder.
+  aom_rb_read_bit(rb);
+
+  if (header->has_extension) {
+    // The extension needs a second byte.
+    if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
+
+    header->size += 1;
+    header->temporal_layer_id = aom_rb_read_literal(rb, 3);
+    header->spatial_layer_id = aom_rb_read_literal(rb, 2);
+    // extension_header_reserved_3bits must be set to 0. The value is ignored by
+    // a decoder.
+    aom_rb_read_literal(rb, 3);
+  } else {
+    header->temporal_layer_id = 0;
+    header->spatial_layer_id = 0;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+// Parses the OBU header at the start of `buffer`. On success *consumed
+// receives the header size in bytes (1 or 2). Returns AOM_CODEC_INVALID_PARAM
+// when buffer_length is 0 or `consumed` / `header` is null; otherwise the
+// result of read_obu_header().
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb) {
+  if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
+
+  // TODO(tomfinegan): Set the error handler here and throughout this file, and
+  // confirm parsing work done via aom_read_bit_buffer is successful.
+  struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL,
+                                    NULL };
+  aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header);
+  if (parse_result == AOM_CODEC_OK) *consumed = header->size;
+  return parse_result;
+}
+
+// Parses one OBU header together with its size information.
+// In Annex B mode a size field precedes the header and covers header plus
+// payload; otherwise (or additionally, when has_size_field is set) a size
+// field follows the header and gives the payload size directly.
+// On success *payload_size receives the payload size and *bytes_read the
+// number of bytes consumed by the size field(s) and the header.
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+                                             size_t bytes_available,
+                                             int is_annexb,
+                                             ObuHeader *obu_header,
+                                             size_t *const payload_size,
+                                             size_t *const bytes_read) {
+  size_t length_field_size_obu = 0;
+  size_t length_field_size_payload = 0;
+  size_t obu_size = 0;
+  aom_codec_err_t status;
+
+  if (is_annexb) {
+    // Size field comes before the OBU header, and includes the OBU header
+    status =
+        read_obu_size(data, bytes_available, &obu_size, &length_field_size_obu);
+
+    if (status != AOM_CODEC_OK) return status;
+  }
+
+  struct aom_read_bit_buffer rb = { data + length_field_size_obu,
+                                    data + bytes_available, 0, NULL, NULL };
+
+  status = read_obu_header(&rb, is_annexb, obu_header);
+  if (status != AOM_CODEC_OK) return status;
+
+  if (!obu_header->has_size_field) {
+    // read_obu_header() only accepts a missing size field in Annex B mode.
+    assert(is_annexb);
+    // Derive the payload size from the data we've already read
+    if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
+
+    *payload_size = obu_size - obu_header->size;
+  } else {
+    // Size field comes after the OBU header, and is just the payload size
+    status = read_obu_size(
+        data + length_field_size_obu + obu_header->size,
+        bytes_available - length_field_size_obu - obu_header->size,
+        payload_size, &length_field_size_payload);
+    if (status != AOM_CODEC_OK) return status;
+  }
+
+  *bytes_read =
+      length_field_size_obu + obu_header->size + length_field_size_payload;
+  return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/common/obu_util.h b/third_party/aom/av1/common/obu_util.h
new file mode 100644
index 0000000000..adf3568e15
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_OBU_UTIL_H_
+#define AOM_AV1_COMMON_OBU_UTIL_H_
+
+#include "aom/aom_codec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  size_t size;  // Size (1 or 2 bytes) of the OBU header (including the
+                // optional OBU extension header) in the bitstream.
+  OBU_TYPE type;
+  int has_size_field;
+  int has_extension;  // Whether the optional OBU extension header is present.
+  // The following fields come from the OBU extension header. They are set to 0
+  // if has_extension is false.
+  int temporal_layer_id;
+  int spatial_layer_id;
+} ObuHeader;
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb);
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+                                             size_t bytes_available,
+                                             int is_annexb,
+                                             ObuHeader *obu_header,
+                                             size_t *const payload_size,
+                                             size_t *const bytes_read);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_OBU_UTIL_H_
diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c
new file mode 100644
index 0000000000..6f88768f2f
--- /dev/null
+++ b/third_party/aom/av1/common/ppc/cfl_ppc.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +#define OFF_0 0 +#define OFF_1 16 +#define OFF_2 32 +#define OFF_3 48 +#define CFL_BUF_LINE_BYTES 64 +#define CFL_LINE_1 64 +#define CFL_LINE_2 128 +#define CFL_LINE_3 192 + +typedef vector signed char int8x16_t; // NOLINT(runtime/int) +typedef vector unsigned char uint8x16_t; // NOLINT(runtime/int) +typedef vector signed short int16x8_t; // NOLINT(runtime/int) +typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int) +typedef vector signed int int32x4_t; // NOLINT(runtime/int) +typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int) +typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int) + +static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst, + int width, int height, int round_offset, + int num_pel_log2) { + // int16_t *dst = dst_ptr; + const int16_t *dst_end = dst + height * CFL_BUF_LINE; + const int16_t *sum_buf = (const int16_t *)src_ptr; + const int16_t *end = sum_buf + height * CFL_BUF_LINE; + const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2); + const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; + const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, + 0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B }; + + int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset }; + int32x4_t sum_32x4_1 = { 0, 0, 0, 0 }; + do { + sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0); + sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1); + if (width >= 16) { + sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0); + sum_32x4_1 = + vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1); + } + if (width == 32) { + sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0); + sum_32x4_1 = + vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1); + sum_32x4_0 = 
vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0); + sum_32x4_1 = + vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1); + } + } while ((sum_buf += (CFL_BUF_LINE * 2)) < end); + int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1); + + const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64); + sum_32x4 = vec_add(sum_32x4, perm_64); + const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32); + sum_32x4 = vec_add(sum_32x4, perm_32); + const int32x4_t avg = vec_sr(sum_32x4, div_shift); + const int16x8_t vec_avg = vec_pack(avg, avg); + do { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg), + OFF_0 + CFL_BUF_LINE_BYTES, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg), + OFF_0 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg), + OFF_0 + CFL_LINE_3, dst); + if (width >= 16) { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg), + OFF_1 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg), + OFF_1 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg), + OFF_1 + CFL_LINE_3, dst); + } + if (width == 32) { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg), + OFF_2 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg), + OFF_2 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg), + OFF_2 + CFL_LINE_3, dst); + + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg), + OFF_3 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg), + OFF_3 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + 
CFL_LINE_3, dst), vec_avg), + OFF_3 + CFL_LINE_3, dst); + } + } while ((dst += CFL_BUF_LINE * 4) < dst_end); +} + +// Declare wrappers for VSX sizes +CFL_SUB_AVG_X(vsx, 8, 4, 16, 5) +CFL_SUB_AVG_X(vsx, 8, 8, 32, 6) +CFL_SUB_AVG_X(vsx, 8, 16, 64, 7) +CFL_SUB_AVG_X(vsx, 8, 32, 128, 8) +CFL_SUB_AVG_X(vsx, 16, 4, 32, 6) +CFL_SUB_AVG_X(vsx, 16, 8, 64, 7) +CFL_SUB_AVG_X(vsx, 16, 16, 128, 8) +CFL_SUB_AVG_X(vsx, 16, 32, 256, 9) +CFL_SUB_AVG_X(vsx, 32, 8, 128, 8) +CFL_SUB_AVG_X(vsx, 32, 16, 256, 9) +CFL_SUB_AVG_X(vsx, 32, 32, 512, 10) + +// Based on observation, for small blocks VSX does not outperform C (no 64bit +// load and store intrinsics). So we call the C code for block widths 4. +cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) { + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { + cfl_subtract_average_4x4_c, /* 4x4 */ + cfl_subtract_average_8x8_vsx, /* 8x8 */ + cfl_subtract_average_16x16_vsx, /* 16x16 */ + cfl_subtract_average_32x32_vsx, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_c, /* 4x8 */ + cfl_subtract_average_8x4_vsx, /* 8x4 */ + cfl_subtract_average_8x16_vsx, /* 8x16 */ + cfl_subtract_average_16x8_vsx, /* 16x8 */ + cfl_subtract_average_16x32_vsx, /* 16x32 */ + cfl_subtract_average_32x16_vsx, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_c, /* 4x16 */ + cfl_subtract_average_16x4_vsx, /* 16x4 */ + cfl_subtract_average_8x32_vsx, /* 8x32 */ + cfl_subtract_average_32x8_vsx, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to + // index the function pointer array out of bounds. 
+ return sub_avg[tx_size % TX_SIZES_ALL]; +} diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c new file mode 100644 index 0000000000..5952441d1f --- /dev/null +++ b/third_party/aom/av1/common/pred_common.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/seg_common.h" + +// Returns a context number for the given MB prediction signal +static InterpFilter get_ref_filter_type(const MB_MODE_INFO *ref_mbmi, + const MACROBLOCKD *xd, int dir, + MV_REFERENCE_FRAME ref_frame) { + (void)xd; + + return ((ref_mbmi->ref_frame[0] == ref_frame || + ref_mbmi->ref_frame[1] == ref_frame) + ? av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01) + : SWITCHABLE_FILTERS); +} + +int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int ctx_offset = + (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET; + assert(dir == 0 || dir == 1); + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. 
+ int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET; + int left_type = SWITCHABLE_FILTERS; + int above_type = SWITCHABLE_FILTERS; + + if (xd->left_available) + left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame); + + if (xd->up_available) + above_type = + get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame); + + if (left_type == above_type) { + filter_type_ctx += left_type; + } else if (left_type == SWITCHABLE_FILTERS) { + assert(above_type != SWITCHABLE_FILTERS); + filter_type_ctx += above_type; + } else if (above_type == SWITCHABLE_FILTERS) { + assert(left_type != SWITCHABLE_FILTERS); + filter_type_ctx += left_type; + } else { + filter_type_ctx += SWITCHABLE_FILTERS; + } + + return filter_type_ctx; +} + +static void palette_add_to_cache(uint16_t *cache, int *n, uint16_t val) { + // Do not add an already existing value + if (*n > 0 && val == cache[*n - 1]) return; + + cache[(*n)++] = val; +} + +int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, + uint16_t *cache) { + const int row = -xd->mb_to_top_edge >> 3; + // Do not refer to above SB row when on SB boundary. + const MB_MODE_INFO *const above_mi = + (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mbmi : NULL; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int above_n = 0, left_n = 0; + if (above_mi) above_n = above_mi->palette_mode_info.palette_size[plane != 0]; + if (left_mi) left_n = left_mi->palette_mode_info.palette_size[plane != 0]; + if (above_n == 0 && left_n == 0) return 0; + int above_idx = plane * PALETTE_MAX_SIZE; + int left_idx = plane * PALETTE_MAX_SIZE; + int n = 0; + const uint16_t *above_colors = + above_mi ? above_mi->palette_mode_info.palette_colors : NULL; + const uint16_t *left_colors = + left_mi ? left_mi->palette_mode_info.palette_colors : NULL; + // Merge the sorted lists of base colors from above and left to get + // combined sorted color cache. 
+ while (above_n > 0 && left_n > 0) { + uint16_t v_above = above_colors[above_idx]; + uint16_t v_left = left_colors[left_idx]; + if (v_left < v_above) { + palette_add_to_cache(cache, &n, v_left); + ++left_idx, --left_n; + } else { + palette_add_to_cache(cache, &n, v_above); + ++above_idx, --above_n; + if (v_left == v_above) ++left_idx, --left_n; + } + } + while (above_n-- > 0) { + uint16_t val = above_colors[above_idx++]; + palette_add_to_cache(cache, &n, val); + } + while (left_n-- > 0) { + uint16_t val = left_colors[left_idx++]; + palette_add_to_cache(cache, &n, val); + } + assert(n <= 2 * PALETTE_MAX_SIZE); + return n; +} + +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real macroblocks. +// The prediction flags in these dummy entries are initialized to 0. +// 0 - inter/inter, inter/--, --/inter, --/-- +// 1 - intra/inter, inter/intra +// 2 - intra/--, --/intra +// 3 - intra/intra +int av1_get_intra_inter_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; + + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + return left_intra && above_intra ? 3 : left_intra || above_intra; + } else if (has_above || has_left) { // one edge available + return 2 * !is_inter_block(has_above ? 
above_mbmi : left_mbmi); + } else { + return 0; + } +} + +#define CHECK_BACKWARD_REFS(ref_frame) \ + (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME)) +#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame) + +int av1_get_reference_mode_context(const MACROBLOCKD *xd) { + int ctx; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; + + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + if (has_above && has_left) { // both edges available + if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) + // neither edge uses comp pred (0/1) + ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^ + IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]); + else if (!has_second_ref(above_mbmi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) || + !is_inter_block(above_mbmi)); + else if (!has_second_ref(left_mbmi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]) || + !is_inter_block(left_mbmi)); + else // both edges use comp pred (4) + ctx = 4; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? 
above_mbmi : left_mbmi; + + if (!has_second_ref(edge_mbmi)) + // edge does not use comp pred (0/1) + ctx = IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]); + else + // edge uses comp pred (3) + ctx = 3; + } else { // no edges available (1) + ctx = 1; + } + assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS); + return ctx; +} + +int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) { + int pred_context; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int above_in_image = xd->up_available; + const int left_in_image = xd->left_available; + + if (above_in_image && left_in_image) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + + if (above_intra && left_intra) { // intra/intra + pred_context = 2; + } else if (above_intra || left_intra) { // intra/inter + const MB_MODE_INFO *inter_mbmi = above_intra ? left_mbmi : above_mbmi; + + if (!has_second_ref(inter_mbmi)) // single pred + pred_context = 2; + else // comp pred + pred_context = 1 + 2 * has_uni_comp_refs(inter_mbmi); + } else { // inter/inter + const int a_sg = !has_second_ref(above_mbmi); + const int l_sg = !has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME frfa = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0]; + + if (a_sg && l_sg) { // single/single + pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^ + IS_BACKWARD_REF_FRAME(frfl))); + } else if (l_sg || a_sg) { // single/comp + const int uni_rfc = + a_sg ? 
has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi); + + if (!uni_rfc) // comp bidir + pred_context = 1; + else // comp unidir + pred_context = 3 + (!(IS_BACKWARD_REF_FRAME(frfa) ^ + IS_BACKWARD_REF_FRAME(frfl))); + } else { // comp/comp + const int a_uni_rfc = has_uni_comp_refs(above_mbmi); + const int l_uni_rfc = has_uni_comp_refs(left_mbmi); + + if (!a_uni_rfc && !l_uni_rfc) // bidir/bidir + pred_context = 0; + else if (!a_uni_rfc || !l_uni_rfc) // unidir/bidir + pred_context = 2; + else // unidir/unidir + pred_context = + 3 + (!((frfa == BWDREF_FRAME) ^ (frfl == BWDREF_FRAME))); + } + } + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + + if (!is_inter_block(edge_mbmi)) { // intra + pred_context = 2; + } else { // inter + if (!has_second_ref(edge_mbmi)) // single pred + pred_context = 2; + else // comp pred + pred_context = 4 * has_uni_comp_refs(edge_mbmi); + } + } else { // no edges available + pred_context = 2; + } + + assert(pred_context >= 0 && pred_context < COMP_REF_TYPE_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as either +// (BWDREF, ALTREF), or (LAST, LAST2) / (LAST, LAST3) / (LAST, GOLDEN), +// conditioning on the pair is known as uni-directional. +// +// 3 contexts: Voting is used to compare the count of forward references with +// that of backward references from the spatial neighbors. 
+int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of forward references (L, L2, L3, or G) + const int frf_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + // Count of backward references (B or A) + const int brf_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + + ref_counts[ALTREF_FRAME]; + + const int pred_context = + (frf_count == brf_count) ? 1 : ((frf_count < brf_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as +// either (LAST, LAST2), or (LAST, LAST3) / (LAST, GOLDEN), +// conditioning on the pair is known as one of the above three. +// +// 3 contexts: Voting is used to compare the count of LAST2_FRAME with the +// total count of LAST3/GOLDEN from the spatial neighbors. +int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST2 + const int last2_count = ref_counts[LAST2_FRAME]; + // Count of LAST3 or GOLDEN + const int last3_or_gld_count = + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + + const int pred_context = (last2_count == last3_or_gld_count) + ? 1 + : ((last2_count < last3_or_gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as +// either (LAST, LAST3) or (LAST, GOLDEN), +// conditioning on the pair is known as one of the above two. +// +// 3 contexts: Voting is used to compare the count of LAST3_FRAME with the +// total count of GOLDEN_FRAME from the spatial neighbors. 
+int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST3 + const int last3_count = ref_counts[LAST3_FRAME]; + // Count of GOLDEN + const int gld_count = ref_counts[GOLDEN_FRAME]; + + const int pred_context = + (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// == Common context functions for both comp and single ref == +// +// Obtain contexts to signal a reference frame to be either LAST/LAST2 or +// LAST3/GOLDEN. +static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST + LAST2 + const int last_last2_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME]; + // Count of LAST3 + GOLDEN + const int last3_gld_count = + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + + const int pred_context = (last_last2_count == last3_gld_count) + ? 1 + : ((last_last2_count < last3_gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame to be either LAST or LAST2. +static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST + const int last_count = ref_counts[LAST_FRAME]; + // Count of LAST2 + const int last2_count = ref_counts[LAST2_FRAME]; + + const int pred_context = + (last_count == last2_count) ? 1 : ((last_count < last2_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame to be either LAST3 or GOLDEN. 
+static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST3 + const int last3_count = ref_counts[LAST3_FRAME]; + // Count of GOLDEN + const int gld_count = ref_counts[GOLDEN_FRAME]; + + const int pred_context = + (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame be either BWDREF/ALTREF2, or +// ALTREF. +static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Counts of BWDREF, ALTREF2, or ALTREF frames (B, A2, or A) + const int brfarf2_count = + ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME]; + const int arf_count = ref_counts[ALTREF_FRAME]; + + const int pred_context = + (brfarf2_count == arf_count) ? 1 : ((brfarf2_count < arf_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame be either BWDREF or ALTREF2. +static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of BWDREF frames (B) + const int brf_count = ref_counts[BWDREF_FRAME]; + // Count of ALTREF2 frames (A2) + const int arf2_count = ref_counts[ALTREF2_FRAME]; + + const int pred_context = + (brf_count == arf2_count) ? 1 : ((brf_count < arf2_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// == Context functions for comp ref == +// +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be either +// GOLDEN/LAST3, or LAST/LAST2. 
+int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) { + return get_pred_context_ll2_or_l3gld(xd); +} + +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be LAST, +// conditioning on that it is known either LAST/LAST2. +int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) { + return get_pred_context_last_or_last2(xd); +} + +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be GOLDEN, +// conditioning on that it is known either GOLDEN or LAST3. +int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) { + return get_pred_context_last3_or_gld(xd); +} + +// Signal the 2nd reference frame for a compound mode be either +// ALTREF, or ALTREF2/BWDREF. +int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) { + return get_pred_context_brfarf2_or_arf(xd); +} + +// Signal the 2nd reference frame for a compound mode be either +// ALTREF2 or BWDREF. +int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) { + return get_pred_context_brf_or_arf2(xd); +} + +// == Context functions for single ref == +// +// For the bit to signal whether the single reference is a forward reference +// frame or a backward reference frame. +int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of forward reference frames + const int fwd_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + // Count of backward reference frames + const int bwd_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + + ref_counts[ALTREF_FRAME]; + + const int pred_context = + (fwd_count == bwd_count) ? 1 : ((fwd_count < bwd_count) ? 
0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// For the bit to signal whether the single reference is ALTREF_FRAME or +// non-ALTREF backward reference frame, knowing that it shall be either of +// these 2 choices. +int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { + return get_pred_context_brfarf2_or_arf(xd); +} + +// For the bit to signal whether the single reference is LAST3/GOLDEN or +// LAST2/LAST, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) { + return get_pred_context_ll2_or_l3gld(xd); +} + +// For the bit to signal whether the single reference is LAST2_FRAME or +// LAST_FRAME, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) { + return get_pred_context_last_or_last2(xd); +} + +// For the bit to signal whether the single reference is GOLDEN_FRAME or +// LAST3_FRAME, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) { + return get_pred_context_last3_or_gld(xd); +} + +// For the bit to signal whether the single reference is ALTREF2_FRAME or +// BWDREF_FRAME, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) { + return get_pred_context_brf_or_arf2(xd); +} diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h new file mode 100644 index 0000000000..361a4078d4 --- /dev/null +++ b/third_party/aom/av1/common/pred_common.h @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_PRED_COMMON_H_ +#define AOM_AV1_COMMON_PRED_COMMON_H_ + +#include + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/mvref_common.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE uint8_t get_segment_id( + const CommonModeInfoParams *const mi_params, const uint8_t *segment_ids, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh); + const int seg_stride = mi_params->mi_cols; + uint8_t segment_id = MAX_SEGMENTS; + + for (int y = 0; y < ymis; ++y) { + for (int x = 0; x < xmis; ++x) { + segment_id = + AOMMIN(segment_id, segment_ids[mi_offset + y * seg_stride + x]); + } + } + + assert(segment_id < MAX_SEGMENTS); + return segment_id; +} + +static INLINE uint8_t av1_get_spatial_seg_pred(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + int *cdf_index, + int skip_over4x4) { + const int step_size = skip_over4x4 ? 
2 : 1; + uint8_t prev_ul = UINT8_MAX; // top left segment_id + uint8_t prev_l = UINT8_MAX; // left segment_id + uint8_t prev_u = UINT8_MAX; // top segment_id + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const uint8_t *seg_map = cm->cur_frame->seg_map; + if ((xd->up_available) && (xd->left_available)) { + prev_ul = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size, + mi_col - step_size); + } + if (xd->up_available) { + prev_u = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size, + mi_col - 0); + } + if (xd->left_available) { + prev_l = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0, + mi_col - step_size); + } + assert(IMPLIES(prev_ul != UINT8_MAX, + prev_u != UINT8_MAX && prev_l != UINT8_MAX)); + + // Pick CDF index based on number of matching/out-of-bounds segment IDs. + if (prev_ul == UINT8_MAX) /* Edge cases */ + *cdf_index = 0; + else if ((prev_ul == prev_u) && (prev_ul == prev_l)) + *cdf_index = 2; + else if ((prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l)) + *cdf_index = 1; + else + *cdf_index = 0; + + // If 2 or more are identical returns that as predictor, otherwise prev_l. + if (prev_u == UINT8_MAX) // edge case + return prev_l == UINT8_MAX ? 0 : prev_l; + if (prev_l == UINT8_MAX) // edge case + return prev_u; + return (prev_ul == prev_u) ? prev_u : prev_l; +} + +static INLINE uint8_t av1_get_pred_context_seg_id(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0; + const int left_sip = (left_mi != NULL) ? 
left_mi->seg_id_predicted : 0; + + return above_sip + left_sip; +} + +static INLINE int get_comp_index_context(const AV1_COMMON *cm, + const MACROBLOCKD *xd) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); + int bck_frame_index = 0, fwd_frame_index = 0; + int cur_frame_index = cm->cur_frame->order_hint; + + if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; + if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; + + int fwd = abs(get_relative_dist(&cm->seq_params->order_hint_info, + fwd_frame_index, cur_frame_index)); + int bck = abs(get_relative_dist(&cm->seq_params->order_hint_info, + cur_frame_index, bck_frame_index)); + + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + + int above_ctx = 0, left_ctx = 0; + const int offset = (fwd == bck); + + if (above_mi != NULL) { + if (has_second_ref(above_mi)) + above_ctx = above_mi->compound_idx; + else if (above_mi->ref_frame[0] == ALTREF_FRAME) + above_ctx = 1; + } + + if (left_mi != NULL) { + if (has_second_ref(left_mi)) + left_ctx = left_mi->compound_idx; + else if (left_mi->ref_frame[0] == ALTREF_FRAME) + left_ctx = 1; + } + + return above_ctx + left_ctx + 3 * offset; +} + +static INLINE int get_comp_group_idx_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int above_ctx = 0, left_ctx = 0; + + if (above_mi) { + if (has_second_ref(above_mi)) + above_ctx = above_mi->comp_group_idx; + else if (above_mi->ref_frame[0] == ALTREF_FRAME) + above_ctx = 3; + } + if (left_mi) { + if (has_second_ref(left_mi)) + left_ctx = left_mi->comp_group_idx; + else if (left_mi->ref_frame[0] == ALTREF_FRAME) + left_ctx = 3; + } + + return AOMMIN(5, above_ctx + left_ctx); +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_seg_id( 
+ struct segmentation_probs *segp, const MACROBLOCKD *xd) { + return segp->pred_cdf[av1_get_pred_context_seg_id(xd)]; +} + +static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + const int above_skip_mode = above_mi ? above_mi->skip_mode : 0; + const int left_skip_mode = left_mi ? left_mi->skip_mode : 0; + return above_skip_mode + left_skip_mode; +} + +static INLINE int av1_get_skip_txfm_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + const int above_skip_txfm = above_mi ? above_mi->skip_txfm : 0; + const int left_skip_txfm = left_mi ? left_mi->skip_txfm : 0; + return above_skip_txfm + left_skip_txfm; +} + +int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir); + +// Get a list of palette base colors that are used in the above and left blocks, +// referred to as "color cache". The return value is the number of colors in the +// cache (<= 2 * PALETTE_MAX_SIZE). The color values are stored in "cache" +// in ascending order. 
+int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, + uint16_t *cache); + +static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8]; +} + +static INLINE int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int ctx = 0; + if (above_mi) ctx += (above_mi->palette_mode_info.palette_size[0] > 0); + if (left_mi) ctx += (left_mi->palette_mode_info.palette_size[0] > 0); + return ctx; +} + +int av1_get_intra_inter_context(const MACROBLOCKD *xd); + +int av1_get_reference_mode_context(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) { + return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)]; +} + +static INLINE aom_cdf_prob *av1_get_skip_txfm_cdf(const MACROBLOCKD *xd) { + return xd->tile_ctx->skip_txfm_cdfs[av1_get_skip_txfm_context(xd)]; +} + +int av1_get_comp_reference_type_context(const MACROBLOCKD *xd); + +// == Uni-directional contexts == + +int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd); + +int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_comp_reference_type_cdf( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_comp_reference_type_context(xd); + return xd->tile_ctx->comp_ref_type_cdf[pred_context]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd); + return xd->tile_ctx->uni_comp_ref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p1( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd); + return 
xd->tile_ctx->uni_comp_ref_cdf[pred_context][1]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd); + return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2]; +} + +// == Bi-directional contexts == + +int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p1(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][1]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p2(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][2]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_bwdref_p(xd); + return xd->tile_ctx->comp_bwdref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_bwdref_p1(xd); + return xd->tile_ctx->comp_bwdref_cdf[pred_context][1]; +} + +// == Single contexts == + +int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); + +int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); + +int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd); + +int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd); + +int 
av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd); + +int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p1( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0]; +} +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p2( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1]; +} +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p3( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2]; +} +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p4( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3]; +} +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p5( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4]; +} +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p6( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5]; +} + +// Returns a context number for the given MB prediction signal +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real blocks. +// The prediction flags in these dummy entries are initialized to 0. 
+static INLINE int get_tx_size_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *mbmi = xd->mi[0]; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->bsize]; + const int max_tx_wide = tx_size_wide[max_tx_size]; + const int max_tx_high = tx_size_high[max_tx_size]; + const int has_above = xd->up_available; + const int has_left = xd->left_available; + + int above = xd->above_txfm_context[0] >= max_tx_wide; + int left = xd->left_txfm_context[0] >= max_tx_high; + + if (has_above) + if (is_inter_block(above_mbmi)) + above = block_size_wide[above_mbmi->bsize] >= max_tx_wide; + + if (has_left) + if (is_inter_block(left_mbmi)) + left = block_size_high[left_mbmi->bsize] >= max_tx_high; + + if (has_above && has_left) + return (above + left); + else if (has_above) + return above; + else if (has_left) + return left; + else + return 0; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_PRED_COMMON_H_ diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c new file mode 100644 index 0000000000..b0976287ef --- /dev/null +++ b/third_party/aom/av1/common/quant_common.c @@ -0,0 +1,12876 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/quant_common.h" +#include "av1/common/seg_common.h" + +static const int16_t dc_qlookup_QTX[QINDEX_RANGE] = { + 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, + 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, + 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, + 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, + 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76, + 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88, + 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134, + 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164, + 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, + 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300, + 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364, + 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441, + 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, + 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, + 1184, 1232, 1282, 1336, +}; + +static const int16_t dc_qlookup_10_QTX[QINDEX_RANGE] = { + 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, + 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, + 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, + 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182, + 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230, + 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276, + 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321, + 324, 
327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387, + 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466, + 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, + 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, + 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831, + 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001, + 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202, + 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436, + 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, + 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088, + 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675, + 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823, + 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, +}; + +static const int16_t dc_qlookup_12_QTX[QINDEX_RANGE] = { + 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, + 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, + 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, + 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580, + 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752, + 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919, + 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080, + 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, + 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419, + 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692, + 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957, + 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334, + 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746, + 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226, + 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788, + 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 
4420, + 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153, + 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984, + 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966, + 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214, + 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031, + 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118, + 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949, + 19718, 20521, 21387, +}; + +static const int16_t ac_qlookup_QTX[QINDEX_RANGE] = { + 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, + 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, + 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, + 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, + 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, + 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353, + 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448, + 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571, + 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933, + 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196, + 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537, + 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, +}; + +static const int16_t ac_qlookup_10_QTX[QINDEX_RANGE] = { + 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, + 44, 48, 51, 55, 59, 63, 67, 71, 75, 
79, 83, 88, 92, + 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, + 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208, + 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267, + 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324, + 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379, + 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466, + 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571, + 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, + 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, + 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118, + 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411, + 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791, + 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283, + 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, + 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731, + 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784, + 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148, + 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, +}; + +static const int16_t ac_qlookup_12_QTX[QINDEX_RANGE] = { + 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, + 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, + 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, + 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660, + 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865, + 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067, + 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264, + 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, + 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693, + 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052, + 2085, 2118, 2150, 
2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411, + 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943, + 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555, + 3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310, + 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256, + 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, + 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867, + 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660, + 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885, + 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637, + 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062, + 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334, + 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599, + 28143, 28687, 29247, +}; + +// Coefficient scaling and quantization with AV1 TX are tailored to +// the AV1 TX transforms. Regardless of the bit-depth of the input, +// the transform stages scale the coefficient values up by a factor of +// 8 (3 bits) over the scale of the pixel values. Thus, for 8-bit +// input, the coefficients have effectively 11 bits of scale depth +// (8+3), 10-bit input pixels result in 13-bit coefficient depth +// (10+3) and 12-bit pixels yield 15-bit (12+3) coefficient depth. +// All quantizers are built using this invariant of x8, 3-bit scaling, +// thus the Q3 suffix. + +// A partial exception to this rule is large transforms; to avoid +// overflow, TX blocks with > 256 pels (>16x16) are scaled only +// 4-times unity (2 bits) over the pixel depth, and TX blocks with +// over 1024 pixels (>32x32) are scaled up only 2x unity (1 bit). +// This descaling is found via av1_tx_get_scale(). Thus, 16x32, 32x16 +// and 32x32 transforms actually return Q2 coefficients, and 32x64, +// 64x32 and 64x64 transforms return Q1 coefficients. 
However, the +// quantizers are de-scaled down on-the-fly by the same amount +// (av1_tx_get_scale()) during quantization, and as such the +// dequantized/decoded coefficients, even for large TX blocks, are always +// effectively Q3. Meanwhile, quantized/coded coefficients are Q0 +// because Qn quantizers are applied to Qn tx coefficients. + +// Note that encoder decision making (which uses the quantizer to +// generate several bespoke lamdas for RDO and other heuristics) +// expects quantizers to be larger for higher-bitdepth input. In +// addition, the minimum allowable quantizer is 4; smaller values will +// underflow to 0 in the actual quantization routines. + +int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { + const int q_clamped = clamp(qindex + delta, 0, MAXQ); + switch (bit_depth) { + case AOM_BITS_8: return dc_qlookup_QTX[q_clamped]; + case AOM_BITS_10: return dc_qlookup_10_QTX[q_clamped]; + case AOM_BITS_12: return dc_qlookup_12_QTX[q_clamped]; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +} + +int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { + const int q_clamped = clamp(qindex + delta, 0, MAXQ); + switch (bit_depth) { + case AOM_BITS_8: return ac_qlookup_QTX[q_clamped]; + case AOM_BITS_10: return ac_qlookup_10_QTX[q_clamped]; + case AOM_BITS_12: return ac_qlookup_12_QTX[q_clamped]; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +} + +int av1_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex) { + if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { + const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); + const int seg_qindex = base_qindex + data; + return clamp(seg_qindex, 0, MAXQ); + } else { + return base_qindex; + } +} + +bool av1_use_qmatrix(const CommonQuantParams *quant_params, + const struct macroblockd *xd, int segment_id) { + // True 
if explicit Q matrix levels and this is not a lossless segment. + return quant_params->using_qmatrix && !xd->lossless[segment_id]; +} + +const qm_val_t *av1_iqmatrix(const CommonQuantParams *quant_params, int qmlevel, + int plane, TX_SIZE tx_size) { + assert(quant_params->giqmatrix[qmlevel][plane][tx_size] != NULL || + qmlevel == NUM_QM_LEVELS - 1); + return quant_params->giqmatrix[qmlevel][plane][tx_size]; +} +const qm_val_t *av1_qmatrix(const CommonQuantParams *quant_params, int qmlevel, + int plane, TX_SIZE tx_size) { + assert(quant_params->gqmatrix[qmlevel][plane][tx_size] != NULL || + qmlevel == NUM_QM_LEVELS - 1); + return quant_params->gqmatrix[qmlevel][plane][tx_size]; +} + +// Returns true if the tx_type corresponds to non-identity transform in both +// horizontal and vertical directions. +static INLINE bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); } + +const qm_val_t *av1_get_iqmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int seg_id = mbmi->segment_id; + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); + // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms + return is_2d_transform(tx_type) + ? pd->seg_iqmatrix[seg_id][qm_tx_size] + : quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; +} + +const qm_val_t *av1_get_qmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int seg_id = mbmi->segment_id; + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); + // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms + return is_2d_transform(tx_type) + ? 
pd->seg_qmatrix[seg_id][qm_tx_size] + : quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; +} + +#define QM_TOTAL_SIZE 3344 +// We only use wt_matrix_ref[q] and iwt_matrix_ref[q] +// for q = 0, ..., NUM_QM_LEVELS - 2. +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; + +void av1_qm_init(CommonQuantParams *quant_params, int num_planes) { + for (int q = 0; q < NUM_QM_LEVELS; ++q) { + for (int c = 0; c < num_planes; ++c) { + int current = 0; + for (int t = 0; t < TX_SIZES_ALL; ++t) { + const int size = tx_size_2d[t]; + const int qm_tx_size = av1_get_adjusted_tx_size(t); + if (q == NUM_QM_LEVELS - 1) { + quant_params->gqmatrix[q][c][t] = NULL; + quant_params->giqmatrix[q][c][t] = NULL; + } else if (t != qm_tx_size) { // Reuse matrices for 'qm_tx_size' + assert(t > qm_tx_size); + quant_params->gqmatrix[q][c][t] = + quant_params->gqmatrix[q][c][qm_tx_size]; + quant_params->giqmatrix[q][c][t] = + quant_params->giqmatrix[q][c][qm_tx_size]; + } else { + assert(current + size <= QM_TOTAL_SIZE); + quant_params->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current]; + quant_params->giqmatrix[q][c][t] = + &iwt_matrix_ref[q][c >= 1][current]; + current += size; + } + } + } + } +} + +/* Provide 15 sets of quantization matrices for chroma and luma + and each TX size. Matrices for different TX sizes are in fact + sub-sampled from the 32x32 and 16x16 sizes, but explicitly + defined here for convenience. Intra and inter matrix sets are the + same but changing DEFAULT_QM_INTER_OFFSET from zero allows + for different matrices for inter and intra blocks in the same + frame. + Matrices for different QM levels have been rescaled in the + frequency domain according to different nominal viewing + distances. Matrices for QM level 15 are omitted because they are + not used. 
+ */ +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { + { + { /* Luma */ + /* Size 4x4 */ + 32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150, 200, + /* Size 8x8 */ + 32, 32, 38, 51, 68, 84, 95, 109, 32, 35, 40, 49, 63, 76, 89, 102, 38, + 40, 54, 65, 78, 91, 98, 106, 51, 49, 65, 82, 97, 111, 113, 121, 68, 63, + 78, 97, 117, 134, 138, 142, 84, 76, 91, 111, 134, 152, 159, 168, 95, 89, + 98, 113, 138, 159, 183, 199, 109, 102, 106, 121, 142, 168, 199, 220, + /* Size 16x16 */ + 32, 31, 31, 34, 36, 44, 48, 59, 65, 80, 83, 91, 97, 104, 111, 119, 31, + 32, 32, 33, 34, 41, 44, 54, 59, 72, 75, 83, 90, 97, 104, 112, 31, 32, + 33, 35, 36, 42, 45, 54, 59, 71, 74, 81, 86, 93, 100, 107, 34, 33, 35, + 39, 42, 47, 51, 58, 63, 74, 76, 81, 84, 90, 97, 105, 36, 34, 36, 42, 48, + 54, 57, 64, 68, 79, 81, 88, 91, 96, 102, 105, 44, 41, 42, 47, 54, 63, + 67, 75, 79, 90, 92, 95, 100, 102, 109, 112, 48, 44, 45, 51, 57, 67, 71, + 80, 85, 96, 99, 107, 108, 111, 117, 120, 59, 54, 54, 58, 64, 75, 80, 92, + 98, 110, 113, 115, 116, 122, 125, 130, 65, 59, 59, 63, 68, 79, 85, 98, + 105, 118, 121, 127, 130, 134, 135, 140, 80, 72, 71, 74, 79, 90, 96, 110, + 118, 134, 137, 140, 143, 144, 146, 152, 83, 75, 74, 76, 81, 92, 99, 113, + 121, 137, 140, 151, 152, 155, 158, 165, 91, 83, 81, 81, 88, 95, 107, + 115, 127, 140, 151, 159, 166, 169, 173, 179, 97, 90, 86, 84, 91, 100, + 108, 116, 130, 143, 152, 166, 174, 182, 189, 193, 104, 97, 93, 90, 96, + 102, 111, 122, 134, 144, 155, 169, 182, 191, 200, 210, 111, 104, 100, + 97, 102, 109, 117, 125, 135, 146, 158, 173, 189, 200, 210, 220, 119, + 112, 107, 105, 105, 112, 120, 130, 140, 152, 165, 179, 193, 210, 220, + 231, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 34, 35, 36, 39, 44, 46, 48, 54, 59, 62, 65, 71, + 80, 81, 83, 88, 91, 94, 97, 101, 104, 107, 111, 115, 119, 123, 31, 32, + 32, 32, 32, 32, 34, 34, 35, 38, 42, 44, 46, 51, 56, 59, 62, 68, 76, 77, + 78, 84, 86, 89, 92, 95, 99, 102, 105, 109, 113, 116, 31, 32, 
32, 32, 32, + 32, 33, 34, 34, 37, 41, 42, 44, 49, 54, 56, 59, 65, 72, 73, 75, 80, 83, + 86, 90, 93, 97, 101, 104, 108, 112, 116, 31, 32, 32, 32, 33, 33, 34, 35, + 35, 38, 41, 43, 45, 49, 54, 56, 59, 64, 72, 73, 74, 79, 82, 85, 88, 91, + 94, 97, 101, 104, 107, 111, 31, 32, 32, 33, 33, 34, 35, 36, 36, 39, 42, + 44, 45, 50, 54, 56, 59, 64, 71, 72, 74, 78, 81, 84, 86, 89, 93, 96, 100, + 104, 107, 111, 32, 32, 32, 33, 34, 35, 37, 37, 38, 40, 42, 44, 46, 49, + 53, 55, 58, 63, 69, 70, 72, 76, 79, 82, 85, 89, 93, 96, 99, 102, 106, + 109, 34, 34, 33, 34, 35, 37, 39, 41, 42, 45, 47, 49, 51, 54, 58, 60, 63, + 68, 74, 75, 76, 80, 81, 82, 84, 87, 90, 93, 97, 101, 105, 110, 35, 34, + 34, 35, 36, 37, 41, 43, 45, 47, 50, 52, 53, 57, 61, 63, 65, 70, 76, 77, + 79, 82, 84, 86, 89, 91, 92, 93, 96, 100, 103, 107, 36, 35, 34, 35, 36, + 38, 42, 45, 48, 50, 54, 55, 57, 60, 64, 66, 68, 73, 79, 80, 81, 85, 88, + 90, 91, 93, 96, 99, 102, 103, 105, 107, 39, 38, 37, 38, 39, 40, 45, 47, + 50, 54, 58, 59, 61, 65, 69, 71, 73, 78, 84, 85, 86, 91, 92, 92, 95, 98, + 100, 101, 103, 106, 110, 114, 44, 42, 41, 41, 42, 42, 47, 50, 54, 58, + 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 95, 95, 97, 100, 101, 102, + 105, 109, 111, 112, 114, 46, 44, 42, 43, 44, 44, 49, 52, 55, 59, 65, 67, + 69, 74, 78, 80, 82, 87, 93, 94, 95, 98, 100, 103, 102, 105, 108, 110, + 111, 113, 117, 121, 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71, + 76, 80, 83, 85, 90, 96, 97, 99, 103, 107, 105, 108, 111, 111, 113, 117, + 119, 120, 122, 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 82, + 87, 89, 92, 97, 104, 105, 106, 111, 110, 111, 114, 113, 116, 120, 120, + 121, 125, 130, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, + 92, 95, 98, 103, 110, 111, 113, 115, 115, 119, 116, 120, 122, 122, 125, + 129, 130, 130, 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89, + 95, 98, 101, 107, 114, 115, 117, 119, 123, 121, 125, 126, 125, 129, 131, + 131, 135, 140, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 
+ 98, 101, 105, 111, 118, 119, 121, 126, 127, 128, 130, 130, 134, 133, + 135, 140, 140, 140, 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, + 97, 103, 107, 111, 117, 125, 126, 128, 134, 132, 136, 133, 138, 137, + 140, 143, 142, 145, 150, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93, + 96, 104, 110, 114, 118, 125, 134, 135, 137, 139, 140, 139, 143, 142, + 144, 146, 146, 151, 152, 151, 81, 77, 73, 73, 72, 70, 75, 77, 80, 85, + 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, 138, 144, 147, 146, 148, + 149, 151, 150, 156, 155, 157, 163, 83, 78, 75, 74, 74, 72, 76, 79, 81, + 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140, 147, 151, 156, + 152, 157, 155, 161, 158, 162, 165, 164, 88, 84, 80, 79, 78, 76, 80, 82, + 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, 154, + 158, 163, 159, 165, 163, 168, 168, 169, 176, 91, 86, 83, 82, 81, 79, 81, + 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154, + 159, 161, 166, 171, 169, 173, 173, 176, 179, 177, 94, 89, 86, 85, 84, + 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146, + 156, 158, 161, 166, 168, 174, 179, 178, 180, 183, 183, 190, 97, 92, 90, + 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, + 148, 152, 163, 166, 168, 174, 176, 182, 187, 189, 188, 193, 191, 101, + 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130, + 138, 142, 149, 157, 159, 171, 174, 176, 183, 184, 191, 195, 199, 197, + 204, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, 116, 122, + 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, 193, 200, + 204, 210, 206, 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, + 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, + 193, 200, 202, 210, 214, 222, 111, 105, 104, 101, 100, 99, 97, 96, 102, + 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173, + 180, 189, 195, 200, 202, 210, 212, 220, 224, 115, 109, 108, 104, 104, + 102, 101, 100, 103, 106, 111, 
113, 119, 121, 129, 131, 140, 142, 151, + 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220, 222, 230, 119, + 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130, + 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220, + 222, 231, 232, 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114, + 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191, + 204, 206, 222, 224, 230, 232, 242, + /* Size 4x8 */ + 32, 33, 37, 49, 65, 80, 91, 104, 42, 42, 58, 71, 84, 97, 100, 112, 75, + 69, 84, 103, 125, 142, 145, 146, 91, 86, 91, 110, 128, 152, 178, 190, + /* Size 8x4 */ + 32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84, 91, 49, 71, 103, 110, 65, + 84, 125, 128, 80, 97, 142, 152, 91, 100, 145, 178, 104, 112, 146, 190, + /* Size 8x16 */ + 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118, 32, + 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107, 36, 34, + 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105, 53, 49, 50, + 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118, 65, 59, 59, + 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131, 138, 136, 87, 78, 77, + 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157, 93, 86, + 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182, 99, + 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203, + /* Size 16x8 */ + 32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59, 78, 86, 93, 32, 34, + 36, 50, 59, 77, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, 36, 38, 48, 60, + 68, 84, 86, 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76, 85, 102, + 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, 58, 68, 92, 105, 124, + 122, 124, 79, 70, 79, 104, 118, 141, 135, 135, 82, 72, 81, 106, 121, + 144, 149, 146, 91, 80, 88, 106, 130, 148, 162, 159, 97, 86, 94, 107, + 128, 157, 167, 171, 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101, + 117, 138, 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203, + /* Size 16x32 */ + 32, 31, 31, 31, 32, 
32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, + 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 31, 32, + 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74, + 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, 32, 32, 33, 33, 34, + 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, + 83, 86, 89, 93, 96, 100, 104, 107, 111, 34, 34, 33, 34, 35, 37, 39, 41, + 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87, + 90, 93, 97, 100, 103, 107, 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, + 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, + 101, 103, 105, 107, 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67, + 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108, + 112, 115, 119, 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, + 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, + 117, 118, 119, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, + 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123, + 127, 131, 136, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, + 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, + 138, 137, 136, 136, 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96, + 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144, + 142, 141, 146, 151, 156, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, + 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, + 150, 155, 161, 159, 157, 156, 90, 85, 82, 81, 80, 78, 78, 83, 87, 89, + 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160, + 161, 164, 170, 168, 165, 167, 172, 178, 93, 88, 86, 84, 82, 82, 80, 84, + 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, + 165, 167, 173, 174, 177, 183, 185, 182, 179, 96, 91, 90, 87, 86, 86, 83, + 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, + 160, 167, 173, 178, 180, 187, 188, 190, 
197, 203, 99, 94, 93, 90, 89, + 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, + 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, 102, 97, 97, 93, + 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142, + 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217, + /* Size 32x16 */ + 32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, 99, 102, 31, 32, + 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, 88, 91, 94, 97, 31, 32, 33, 33, + 34, 41, 49, 54, 59, 72, 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41, + 49, 54, 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42, 50, 54, + 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37, 38, 42, 49, 53, 58, 69, + 75, 78, 82, 86, 89, 92, 34, 34, 37, 39, 42, 48, 54, 58, 63, 73, 79, 78, + 80, 83, 88, 92, 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84, + 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87, 86, 89, 90, 90, + 39, 37, 40, 45, 50, 58, 65, 69, 73, 84, 89, 89, 91, 91, 93, 96, 44, 41, + 43, 48, 53, 63, 71, 75, 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, + 55, 65, 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51, 56, + 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49, 50, 54, 60, + 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111, 58, 54, 54, 58, 63, + 75, 87, 92, 98, 110, 116, 115, 112, 111, 115, 112, 61, 57, 56, 60, 66, + 77, 89, 95, 101, 114, 120, 118, 119, 118, 116, 120, 65, 60, 58, 63, 68, + 79, 92, 98, 105, 118, 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73, + 84, 97, 103, 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79, + 90, 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75, + 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75, 72, + 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143, 88, 80, + 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152, 152, 154, 91, + 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155, 162, 160, 159, 155, + 94, 85, 83, 82, 91, 100, 105, 118, 131, 137, 
153, 160, 165, 167, 166, + 168, 97, 88, 86, 85, 94, 100, 107, 123, 128, 140, 157, 161, 167, 173, + 171, 169, 100, 91, 89, 87, 97, 100, 111, 121, 127, 145, 152, 164, 173, + 178, 182, 181, 103, 94, 93, 90, 98, 101, 114, 120, 131, 144, 150, 170, + 174, 180, 186, 183, 107, 97, 96, 93, 100, 104, 117, 119, 136, 142, 155, + 168, 177, 187, 191, 198, 110, 101, 100, 97, 101, 108, 117, 123, 138, + 141, 161, 165, 183, 188, 193, 200, 114, 104, 104, 100, 103, 112, 117, + 127, 137, 146, 159, 167, 185, 190, 201, 206, 118, 108, 107, 103, 105, + 115, 118, 131, 136, 151, 157, 172, 182, 197, 203, 208, 122, 111, 111, + 107, 107, 119, 119, 136, 136, 156, 156, 178, 179, 203, 204, 217, + /* Size 4x16 */ + 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108, 44, + 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 115, 79, 72, + 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151, 96, 90, + 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197, + /* Size 16x4 */ + 31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, 48, 73, 83, 34, 54, + 78, 89, 41, 63, 90, 95, 45, 67, 96, 102, 54, 75, 110, 111, 60, 79, 118, + 123, 72, 90, 133, 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100, + 140, 173, 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197, + /* Size 8x32 */ + 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, + 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 32, 32, + 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, + 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 36, 35, 34, 35, 36, + 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, + 91, 94, 97, 98, 100, 101, 103, 105, 107, 53, 51, 49, 49, 50, 49, 54, 57, + 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, + 107, 111, 114, 117, 117, 117, 118, 119, 65, 62, 59, 59, 59, 58, 63, 65, + 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, + 128, 127, 131, 136, 138, 
137, 136, 136, 87, 82, 78, 78, 77, 75, 79, 82, + 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, + 153, 157, 152, 150, 155, 161, 159, 157, 156, 93, 88, 86, 84, 82, 82, 80, + 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, + 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 99, 94, 93, 90, 89, + 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, + 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, + /* Size 32x8 */ + 32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, 88, 94, 31, 33, + 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, 59, 78, 84, 90, 32, 34, 36, 50, + 59, 77, 82, 89, 32, 35, 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79, + 80, 88, 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84, 86, 90, + 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, 79, 95, 94, 97, 46, 44, + 55, 73, 82, 98, 98, 99, 48, 46, 56, 76, 85, 102, 105, 105, 53, 50, 60, + 82, 92, 109, 107, 107, 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66, + 89, 101, 120, 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73, + 97, 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71, 80, + 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146, 88, 77, + 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148, 162, 159, 94, + 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107, 128, 157, 167, 171, + 100, 89, 97, 111, 127, 152, 173, 182, 103, 93, 98, 114, 131, 150, 174, + 186, 107, 96, 100, 117, 136, 155, 177, 191, 110, 100, 101, 117, 138, + 161, 183, 193, 114, 104, 103, 117, 137, 159, 185, 201, 118, 107, 105, + 118, 136, 157, 182, 203, 122, 111, 107, 119, 136, 156, 179, 204 }, + { /* Chroma */ + /* Size 4x4 */ + 35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109, + /* Size 8x8 */ + 31, 38, 47, 50, 57, 63, 67, 71, 38, 47, 46, 47, 52, 57, 62, 67, 47, 46, + 54, 57, 61, 66, 67, 68, 50, 47, 57, 66, 72, 77, 75, 75, 57, 52, 61, 72, + 82, 88, 86, 84, 63, 57, 66, 77, 88, 96, 95, 95, 67, 62, 67, 75, 86, 95, + 
104, 107, 71, 67, 68, 75, 84, 95, 107, 113, + /* Size 16x16 */ + 32, 30, 33, 41, 49, 49, 50, 54, 57, 63, 65, 68, 70, 72, 74, 76, 30, 32, + 35, 42, 46, 45, 46, 49, 52, 57, 58, 62, 64, 67, 70, 72, 33, 35, 39, 45, + 47, 45, 46, 49, 51, 56, 57, 60, 62, 64, 66, 69, 41, 42, 45, 48, 50, 49, + 50, 52, 53, 57, 58, 59, 60, 61, 64, 67, 49, 46, 47, 50, 53, 53, 54, 55, + 56, 60, 61, 64, 64, 65, 66, 66, 49, 45, 45, 49, 53, 58, 60, 62, 63, 67, + 68, 67, 69, 68, 70, 70, 50, 46, 46, 50, 54, 60, 61, 65, 67, 71, 71, 74, + 73, 73, 74, 74, 54, 49, 49, 52, 55, 62, 65, 71, 73, 78, 79, 78, 77, 78, + 78, 78, 57, 52, 51, 53, 56, 63, 67, 73, 76, 82, 83, 84, 84, 84, 82, 83, + 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 90, 89, 88, 87, 88, 65, 58, + 57, 58, 61, 68, 71, 79, 83, 90, 91, 94, 93, 93, 92, 93, 68, 62, 60, 59, + 64, 67, 74, 78, 84, 90, 94, 98, 99, 98, 98, 98, 70, 64, 62, 60, 64, 69, + 73, 77, 84, 89, 93, 99, 102, 103, 104, 104, 72, 67, 64, 61, 65, 68, 73, + 78, 84, 88, 93, 98, 103, 106, 108, 109, 74, 70, 66, 64, 66, 70, 74, 78, + 82, 87, 92, 98, 104, 108, 111, 112, 76, 72, 69, 67, 66, 70, 74, 78, 83, + 88, 93, 98, 104, 109, 112, 116, + /* Size 32x32 */ + 32, 31, 30, 32, 33, 36, 41, 45, 49, 48, 49, 50, 50, 52, 54, 56, 57, 60, + 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 31, 31, 31, 33, + 34, 38, 42, 45, 47, 47, 47, 47, 48, 50, 52, 53, 54, 57, 60, 61, 61, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 30, 31, 32, 33, 35, 40, 42, 44, + 46, 45, 45, 45, 46, 47, 49, 51, 52, 54, 57, 58, 58, 61, 62, 63, 64, 66, + 67, 68, 70, 71, 72, 74, 32, 33, 33, 35, 37, 41, 43, 45, 47, 46, 45, 46, + 46, 47, 49, 50, 51, 54, 57, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, + 69, 70, 33, 34, 35, 37, 39, 43, 45, 46, 47, 46, 45, 46, 46, 47, 49, 50, + 51, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 36, 38, + 40, 41, 43, 47, 47, 47, 48, 46, 45, 46, 46, 47, 48, 49, 50, 52, 54, 55, + 55, 57, 58, 59, 61, 62, 64, 65, 66, 67, 68, 69, 41, 42, 42, 43, 45, 47, + 48, 49, 50, 49, 49, 49, 50, 50, 
52, 52, 53, 55, 57, 58, 58, 60, 59, 59, + 60, 61, 61, 63, 64, 66, 67, 69, 45, 45, 44, 45, 46, 47, 49, 50, 51, 51, + 51, 51, 52, 52, 53, 54, 55, 57, 59, 59, 60, 61, 61, 62, 63, 63, 63, 63, + 63, 64, 65, 66, 49, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 54, 54, 54, + 55, 56, 56, 58, 60, 61, 61, 63, 64, 64, 64, 64, 65, 66, 66, 66, 66, 66, + 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, + 63, 64, 64, 66, 66, 65, 66, 67, 67, 67, 67, 68, 69, 70, 49, 47, 45, 45, + 45, 45, 49, 51, 53, 55, 58, 59, 60, 61, 62, 63, 63, 65, 67, 67, 68, 69, + 67, 68, 69, 68, 68, 69, 70, 70, 70, 70, 50, 47, 45, 46, 46, 46, 49, 51, + 54, 56, 59, 60, 60, 62, 64, 64, 65, 67, 69, 69, 70, 70, 71, 71, 70, 70, + 71, 71, 71, 71, 72, 74, 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60, + 61, 63, 65, 66, 67, 68, 71, 71, 71, 73, 74, 72, 73, 74, 73, 73, 74, 74, + 74, 74, 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66, 68, 69, + 70, 72, 75, 75, 75, 77, 75, 75, 76, 75, 75, 76, 75, 75, 76, 77, 54, 52, + 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, + 79, 79, 78, 79, 77, 78, 78, 77, 78, 79, 78, 78, 56, 53, 51, 50, 50, 49, + 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 75, 77, 80, 80, 81, 81, 82, 80, + 81, 81, 79, 81, 80, 79, 81, 82, 57, 54, 52, 51, 51, 50, 53, 55, 56, 60, + 63, 65, 67, 70, 73, 75, 76, 79, 82, 82, 83, 85, 84, 83, 84, 83, 84, 82, + 82, 84, 83, 82, 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, + 75, 77, 79, 82, 85, 85, 86, 88, 86, 87, 85, 86, 85, 85, 86, 84, 85, 86, + 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85, + 89, 89, 90, 90, 90, 89, 89, 88, 88, 88, 87, 88, 88, 87, 64, 61, 58, 57, + 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, 91, 92, + 93, 92, 92, 91, 91, 90, 91, 90, 90, 92, 65, 61, 58, 58, 57, 55, 58, 60, + 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, 94, 94, 96, 93, 94, + 93, 94, 92, 93, 93, 92, 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70, + 73, 77, 79, 81, 85, 88, 90, 92, 
94, 96, 96, 97, 98, 95, 97, 95, 96, 95, + 95, 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82, + 84, 86, 90, 93, 94, 96, 98, 98, 99, 100, 98, 99, 98, 98, 98, 97, 69, 65, + 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92, + 96, 97, 98, 100, 100, 101, 102, 101, 101, 101, 100, 102, 70, 66, 64, 63, + 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, + 99, 100, 102, 102, 103, 104, 104, 103, 104, 102, 71, 67, 66, 64, 63, 62, + 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, + 101, 102, 104, 104, 105, 106, 107, 105, 107, 72, 68, 67, 65, 64, 64, 61, + 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, + 103, 104, 106, 106, 108, 108, 109, 107, 73, 69, 68, 66, 65, 65, 63, 63, + 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, + 105, 106, 109, 108, 110, 111, 112, 74, 70, 70, 67, 66, 66, 64, 63, 66, + 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, + 108, 108, 111, 111, 112, 113, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, + 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108, + 110, 111, 113, 113, 115, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, 72, + 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111, + 112, 113, 116, 115, 78, 74, 74, 70, 70, 69, 69, 66, 66, 70, 70, 74, 74, + 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113, + 115, 115, 118, + /* Size 4x8 */ + 31, 40, 46, 48, 54, 61, 64, 68, 47, 45, 56, 61, 65, 69, 68, 71, 60, 54, + 64, 75, 85, 92, 90, 87, 66, 61, 64, 73, 82, 92, 102, 105, + /* Size 8x4 */ + 31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73, 54, 65, + 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105, + /* Size 8x16 */ + 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, 37, 40, + 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, 48, 46, 47, 50, + 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, 52, 48, 47, 50, 54, 61, + 64, 
68, 70, 75, 75, 74, 73, 75, 74, 73, 57, 52, 51, 53, 57, 64, 67, 73, + 76, 82, 83, 86, 83, 83, 84, 82, 66, 60, 59, 60, 62, 69, 73, 80, 84, 92, + 93, 94, 96, 92, 94, 91, 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98, + 99, 101, 103, 101, 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98, + 104, 106, 109, + /* Size 16x8 */ + 32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, 33, 43, + 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, 49, 48, 53, 54, + 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, 50, 46, 54, 64, 67, 73, + 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, 57, 50, 56, 70, 76, 84, 80, 79, + 63, 55, 60, 75, 82, 92, 87, 84, 64, 56, 61, 75, 83, 93, 93, 89, 68, 59, + 64, 74, 86, 94, 98, 94, 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75, + 83, 92, 101, 104, 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, + 91, 101, 109, + /* Size 16x32 */ + 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, + 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 31, 31, 32, 34, + 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61, + 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 37, 38, 40, 41, 43, 47, 47, 47, + 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, + 64, 65, 67, 68, 69, 70, 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49, + 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65, + 66, 67, 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, + 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 49, 47, + 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68, + 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73, 52, 50, 48, 48, 47, 47, + 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, + 73, 74, 75, 75, 74, 74, 73, 73, 54, 52, 50, 49, 49, 48, 52, 54, 55, 59, + 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76, + 77, 78, 80, 81, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, + 73, 75, 
76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, + 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85, + 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, 66, 63, 60, 59, + 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, + 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 67, 64, 62, 61, 60, 58, 58, 61, + 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98, + 100, 98, 96, 96, 97, 99, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, + 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, + 103, 101, 99, 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73, + 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105, + 107, 108, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, + 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, + 108, 72, 68, 68, 65, 65, 63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77, + 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113, + /* Size 32x16 */ + 32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, 31, 31, + 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, 30, 32, 40, 42, + 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, 32, 34, 41, 44, 46, 45, + 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, 33, 36, 43, 45, 47, 46, 47, 49, + 51, 56, 59, 60, 60, 62, 63, 65, 37, 40, 47, 47, 47, 45, 47, 48, 50, 54, + 57, 58, 60, 61, 62, 63, 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58, + 59, 60, 62, 63, 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60, + 61, 61, 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62, + 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65, 49, 45, + 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65, 49, 46, 46, 49, + 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68, 50, 46, 46, 50, 54, 59, + 64, 65, 67, 71, 73, 72, 72, 70, 70, 69, 52, 48, 47, 50, 54, 61, 66, 68, + 71, 75, 77, 74, 73, 73, 71, 72, 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, + 
80, 78, 76, 74, 75, 73, 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, + 79, 78, 76, 77, 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, + 79, 77, 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81, + 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, 64, 58, + 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, 64, 59, 56, 58, + 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, 67, 61, 58, 60, 63, 69, + 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, 68, 62, 59, 60, 64, 71, 74, 78, + 86, 91, 94, 96, 98, 96, 94, 91, 69, 62, 60, 60, 65, 70, 72, 79, 85, 88, + 95, 98, 99, 98, 97, 96, 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97, + 99, 101, 98, 97, 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102, + 102, 102, 101, 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, + 103, 104, 102, 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105, + 106, 107, 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106, + 107, 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109, + 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110, 77, + 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113, + /* Size 4x16 */ + 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, 49, 45, + 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, 63, 57, 56, 57, + 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, 69, 65, 62, 60, 63, 66, + 70, 74, 80, 85, 91, 96, 101, 103, 105, 107, + /* Size 16x4 */ + 31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, 46, 53, + 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, 52, 64, 82, 80, + 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, 63, 69, 89, 101, 65, 68, + 89, 103, 67, 70, 86, 105, 69, 72, 88, 107, + /* Size 8x32 */ + 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, + 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 37, 38, 40, 41, + 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, + 59, 60, 62, 
63, 64, 65, 67, 68, 69, 70, 48, 47, 46, 46, 47, 47, 50, 51, + 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, + 66, 66, 66, 66, 67, 67, 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, + 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, + 73, 73, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, + 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 66, 63, + 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, + 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 68, 65, 63, 62, 60, 60, + 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, + 99, 102, 101, 102, 103, 103, 101, 99, 71, 67, 66, 64, 63, 62, 62, 61, + 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, + 104, 106, 106, 109, 109, 108, + /* Size 32x8 */ + 32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, 30, 40, + 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, 33, 43, 47, 47, + 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, 42, 47, 50, 50, 53, 60, + 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, 49, 48, 53, 54, 57, 62, 62, 62, + 48, 47, 53, 57, 60, 66, 65, 64, 49, 46, 53, 61, 64, 69, 66, 66, 49, 46, + 53, 62, 65, 71, 68, 67, 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66, + 71, 77, 73, 71, 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82, + 79, 76, 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81, + 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86, 64, 56, + 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91, 68, 59, 64, 74, + 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97, 70, 62, 66, 73, 83, 96, + 99, 98, 71, 63, 67, 74, 82, 93, 102, 102, 72, 64, 66, 75, 83, 92, 101, + 104, 73, 65, 66, 75, 84, 93, 102, 106, 74, 67, 66, 74, 84, 94, 103, 106, + 75, 68, 66, 74, 83, 93, 103, 109, 76, 69, 67, 73, 82, 91, 101, 109, 77, + 70, 67, 73, 81, 90, 99, 108 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 41, 69, 92, 41, 63, 88, 103, 69, 
88, 127, 140, 92, 103, 140, 184, + /* Size 8x8 */ + 32, 32, 37, 47, 62, 78, 90, 102, 32, 35, 39, 46, 58, 72, 84, 96, 37, 39, + 51, 60, 71, 84, 93, 100, 47, 46, 60, 73, 87, 100, 106, 113, 62, 58, 71, + 87, 105, 121, 129, 132, 78, 72, 84, 100, 121, 140, 148, 155, 90, 84, 93, + 106, 129, 148, 169, 183, 102, 96, 100, 113, 132, 155, 183, 201, + /* Size 16x16 */ + 32, 31, 31, 32, 36, 39, 47, 54, 61, 71, 80, 86, 92, 98, 104, 111, 31, + 32, 32, 33, 34, 37, 44, 50, 56, 65, 73, 79, 85, 91, 98, 105, 31, 32, 33, + 34, 36, 39, 45, 50, 56, 64, 71, 77, 82, 88, 94, 100, 32, 33, 34, 36, 40, + 42, 47, 51, 57, 65, 71, 76, 80, 85, 91, 98, 36, 34, 36, 40, 48, 50, 56, + 60, 65, 73, 79, 84, 86, 90, 95, 98, 39, 37, 39, 42, 50, 54, 60, 65, 70, + 78, 84, 89, 95, 96, 102, 105, 47, 44, 45, 47, 56, 60, 69, 75, 81, 89, + 95, 100, 102, 104, 109, 112, 54, 50, 50, 51, 60, 65, 75, 82, 89, 97, + 104, 109, 110, 114, 117, 121, 61, 56, 56, 57, 65, 70, 81, 89, 97, 106, + 113, 119, 122, 126, 125, 130, 71, 65, 64, 65, 73, 78, 89, 97, 106, 117, + 125, 131, 134, 134, 136, 141, 80, 73, 71, 71, 79, 84, 95, 104, 113, 125, + 134, 140, 142, 145, 146, 152, 86, 79, 77, 76, 84, 89, 100, 109, 119, + 131, 140, 147, 154, 157, 160, 165, 92, 85, 82, 80, 86, 95, 102, 110, + 122, 134, 142, 154, 162, 168, 174, 178, 98, 91, 88, 85, 90, 96, 104, + 114, 126, 134, 145, 157, 168, 176, 184, 193, 104, 98, 94, 91, 95, 102, + 109, 117, 125, 136, 146, 160, 174, 184, 193, 201, 111, 105, 100, 98, 98, + 105, 112, 121, 130, 141, 152, 165, 178, 193, 201, 210, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 61, 65, + 71, 76, 80, 83, 86, 89, 92, 95, 98, 101, 104, 108, 111, 114, 31, 32, 32, + 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 58, 62, 68, 72, 76, + 78, 82, 85, 88, 90, 93, 96, 99, 102, 105, 109, 31, 32, 32, 32, 32, 32, + 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 56, 60, 65, 70, 73, 76, 79, 82, + 85, 88, 91, 95, 98, 101, 105, 109, 31, 32, 32, 32, 32, 33, 33, 34, 35, + 36, 38, 41, 44, 45, 49, 
54, 56, 59, 65, 69, 72, 75, 78, 81, 84, 86, 89, + 92, 95, 98, 101, 104, 31, 32, 32, 32, 33, 34, 34, 35, 36, 38, 39, 42, + 45, 46, 50, 54, 56, 59, 64, 68, 71, 74, 77, 79, 82, 85, 88, 91, 94, 97, + 100, 104, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49, + 53, 55, 58, 63, 66, 69, 72, 74, 78, 81, 84, 87, 90, 93, 96, 99, 102, 32, + 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 57, 60, 65, + 68, 71, 73, 76, 78, 80, 82, 85, 88, 91, 95, 98, 102, 34, 34, 33, 34, 35, + 37, 38, 39, 42, 44, 45, 47, 50, 51, 54, 58, 60, 63, 68, 71, 74, 76, 79, + 82, 85, 86, 87, 88, 90, 93, 96, 99, 36, 35, 34, 35, 36, 38, 40, 42, 48, + 50, 50, 54, 56, 57, 60, 64, 65, 68, 73, 76, 79, 81, 84, 86, 86, 88, 90, + 93, 95, 97, 98, 100, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, + 60, 63, 67, 68, 71, 76, 79, 82, 84, 87, 87, 90, 93, 94, 95, 96, 100, + 103, 106, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, + 69, 70, 73, 78, 81, 84, 86, 89, 92, 95, 95, 96, 99, 102, 104, 105, 106, + 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 77, 79, + 84, 88, 90, 92, 95, 97, 97, 99, 102, 103, 103, 106, 109, 113, 47, 45, + 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, + 95, 97, 100, 100, 102, 105, 104, 106, 109, 111, 112, 113, 49, 47, 46, + 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73, 77, 81, 83, 87, 92, 95, 98, + 100, 103, 105, 107, 106, 109, 112, 112, 113, 117, 120, 54, 51, 50, 49, + 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, + 106, 109, 112, 110, 113, 114, 114, 117, 121, 121, 121, 59, 56, 54, 54, + 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 94, 98, 103, 107, 110, + 113, 116, 114, 117, 118, 117, 121, 122, 122, 125, 129, 61, 58, 56, 56, + 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 101, 106, 110, 113, + 116, 119, 120, 122, 121, 126, 124, 125, 130, 130, 130, 65, 62, 60, 59, + 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105, 111, 115, 118, + 121, 124, 128, 125, 129, 128, 131, 133, 
132, 135, 139, 71, 68, 65, 65, + 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117, 122, + 125, 128, 131, 131, 134, 132, 134, 136, 136, 140, 141, 140, 76, 72, 70, + 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, + 127, 130, 133, 136, 136, 138, 139, 141, 140, 145, 143, 146, 151, 80, 76, + 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125, + 130, 134, 137, 140, 146, 142, 146, 145, 149, 146, 150, 152, 151, 83, 78, + 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, + 128, 133, 137, 140, 144, 147, 152, 148, 154, 151, 156, 155, 156, 162, + 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119, + 124, 131, 136, 140, 144, 147, 150, 154, 159, 157, 160, 160, 162, 165, + 162, 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, + 120, 128, 131, 136, 146, 147, 150, 155, 156, 161, 166, 165, 167, 169, + 169, 175, 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110, + 117, 122, 125, 134, 138, 142, 152, 154, 156, 162, 163, 168, 173, 174, + 174, 178, 176, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106, + 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169, 170, 176, + 180, 183, 181, 187, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, + 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, + 176, 178, 184, 188, 193, 188, 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, + 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, + 173, 176, 178, 184, 186, 192, 196, 203, 104, 99, 98, 95, 94, 93, 91, 90, + 95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160, + 167, 174, 180, 184, 186, 193, 194, 201, 204, 108, 102, 101, 98, 97, 96, + 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150, + 155, 162, 169, 174, 183, 188, 192, 194, 201, 202, 210, 111, 105, 105, + 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135, + 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210, 
211, + 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120, + 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203, + 204, 210, 211, 219, + /* Size 4x8 */ + 32, 33, 36, 46, 60, 75, 86, 98, 42, 42, 56, 67, 79, 92, 95, 105, 69, 64, + 77, 93, 112, 130, 136, 136, 88, 83, 88, 105, 122, 144, 167, 177, + /* Size 8x4 */ + 32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77, 88, 46, 67, 93, 105, 60, 79, + 112, 122, 75, 92, 130, 144, 86, 95, 136, 167, 98, 105, 136, 177, + /* Size 8x16 */ + 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110, 32, + 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100, 36, 35, 36, + 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, 47, 44, 45, 47, 56, + 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111, 65, 60, 59, 60, 68, 73, + 84, 92, 100, 111, 118, 124, 121, 124, 129, 127, 79, 72, 71, 71, 78, 84, + 95, 103, 113, 125, 133, 140, 148, 141, 151, 147, 90, 84, 80, 78, 83, 91, + 101, 108, 116, 129, 142, 153, 157, 163, 171, 169, 96, 90, 87, 85, 87, + 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188, + /* Size 16x8 */ + 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, 72, 84, 90, 32, 34, + 36, 45, 59, 71, 80, 87, 32, 35, 40, 47, 60, 71, 78, 85, 36, 37, 48, 56, + 68, 78, 83, 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69, 84, 95, + 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, 56, 65, 81, 100, 113, + 116, 118, 71, 64, 73, 89, 111, 125, 129, 129, 79, 70, 79, 95, 118, 133, + 142, 138, 86, 76, 84, 100, 124, 140, 153, 150, 92, 82, 89, 101, 121, + 148, 157, 161, 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110, + 129, 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188, + /* Size 16x32 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, + 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 31, 32, 32, + 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72, + 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 32, 32, 32, 33, 34, 35, + 35, 36, 37, 
39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 104, 32, 33, 33, 33, 34, 36, 36, 38, 40, + 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85, + 88, 91, 94, 97, 100, 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, + 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, + 100, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77, + 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111, 47, 45, + 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, + 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 53, 51, 49, 49, + 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, + 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, 65, 62, 60, 59, + 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, + 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, 73, 69, 67, 66, + 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123, + 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145, 79, 75, 72, + 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, + 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, 87, 83, + 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126, + 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166, + 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, + 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, + 166, 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113, + 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177, + 183, 189, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, + 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, + 188, 188, 190, 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106, + 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185, + 186, 192, 193, 201, + /* Size 
32x16 */ + 32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, 96, 99, 31, 32, + 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, 86, 88, 91, 94, 31, 32, 32, 33, + 35, 41, 44, 49, 60, 67, 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41, + 44, 49, 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42, 45, 50, + 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, 38, 42, 45, 49, 58, 64, + 69, 76, 80, 83, 86, 88, 32, 33, 35, 36, 40, 44, 47, 51, 60, 66, 71, 76, + 78, 81, 85, 89, 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81, + 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84, 83, 86, 87, 87, + 38, 36, 39, 41, 49, 56, 58, 63, 71, 77, 81, 86, 88, 88, 90, 93, 39, 37, + 40, 42, 50, 58, 60, 65, 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, + 53, 63, 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47, 56, 66, + 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46, 47, 48, 57, 67, 71, + 77, 86, 93, 97, 103, 103, 105, 102, 106, 53, 49, 50, 51, 60, 71, 75, 82, + 92, 99, 103, 111, 108, 107, 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, + 98, 105, 110, 114, 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, + 100, 107, 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92, + 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, 89, 97, + 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, 76, 88, 92, 101, + 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, 70, 71, 79, 90, 95, 104, + 118, 127, 133, 143, 142, 141, 138, 136, 82, 75, 73, 74, 81, 92, 97, 106, + 121, 130, 136, 146, 145, 144, 144, 145, 86, 78, 76, 77, 84, 95, 100, + 109, 124, 133, 140, 147, 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99, + 112, 124, 130, 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95, + 101, 116, 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, + 95, 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85, + 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, 91, + 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, 104, 95, + 
94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, 181, 186, 107, + 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, 173, 177, 188, 192, + 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, 147, 161, 169, 183, 188, + 193, 114, 104, 104, 100, 100, 111, 111, 126, 127, 145, 145, 166, 166, + 189, 190, 201, + /* Size 4x16 */ + 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101, 44, 41, + 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108, 73, 67, 65, + 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141, 93, 87, 83, + 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183, + /* Size 16x4 */ + 31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33, 44, 66, 81, 34, 54, + 74, 86, 37, 58, 79, 92, 44, 66, 90, 98, 49, 71, 99, 107, 56, 77, 107, + 117, 65, 84, 119, 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132, + 163, 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183, + /* Size 8x32 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, + 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 32, 32, 32, + 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, + 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 36, 35, 35, 35, 36, 38, + 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, + 89, 92, 93, 94, 95, 96, 98, 100, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, + 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, + 110, 110, 110, 111, 111, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, + 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, + 128, 129, 128, 127, 127, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, + 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, + 146, 151, 149, 147, 145, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, + 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, + 163, 166, 171, 173, 169, 166, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, + 94, 96, 101, 102, 110, 111, 
118, 121, 129, 132, 138, 144, 150, 156, 161, + 171, 174, 179, 181, 188, 188, 190, + /* Size 32x8 */ + 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, 86, 91, 31, 32, + 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, 59, 71, 82, 87, 32, 34, 36, 45, + 59, 71, 80, 87, 32, 35, 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71, + 78, 85, 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78, 83, 87, + 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, 73, 84, 91, 94, 44, 42, + 53, 66, 79, 90, 94, 96, 47, 45, 56, 69, 84, 95, 101, 101, 49, 47, 57, + 71, 86, 97, 103, 102, 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79, + 98, 110, 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84, + 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68, 76, 92, + 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138, 82, 73, 81, 97, + 121, 136, 145, 144, 86, 76, 84, 100, 124, 140, 153, 150, 89, 79, 87, 99, + 124, 145, 156, 156, 92, 82, 89, 101, 121, 148, 157, 161, 95, 85, 92, + 105, 120, 143, 163, 171, 98, 88, 93, 108, 124, 141, 163, 174, 101, 91, + 94, 110, 128, 146, 166, 179, 104, 94, 95, 110, 129, 151, 171, 181, 107, + 97, 96, 110, 128, 149, 173, 188, 110, 100, 98, 111, 127, 147, 169, 188, + 114, 104, 100, 111, 127, 145, 166, 190 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105, + /* Size 8x8 */ + 31, 38, 47, 48, 54, 61, 66, 69, 38, 47, 47, 46, 50, 55, 61, 65, 47, 47, + 53, 55, 58, 63, 65, 66, 48, 46, 55, 62, 67, 72, 73, 73, 54, 50, 58, 67, + 76, 83, 84, 82, 61, 55, 63, 72, 83, 91, 92, 92, 66, 61, 65, 73, 84, 92, + 101, 103, 69, 65, 66, 73, 82, 92, 103, 109, + /* Size 16x16 */ + 32, 30, 33, 38, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 30, 31, + 35, 41, 46, 46, 46, 48, 51, 55, 58, 60, 63, 65, 68, 70, 33, 35, 39, 44, + 47, 46, 46, 47, 50, 53, 56, 58, 60, 62, 65, 67, 38, 41, 44, 47, 49, 48, + 47, 48, 50, 53, 55, 58, 58, 60, 62, 65, 49, 46, 47, 49, 53, 53, 54, 54, + 56, 58, 60, 62, 62, 63, 64, 64, 48, 46, 46, 48, 
53, 54, 56, 57, 59, 61, + 63, 65, 67, 66, 68, 68, 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72, + 71, 71, 72, 72, 52, 48, 47, 48, 54, 57, 63, 66, 69, 72, 75, 76, 75, 76, + 76, 76, 55, 51, 50, 50, 56, 59, 65, 69, 73, 77, 79, 81, 81, 81, 80, 80, + 60, 55, 53, 53, 58, 61, 68, 72, 77, 82, 85, 87, 87, 85, 84, 85, 63, 58, + 56, 55, 60, 63, 70, 75, 79, 85, 89, 91, 91, 90, 89, 90, 66, 60, 58, 58, + 62, 65, 72, 76, 81, 87, 91, 94, 96, 95, 95, 95, 68, 63, 60, 58, 62, 67, + 71, 75, 81, 87, 91, 96, 99, 100, 100, 100, 70, 65, 62, 60, 63, 66, 71, + 76, 81, 85, 90, 95, 100, 103, 104, 105, 72, 68, 65, 62, 64, 68, 72, 76, + 80, 84, 89, 95, 100, 104, 107, 108, 74, 70, 67, 65, 64, 68, 72, 76, 80, + 85, 90, 95, 100, 105, 108, 111, + /* Size 32x32 */ + 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 31, 32, + 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 53, 54, 57, 59, 60, 61, + 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 30, 31, 31, 32, 35, 39, 41, 42, + 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 55, 57, 58, 59, 60, 62, 63, 64, + 65, 67, 68, 69, 70, 71, 31, 32, 32, 33, 36, 40, 41, 43, 46, 46, 45, 45, + 46, 46, 47, 49, 50, 51, 54, 56, 57, 58, 59, 61, 62, 63, 63, 64, 65, 66, + 67, 68, 33, 34, 35, 36, 39, 43, 44, 45, 47, 46, 46, 45, 46, 47, 47, 49, + 50, 51, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 36, 38, + 39, 40, 43, 47, 47, 47, 48, 47, 46, 45, 46, 46, 47, 48, 49, 50, 52, 53, + 54, 55, 56, 58, 59, 61, 62, 63, 64, 65, 66, 66, 38, 40, 41, 41, 44, 47, + 47, 48, 49, 48, 48, 47, 47, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 58, + 58, 59, 60, 61, 62, 64, 65, 66, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, + 49, 49, 50, 50, 50, 52, 52, 53, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61, + 62, 63, 63, 64, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, + 54, 55, 56, 56, 58, 59, 60, 61, 62, 63, 62, 62, 63, 64, 64, 64, 64, 64, + 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 58, 58, 
+ 60, 61, 62, 63, 64, 64, 64, 65, 65, 65, 65, 66, 67, 68, 48, 47, 46, 45, + 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63, 63, 64, + 65, 66, 67, 66, 66, 67, 68, 68, 68, 68, 49, 47, 45, 45, 45, 45, 47, 49, + 53, 55, 55, 58, 59, 60, 61, 62, 63, 63, 65, 66, 67, 68, 69, 69, 68, 68, + 69, 69, 69, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 71, 71, 72, 72, + 72, 71, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, 64, 66, + 66, 67, 69, 70, 71, 72, 73, 73, 74, 73, 73, 74, 73, 73, 74, 75, 52, 50, + 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, + 75, 75, 76, 77, 75, 76, 76, 75, 76, 77, 76, 75, 54, 52, 50, 49, 49, 48, + 49, 52, 55, 57, 58, 62, 64, 66, 68, 71, 72, 73, 75, 77, 78, 79, 80, 78, + 79, 78, 77, 78, 78, 77, 78, 79, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, + 59, 63, 65, 66, 69, 72, 73, 74, 77, 78, 79, 80, 81, 81, 81, 80, 81, 80, + 80, 81, 80, 79, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67, + 70, 73, 74, 76, 79, 80, 82, 83, 84, 85, 83, 84, 83, 83, 83, 82, 82, 83, + 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79, + 82, 84, 85, 86, 87, 86, 87, 85, 85, 85, 84, 86, 85, 84, 62, 59, 57, 56, + 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, 87, 88, + 90, 89, 89, 88, 88, 87, 88, 87, 87, 88, 63, 60, 58, 57, 56, 54, 55, 57, + 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 91, 91, + 90, 91, 89, 90, 90, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68, + 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 93, 94, 95, 92, 94, 92, 93, 92, + 91, 93, 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, + 81, 84, 87, 90, 91, 93, 94, 95, 96, 97, 95, 95, 95, 95, 95, 93, 67, 64, + 62, 61, 59, 58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, + 93, 94, 95, 97, 97, 98, 99, 97, 97, 97, 96, 98, 68, 65, 63, 62, 60, 59, + 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97, + 
99, 98, 100, 100, 100, 99, 100, 98, 69, 66, 64, 63, 61, 61, 59, 61, 62, + 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101, + 100, 102, 102, 103, 101, 102, 70, 67, 65, 63, 62, 62, 60, 61, 63, 65, + 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103, + 102, 104, 104, 105, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69, + 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105, 104, + 106, 106, 108, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73, + 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, 106, + 108, 108, 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77, + 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, 108, + 110, 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80, + 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111, 110, 75, + 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84, + 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113, + /* Size 4x8 */ + 31, 40, 46, 47, 52, 59, 63, 66, 47, 45, 55, 60, 64, 68, 66, 69, 57, 52, + 61, 70, 79, 87, 88, 85, 65, 61, 63, 72, 81, 90, 99, 102, + /* Size 8x4 */ + 31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72, 52, 64, + 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102, + /* Size 8x16 */ + 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 35, 38, + 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, 48, 46, 47, 48, + 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, 50, 46, 46, 47, 54, 56, + 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, 57, 52, 51, 51, 57, 60, 66, 71, + 74, 79, 82, 84, 81, 81, 82, 79, 63, 58, 56, 55, 60, 64, 70, 75, 79, 85, + 89, 91, 94, 89, 92, 89, 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95, + 97, 98, 100, 98, 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96, + 101, 103, 105, + /* Size 16x8 */ + 32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, 33, 41, + 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 
55, 58, 61, 49, 48, 53, 54, + 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, 50, 46, 54, 61, 66, 70, + 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, 55, 49, 56, 65, 74, 79, 79, 78, + 60, 53, 58, 68, 79, 85, 85, 82, 63, 55, 60, 70, 82, 89, 91, 87, 66, 58, + 62, 72, 84, 91, 95, 91, 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73, + 81, 89, 98, 101, 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, + 89, 98, 105, + /* Size 16x32 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 32, 33, + 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59, + 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 35, 37, 38, 38, 41, 45, 46, 46, + 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, + 62, 64, 65, 66, 67, 68, 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, + 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63, + 64, 65, 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, + 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 49, 47, + 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, + 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71, 50, 48, 46, 46, 46, 46, + 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, + 71, 72, 73, 73, 72, 72, 71, 71, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, + 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74, + 75, 76, 77, 78, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, + 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, + 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79, + 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, 63, 60, 58, 57, + 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, + 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 67, 63, 61, 60, 59, 57, 57, 60, + 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96, + 97, 95, 93, 
93, 94, 96, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, + 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, + 100, 98, 96, 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73, + 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103, + 105, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, + 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, 71, + 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80, + 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109, + /* Size 32x16 */ + 32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, 31, 31, + 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, 30, 32, 38, 40, + 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, 31, 33, 38, 41, 46, 45, + 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, 33, 36, 41, 44, 47, 46, 46, 47, + 51, 54, 56, 59, 60, 61, 63, 64, 37, 40, 45, 47, 47, 45, 46, 47, 50, 52, + 54, 57, 59, 61, 62, 62, 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, + 58, 59, 61, 62, 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59, + 60, 60, 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61, + 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64, 48, 46, + 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64, 49, 45, 45, 47, + 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67, 50, 46, 46, 48, 54, 59, + 61, 63, 66, 68, 70, 71, 71, 68, 69, 67, 51, 47, 47, 48, 54, 60, 61, 64, + 68, 70, 71, 73, 72, 72, 70, 71, 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, + 75, 77, 75, 73, 74, 71, 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, + 78, 76, 74, 75, 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, + 78, 75, 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79, + 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, 62, 56, + 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, 63, 57, 55, 56, + 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, 64, 59, 56, 57, 61, 68, + 71, 75, 83, 
87, 90, 93, 92, 90, 89, 89, 66, 60, 58, 58, 62, 69, 72, 76, + 84, 88, 91, 94, 95, 93, 91, 89, 67, 61, 59, 58, 63, 68, 71, 78, 83, 86, + 93, 96, 96, 96, 94, 94, 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, + 97, 98, 96, 94, 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99, + 100, 98, 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101, + 99, 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104, + 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104, 73, + 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106, 74, 67, + 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106, 75, 68, 68, + 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109, + /* Size 4x16 */ + 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, 49, 45, + 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, 61, 55, 54, 54, + 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86, 69, 64, 61, 59, 62, 65, + 68, 73, 78, 84, 89, 93, 98, 100, 102, 103, + /* Size 16x4 */ + 31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, 46, 53, + 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, 51, 63, 77, 78, + 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93, 62, 67, 86, 98, 64, 66, + 87, 100, 65, 68, 83, 102, 67, 70, 86, 103, + /* Size 8x32 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 35, 37, 38, 38, + 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, + 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 48, 47, 46, 46, 47, 47, 48, 50, + 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, + 65, 65, 65, 65, 65, 65, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, + 71, 71, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, + 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 63, 60, + 58, 57, 56, 54, 55, 57, 60, 62, 64, 
67, 70, 71, 75, 78, 79, 82, 85, 87, + 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 68, 64, 63, 61, 60, 59, + 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, + 97, 99, 98, 99, 100, 100, 98, 96, 70, 66, 65, 63, 63, 62, 61, 60, 61, + 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, + 101, 103, 103, 105, 105, 105, + /* Size 32x8 */ + 32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, 30, 38, + 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, 33, 41, 47, 46, + 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, 39, 46, 48, 47, 51, 55, + 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, 49, 48, 53, 54, 57, 60, 61, 61, + 48, 47, 53, 55, 58, 62, 64, 63, 48, 46, 53, 56, 60, 64, 65, 65, 49, 45, + 53, 59, 64, 67, 67, 66, 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61, + 68, 71, 72, 70, 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78, + 78, 74, 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79, + 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84, 63, 55, + 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89, 66, 58, 62, 72, + 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94, 68, 60, 64, 71, 81, 94, + 97, 96, 69, 61, 65, 72, 80, 91, 99, 100, 70, 62, 65, 73, 81, 89, 98, + 101, 71, 64, 65, 73, 82, 90, 99, 103, 72, 65, 65, 72, 82, 92, 100, 103, + 73, 66, 65, 72, 81, 90, 100, 105, 74, 67, 65, 71, 79, 89, 98, 105, 75, + 68, 65, 71, 78, 87, 96, 105 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169, + /* Size 8x8 */ + 32, 32, 35, 46, 57, 76, 85, 96, 32, 34, 37, 45, 54, 70, 79, 90, 35, 37, + 48, 56, 64, 79, 87, 93, 46, 45, 56, 70, 80, 96, 100, 105, 57, 54, 64, + 80, 93, 111, 121, 122, 76, 70, 79, 96, 111, 134, 138, 144, 85, 79, 87, + 100, 121, 138, 156, 168, 96, 90, 93, 105, 122, 144, 168, 184, + /* Size 16x16 */ + 32, 31, 31, 32, 34, 39, 44, 49, 58, 65, 71, 81, 87, 93, 98, 104, 31, 32, + 32, 32, 34, 38, 41, 46, 54, 60, 66, 75, 81, 86, 
92, 98, 31, 32, 33, 34, + 36, 39, 42, 46, 53, 59, 64, 73, 78, 83, 88, 94, 32, 32, 34, 35, 37, 40, + 42, 46, 52, 58, 63, 71, 75, 80, 86, 92, 34, 34, 36, 37, 42, 47, 50, 53, + 59, 65, 70, 77, 82, 85, 89, 92, 39, 38, 39, 40, 47, 54, 58, 62, 68, 73, + 78, 85, 90, 90, 96, 98, 44, 41, 42, 42, 50, 58, 63, 68, 74, 79, 84, 91, + 96, 98, 102, 104, 49, 46, 46, 46, 53, 62, 68, 73, 81, 87, 92, 99, 103, + 107, 109, 112, 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114, + 118, 117, 121, 65, 60, 59, 58, 65, 73, 79, 87, 97, 105, 111, 120, 125, + 125, 126, 130, 71, 66, 64, 63, 70, 78, 84, 92, 102, 111, 117, 127, 133, + 134, 136, 141, 81, 75, 73, 71, 77, 85, 91, 99, 110, 120, 127, 137, 143, + 145, 148, 152, 87, 81, 78, 75, 82, 90, 96, 103, 114, 125, 133, 143, 150, + 156, 160, 163, 93, 86, 83, 80, 85, 90, 98, 107, 118, 125, 134, 145, 156, + 163, 169, 177, 98, 92, 88, 86, 89, 96, 102, 109, 117, 126, 136, 148, + 160, 169, 176, 184, 104, 98, 94, 92, 92, 98, 104, 112, 121, 130, 141, + 152, 163, 177, 184, 191, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 32, 34, 34, 36, 39, 41, 44, 48, 49, 54, 58, 59, + 65, 69, 71, 80, 81, 83, 87, 90, 93, 95, 98, 101, 104, 107, 31, 32, 32, + 32, 32, 32, 32, 34, 34, 35, 38, 39, 42, 46, 47, 51, 55, 57, 62, 66, 68, + 76, 77, 78, 83, 85, 88, 90, 93, 96, 99, 101, 31, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 38, 39, 41, 45, 46, 50, 54, 55, 60, 64, 66, 73, 75, 76, 81, + 83, 86, 89, 92, 95, 98, 101, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 37, + 38, 41, 44, 45, 49, 53, 54, 59, 63, 65, 72, 74, 75, 79, 81, 84, 86, 89, + 91, 94, 97, 31, 32, 32, 32, 33, 33, 34, 35, 36, 36, 39, 40, 42, 45, 46, + 50, 53, 54, 59, 63, 64, 71, 73, 74, 78, 80, 83, 85, 88, 91, 94, 97, 32, + 32, 32, 32, 33, 34, 34, 36, 36, 37, 40, 40, 42, 45, 46, 49, 53, 54, 58, + 62, 63, 70, 72, 73, 77, 79, 82, 85, 87, 90, 92, 95, 32, 32, 32, 32, 34, + 34, 35, 37, 37, 38, 40, 41, 42, 45, 46, 49, 52, 54, 58, 61, 63, 69, 71, + 72, 75, 78, 80, 83, 86, 89, 92, 95, 34, 34, 33, 33, 35, 36, 37, 39, 41, + 42, 45, 
46, 47, 50, 51, 54, 57, 59, 63, 66, 68, 74, 75, 76, 80, 81, 82, + 83, 85, 87, 90, 93, 34, 34, 34, 34, 36, 36, 37, 41, 42, 45, 47, 48, 50, + 53, 53, 56, 59, 61, 65, 68, 70, 76, 77, 78, 82, 83, 85, 88, 89, 90, 92, + 93, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 50, 51, 54, 56, 57, 60, 63, + 64, 68, 71, 73, 79, 80, 81, 85, 87, 89, 89, 90, 93, 96, 99, 39, 38, 38, + 37, 39, 40, 40, 45, 47, 50, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78, + 84, 85, 86, 90, 89, 90, 93, 96, 97, 98, 99, 41, 39, 39, 38, 40, 40, 41, + 46, 48, 51, 55, 56, 59, 62, 63, 67, 70, 71, 75, 78, 80, 86, 87, 88, 91, + 93, 96, 97, 97, 99, 102, 105, 44, 42, 41, 41, 42, 42, 42, 47, 50, 54, + 58, 59, 63, 66, 68, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 98, 98, 99, + 102, 104, 104, 105, 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66, + 70, 71, 76, 79, 80, 85, 88, 90, 96, 97, 98, 101, 100, 102, 105, 105, + 105, 109, 112, 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71, + 73, 77, 81, 82, 87, 90, 92, 98, 99, 100, 103, 106, 107, 106, 109, 112, + 112, 112, 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77, + 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 110, 109, 113, 114, 113, + 116, 120, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, + 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 114, 118, 116, 117, 121, + 121, 120, 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, + 87, 91, 93, 99, 102, 104, 111, 112, 113, 117, 121, 120, 122, 124, 122, + 125, 129, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87, + 92, 97, 99, 105, 109, 111, 118, 120, 121, 125, 124, 125, 127, 126, 130, + 130, 129, 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90, + 96, 100, 102, 109, 113, 115, 123, 125, 126, 129, 130, 131, 130, 134, + 133, 135, 139, 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90, + 92, 97, 102, 104, 111, 115, 117, 125, 127, 128, 133, 136, 134, 139, 136, + 139, 141, 140, 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96, + 98, 104, 109, 111, 118, 123, 125, 
134, 136, 137, 142, 138, 143, 140, + 144, 144, 144, 149, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91, + 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 139, 143, 148, 145, 148, + 148, 150, 152, 149, 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92, + 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140, 145, 149, 153, + 153, 154, 155, 155, 161, 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91, + 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, 151, + 156, 159, 160, 160, 163, 161, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, + 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, + 156, 157, 162, 166, 168, 166, 172, 93, 88, 86, 84, 83, 82, 80, 82, 85, + 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, + 156, 157, 163, 164, 169, 172, 177, 172, 95, 90, 89, 86, 85, 85, 83, 83, + 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148, + 153, 159, 162, 164, 169, 170, 176, 179, 185, 98, 93, 92, 89, 88, 87, 86, + 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144, + 148, 154, 160, 166, 169, 170, 176, 177, 184, 186, 101, 96, 95, 91, 91, + 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139, + 144, 150, 155, 160, 168, 172, 176, 177, 184, 185, 191, 104, 99, 98, 94, + 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135, + 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, 192, 107, 101, + 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, + 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199, + /* Size 4x8 */ + 32, 32, 34, 44, 54, 72, 82, 92, 38, 40, 51, 61, 69, 84, 89, 98, 62, 58, + 68, 85, 98, 118, 129, 127, 86, 80, 85, 101, 117, 136, 157, 165, + /* Size 8x4 */ + 32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68, 85, 44, 61, 85, 101, 54, 69, + 98, 117, 72, 84, 118, 136, 82, 89, 129, 157, 92, 98, 127, 165, + /* Size 8x16 */ + 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103, 32, 32, + 33, 34, 36, 39, 
42, 46, 53, 59, 64, 72, 77, 83, 88, 94, 36, 35, 36, 38, + 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, 44, 41, 42, 42, 50, 58, + 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, 58, 54, 53, 52, 59, 68, 74, + 81, 90, 97, 102, 110, 114, 117, 121, 119, 79, 73, 71, 69, 75, 84, 90, + 97, 108, 118, 125, 135, 140, 133, 141, 137, 88, 81, 78, 76, 81, 88, 97, + 104, 111, 123, 135, 145, 148, 153, 160, 158, 93, 88, 84, 82, 84, 90, 97, + 105, 113, 122, 131, 141, 151, 163, 169, 175, + /* Size 16x8 */ + 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, 73, 81, 88, 32, 33, + 36, 42, 53, 71, 78, 84, 32, 34, 38, 42, 52, 69, 76, 82, 34, 36, 44, 50, + 59, 75, 81, 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63, 74, 90, + 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, 53, 63, 74, 90, 108, 111, + 113, 65, 59, 68, 79, 97, 118, 123, 122, 71, 64, 73, 84, 102, 125, 135, + 131, 81, 72, 80, 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148, + 151, 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121, 141, + 160, 169, 103, 94, 92, 103, 119, 137, 158, 175, + /* Size 16x32 */ + 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, + 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 31, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65, + 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, 32, 32, 32, 32, 33, 34, 34, + 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, + 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40, + 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 76, 78, 81, 83, 85, + 88, 90, 93, 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, + 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 39, + 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 61, 62, 65, 68, 69, 73, + 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104, 44, 42, 41, 41, + 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, + 91, 92, 96, 99, 102, 103, 103, 103, 103, 
104, 53, 51, 50, 49, 50, 49, + 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105, + 106, 110, 108, 106, 105, 108, 111, 114, 118, 58, 55, 54, 53, 53, 53, 52, + 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, + 114, 113, 117, 120, 121, 120, 119, 118, 65, 62, 60, 59, 59, 58, 58, 63, + 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121, + 125, 129, 128, 125, 124, 127, 131, 135, 79, 75, 73, 72, 71, 70, 69, 73, + 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, + 140, 135, 133, 137, 141, 139, 137, 135, 81, 77, 75, 74, 72, 71, 70, 75, + 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139, + 143, 146, 150, 148, 144, 146, 150, 154, 88, 83, 81, 79, 78, 77, 76, 79, + 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, + 147, 148, 153, 153, 155, 160, 161, 158, 155, 90, 86, 84, 82, 81, 80, 78, + 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136, + 143, 148, 154, 157, 158, 164, 164, 165, 170, 175, 93, 88, 88, 84, 84, + 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, + 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, 96, 91, 91, 87, + 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128, + 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187, + /* Size 32x16 */ + 32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, 93, 96, 31, 32, + 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, 83, 86, 88, 91, 31, 32, 32, 32, + 35, 38, 41, 50, 54, 60, 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37, + 41, 49, 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39, 42, 50, + 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, 37, 40, 42, 49, 53, 58, + 70, 71, 77, 80, 83, 85, 32, 33, 34, 35, 38, 40, 42, 49, 52, 58, 69, 70, + 76, 78, 82, 86, 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79, + 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77, 81, 83, 84, 84, + 36, 34, 37, 38, 48, 51, 54, 60, 63, 68, 78, 80, 
85, 85, 86, 89, 39, 37, + 39, 40, 50, 54, 58, 65, 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, + 51, 55, 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43, 53, 58, + 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44, 45, 46, 56, 61, 66, 75, + 79, 85, 95, 97, 99, 101, 98, 102, 49, 46, 46, 47, 57, 62, 67, 77, 81, + 86, 97, 99, 104, 102, 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, + 103, 105, 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97, + 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, 91, 98, + 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, 79, 92, 97, 105, + 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, 71, 76, 83, 96, 100, 109, + 122, 124, 127, 125, 125, 128, 71, 65, 64, 63, 73, 78, 84, 97, 102, 111, + 125, 127, 135, 134, 131, 129, 79, 72, 71, 70, 79, 84, 90, 104, 109, 118, + 133, 135, 137, 136, 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120, + 135, 137, 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121, + 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110, 114, 125, + 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89, 99, 108, 113, 129, + 135, 146, 153, 157, 160, 159, 92, 84, 83, 81, 88, 90, 102, 106, 117, + 128, 133, 150, 153, 158, 163, 160, 95, 87, 85, 83, 88, 92, 103, 105, + 120, 125, 137, 148, 155, 164, 168, 173, 98, 89, 88, 85, 89, 95, 103, + 108, 121, 124, 141, 144, 160, 164, 169, 174, 100, 92, 91, 88, 90, 98, + 103, 111, 120, 127, 139, 146, 161, 165, 175, 179, 103, 94, 94, 90, 92, + 101, 103, 114, 119, 131, 137, 150, 158, 170, 175, 180, 106, 97, 97, 93, + 93, 104, 104, 118, 118, 135, 135, 154, 155, 175, 176, 187, + /* Size 4x16 */ + 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, 39, 38, + 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, 65, 60, 59, 58, + 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131, 90, 84, 81, 78, + 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170, + /* Size 16x4 */ + 31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33, 40, 
58, 78, 34, 47, + 65, 83, 37, 54, 73, 89, 41, 58, 79, 94, 46, 62, 86, 102, 53, 68, 97, + 112, 60, 73, 105, 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125, + 154, 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170, + /* Size 8x32 */ + 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, + 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 32, 32, 32, + 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, + 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 36, 35, 35, 34, 36, 37, 38, + 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, + 87, 88, 88, 89, 90, 92, 93, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, + 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, + 103, 103, 103, 104, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, + 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, + 121, 120, 119, 118, 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, + 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, + 141, 139, 137, 135, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, + 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, + 155, 160, 161, 158, 155, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, + 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, + 168, 169, 175, 175, 176, + /* Size 32x8 */ + 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, 83, 88, 31, 32, + 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, 53, 72, 79, 84, 32, 33, 36, 42, + 53, 71, 78, 84, 32, 34, 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69, + 76, 82, 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75, 81, 84, + 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, 68, 84, 88, 90, 40, 40, + 51, 59, 70, 85, 91, 92, 44, 42, 53, 63, 74, 90, 97, 97, 47, 45, 56, 66, + 79, 95, 99, 98, 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86, + 103, 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75, 91, + 111, 119, 115, 65, 
59, 68, 79, 97, 118, 123, 122, 69, 62, 71, 83, 100, + 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131, 79, 71, 79, 90, 109, + 133, 137, 136, 81, 72, 80, 91, 110, 135, 145, 141, 82, 73, 81, 92, 111, + 136, 147, 147, 87, 77, 85, 96, 114, 140, 148, 151, 90, 80, 87, 99, 113, + 135, 153, 160, 92, 83, 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, + 120, 137, 155, 168, 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, + 103, 120, 139, 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97, + 93, 104, 118, 135, 155, 176 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101, + /* Size 8x8 */ + 31, 36, 47, 48, 52, 60, 64, 67, 36, 43, 47, 46, 49, 55, 59, 63, 47, 47, + 53, 54, 55, 60, 63, 64, 48, 46, 54, 61, 65, 70, 71, 71, 52, 49, 55, 65, + 71, 78, 81, 79, 60, 55, 60, 70, 78, 89, 89, 89, 64, 59, 63, 71, 81, 89, + 97, 99, 67, 63, 64, 71, 79, 89, 99, 104, + /* Size 16x16 */ + 32, 30, 33, 36, 44, 48, 49, 51, 54, 57, 60, 64, 67, 68, 70, 72, 30, 31, + 35, 39, 44, 46, 46, 47, 50, 53, 55, 59, 61, 64, 66, 68, 33, 35, 39, 43, + 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 36, 39, 43, 47, 47, 46, + 45, 46, 48, 50, 52, 55, 57, 58, 61, 63, 44, 44, 46, 47, 50, 51, 51, 51, + 53, 54, 56, 59, 61, 61, 63, 62, 48, 46, 46, 46, 51, 54, 55, 56, 58, 60, + 61, 64, 65, 64, 66, 66, 49, 46, 45, 45, 51, 55, 58, 60, 62, 63, 65, 68, + 69, 69, 69, 69, 51, 47, 47, 46, 51, 56, 60, 62, 65, 67, 69, 72, 73, 74, + 73, 73, 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 77, 77, + 57, 53, 51, 50, 54, 60, 63, 67, 73, 76, 79, 82, 84, 83, 82, 82, 60, 55, + 53, 52, 56, 61, 65, 69, 75, 79, 82, 86, 88, 87, 86, 87, 64, 59, 57, 55, + 59, 64, 68, 72, 78, 82, 86, 90, 93, 92, 91, 92, 67, 61, 59, 57, 61, 65, + 69, 73, 79, 84, 88, 93, 95, 96, 96, 96, 68, 64, 61, 58, 61, 64, 69, 74, + 79, 83, 87, 92, 96, 99, 100, 101, 70, 66, 63, 61, 63, 66, 69, 73, 77, + 82, 86, 91, 96, 100, 103, 104, 72, 68, 65, 63, 62, 66, 69, 73, 77, 82, + 87, 92, 96, 101, 104, 106, + /* Size 
32x32 */ + 32, 31, 30, 30, 33, 35, 36, 41, 44, 49, 48, 48, 49, 50, 51, 52, 54, 55, + 57, 59, 60, 63, 64, 65, 67, 68, 68, 69, 70, 71, 72, 73, 31, 31, 31, 31, + 34, 36, 38, 42, 44, 47, 47, 47, 47, 48, 48, 50, 51, 52, 54, 56, 57, 60, + 61, 61, 63, 64, 65, 66, 67, 67, 68, 69, 30, 31, 31, 31, 35, 37, 39, 42, + 44, 47, 46, 46, 46, 47, 47, 48, 50, 51, 53, 54, 55, 58, 59, 60, 61, 63, + 64, 65, 66, 67, 68, 69, 30, 31, 31, 32, 35, 37, 40, 42, 44, 46, 45, 45, + 45, 46, 46, 47, 49, 50, 52, 53, 54, 57, 58, 58, 60, 61, 62, 63, 63, 64, + 65, 66, 33, 34, 35, 35, 39, 41, 43, 45, 46, 47, 46, 46, 45, 46, 47, 47, + 49, 49, 51, 53, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 35, 36, + 37, 37, 41, 43, 45, 46, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, + 53, 55, 56, 56, 58, 59, 60, 61, 62, 63, 64, 64, 36, 38, 39, 40, 43, 45, + 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 48, 48, 50, 51, 52, 54, 55, 55, + 57, 58, 58, 59, 61, 62, 63, 64, 41, 42, 42, 42, 45, 46, 47, 48, 49, 50, + 49, 49, 49, 50, 50, 50, 51, 52, 53, 54, 55, 57, 58, 58, 60, 60, 59, 59, + 60, 61, 61, 62, 44, 44, 44, 44, 46, 46, 47, 49, 50, 51, 51, 51, 51, 51, + 51, 52, 53, 53, 54, 56, 56, 59, 59, 59, 61, 61, 61, 62, 63, 62, 62, 62, + 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, + 56, 58, 58, 60, 61, 61, 63, 63, 64, 63, 63, 64, 65, 66, 48, 47, 46, 45, + 46, 46, 46, 49, 51, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 61, 63, + 64, 64, 65, 65, 64, 65, 66, 66, 66, 66, 48, 47, 46, 45, 46, 46, 46, 49, + 51, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 65, 65, 65, 66, 67, + 68, 67, 67, 67, 68, 69, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56, + 58, 59, 60, 61, 62, 62, 63, 65, 65, 67, 68, 68, 69, 70, 69, 69, 69, 70, + 69, 69, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61, 62, 63, + 64, 65, 66, 68, 68, 70, 71, 71, 72, 71, 71, 72, 71, 71, 71, 72, 51, 48, + 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, 64, 65, 66, 67, 69, + 69, 71, 72, 72, 73, 74, 74, 72, 73, 74, 73, 73, 52, 50, 48, 47, 
47, 47, + 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, + 77, 76, 75, 76, 76, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, + 58, 59, 62, 64, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 78, + 77, 78, 77, 77, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65, + 66, 68, 70, 71, 73, 75, 76, 78, 79, 79, 80, 81, 80, 80, 81, 79, 79, 81, + 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73, + 76, 78, 79, 82, 82, 83, 84, 83, 83, 83, 82, 83, 82, 81, 59, 56, 54, 53, + 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, 81, 84, + 85, 85, 86, 86, 86, 84, 85, 84, 84, 85, 60, 57, 55, 54, 53, 53, 52, 55, + 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, 85, 86, 86, 88, 88, + 87, 88, 86, 87, 87, 85, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65, + 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 92, 89, 91, 89, 90, 89, + 88, 89, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, + 78, 79, 82, 85, 86, 89, 90, 91, 93, 94, 92, 92, 91, 91, 92, 90, 65, 61, + 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, + 86, 90, 91, 91, 93, 94, 95, 94, 94, 94, 93, 94, 67, 63, 61, 60, 59, 58, + 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93, + 95, 95, 96, 97, 96, 95, 96, 94, 68, 64, 63, 61, 60, 59, 58, 60, 61, 63, + 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97, 97, 98, + 99, 99, 97, 99, 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71, + 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99, 99, 100, 100, 101, + 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78, + 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 101, 100, 102, 102, 104, 70, 67, + 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, + 86, 90, 91, 94, 96, 99, 100, 100, 103, 102, 104, 104, 71, 67, 67, 64, + 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89, + 91, 94, 95, 99, 100, 102, 102, 104, 104, 106, 72, 68, 68, 65, 65, 
64, + 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93, + 96, 97, 101, 102, 104, 104, 106, 106, 73, 69, 69, 66, 66, 64, 64, 62, + 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, + 99, 104, 104, 106, 106, 108, + /* Size 4x8 */ + 31, 38, 46, 46, 50, 57, 61, 65, 47, 46, 53, 56, 59, 64, 65, 67, 54, 50, + 57, 66, 74, 82, 85, 82, 64, 60, 62, 71, 79, 88, 97, 99, + /* Size 8x4 */ + 31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71, 50, 59, + 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99, + /* Size 8x16 */ + 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, 34, 36, + 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 48, 46, 47, 47, + 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, 49, 46, 46, 45, 51, 56, + 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, 54, 50, 49, 48, 53, 58, 62, 65, + 70, 73, 75, 78, 79, 79, 80, 77, 63, 58, 56, 54, 59, 64, 67, 71, 77, 82, + 85, 89, 91, 87, 89, 86, 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93, + 94, 96, 97, 95, 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98, + 100, 102, + /* Size 16x8 */ + 32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, 33, 40, + 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, 44, 46, 51, 51, + 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, 49, 45, 53, 58, 62, 67, + 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 54, 49, 55, 62, 70, 77, 77, 76, + 57, 51, 56, 64, 73, 82, 83, 81, 60, 53, 58, 65, 75, 85, 89, 85, 64, 57, + 61, 68, 78, 89, 93, 89, 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71, + 79, 87, 96, 98, 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, + 95, 102, + /* Size 16x32 */ + 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, + 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 31, 31, 32, 32, + 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57, + 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 34, 35, 36, 36, 40, 42, 44, 45, + 46, 47, 46, 46, 45, 46, 47, 47, 
49, 49, 51, 52, 53, 56, 57, 57, 59, 60, + 61, 62, 63, 64, 65, 66, 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46, + 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61, + 62, 63, 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, + 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 48, 47, + 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, + 62, 64, 64, 65, 66, 65, 64, 65, 66, 67, 68, 69, 49, 47, 46, 45, 46, 45, + 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, + 69, 70, 71, 71, 70, 70, 69, 69, 52, 50, 48, 48, 47, 47, 47, 50, 52, 54, + 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72, + 73, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, + 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, + 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74, + 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, 63, 60, 58, 57, + 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, + 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 64, 61, 59, 58, 57, 56, 55, 58, + 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 90, 91, 93, 93, + 94, 93, 90, 90, 92, 93, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, + 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, + 95, 93, 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75, + 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101, 69, + 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, + 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, 69, 66, 66, 63, + 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86, + 87, 91, 91, 95, 96, 101, 101, 103, 103, 105, + /* Size 32x16 */ + 32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, 31, 31, + 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66, 31, 32, 36, 39, + 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 
65, 66, 30, 32, 36, 40, 46, 45, + 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, 33, 36, 40, 43, 47, 46, 46, 47, + 49, 51, 56, 57, 59, 60, 62, 63, 35, 38, 42, 45, 47, 46, 45, 47, 48, 50, + 55, 56, 58, 60, 61, 61, 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, + 57, 58, 60, 61, 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58, + 59, 59, 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60, + 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63, 48, 46, + 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63, 48, 45, 46, 46, + 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66, 49, 45, 45, 46, 53, 56, + 58, 61, 62, 64, 67, 68, 70, 67, 68, 66, 50, 46, 46, 46, 54, 56, 59, 63, + 65, 66, 70, 71, 70, 71, 68, 70, 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, + 71, 72, 73, 71, 72, 70, 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, + 76, 75, 73, 73, 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, + 76, 74, 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78, + 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, 59, 54, + 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, 60, 54, 53, 52, + 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, 63, 57, 56, 55, 60, 64, + 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, 64, 58, 57, 55, 61, 64, 68, 75, + 78, 82, 89, 90, 93, 91, 89, 87, 64, 59, 57, 56, 61, 65, 68, 75, 78, 83, + 90, 91, 94, 93, 92, 91, 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, + 94, 95, 93, 91, 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97, + 97, 95, 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96, + 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101, 70, + 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101, 71, 65, + 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103, 72, 65, 65, + 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103, 73, 66, 66, 63, + 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105, + /* Size 4x16 */ + 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 
62, 64, 65, 48, 46, + 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, 57, 53, 51, 50, + 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83, 68, 63, 60, 58, 61, 64, + 67, 71, 77, 82, 87, 91, 95, 97, 99, 100, + /* Size 16x4 */ + 31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, 44, 51, + 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, 49, 58, 73, 77, + 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, 60, 66, 84, 95, 62, 64, + 84, 97, 64, 66, 81, 99, 65, 68, 83, 100, + /* Size 8x32 */ + 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, + 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 34, 35, 36, 36, + 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, + 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 48, 47, 46, 46, 47, 47, 47, 50, + 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, + 63, 63, 63, 63, 63, 63, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, + 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, + 69, 69, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, + 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 63, 60, + 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, + 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 67, 63, 62, 60, 59, 58, + 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, + 94, 96, 96, 96, 97, 97, 95, 93, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, + 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, + 100, 102, 102, 101, + /* Size 32x8 */ + 32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, 31, 36, + 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, 33, 40, 47, 46, + 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, 37, 44, 47, 45, 48, 54, + 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, 44, 46, 51, 51, 53, 59, 60, 61, + 49, 47, 53, 53, 55, 60, 63, 62, 48, 46, 53, 56, 58, 64, 64, 64, 48, 46, + 53, 56, 59, 65, 66, 65, 49, 45, 53, 
58, 62, 67, 70, 68, 50, 46, 54, 59, + 65, 70, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75, + 76, 73, 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77, + 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82, 60, 53, + 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87, 64, 57, 61, 68, + 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92, 66, 59, 63, 69, 79, 91, + 94, 93, 67, 60, 63, 70, 78, 88, 96, 97, 68, 61, 63, 71, 79, 87, 96, 98, + 69, 62, 63, 71, 80, 88, 96, 100, 70, 63, 63, 70, 80, 89, 97, 100, 71, + 64, 63, 70, 78, 88, 97, 102, 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, + 63, 69, 76, 84, 93, 101 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156, + /* Size 8x8 */ + 32, 32, 35, 42, 53, 68, 78, 90, 32, 33, 36, 42, 51, 64, 74, 84, 35, 36, + 46, 52, 60, 72, 80, 87, 42, 42, 52, 63, 73, 84, 92, 98, 53, 51, 60, 73, + 86, 100, 109, 114, 68, 64, 72, 84, 100, 117, 128, 133, 78, 74, 80, 92, + 109, 128, 140, 155, 90, 84, 87, 98, 114, 133, 155, 168, + /* Size 16x16 */ + 32, 31, 31, 32, 34, 36, 41, 47, 54, 59, 65, 74, 82, 87, 92, 97, 31, 32, + 32, 32, 34, 35, 39, 45, 50, 55, 61, 69, 76, 81, 87, 92, 31, 32, 33, 33, + 35, 36, 40, 44, 49, 54, 59, 67, 73, 78, 83, 88, 32, 32, 33, 35, 37, 38, + 41, 45, 49, 53, 58, 65, 71, 75, 80, 86, 34, 34, 35, 37, 39, 42, 46, 50, + 54, 58, 63, 70, 76, 80, 84, 85, 36, 35, 36, 38, 42, 48, 52, 56, 60, 64, + 68, 75, 80, 85, 90, 91, 41, 39, 40, 41, 46, 52, 57, 62, 67, 71, 75, 83, + 88, 92, 95, 97, 47, 45, 44, 45, 50, 56, 62, 69, 75, 79, 84, 91, 97, 100, + 102, 104, 54, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 106, 110, + 109, 112, 59, 55, 54, 53, 58, 64, 71, 79, 87, 92, 98, 106, 112, 117, + 117, 121, 65, 61, 59, 58, 63, 68, 75, 84, 92, 98, 105, 114, 120, 125, + 126, 130, 74, 69, 67, 65, 70, 75, 83, 91, 100, 106, 114, 123, 131, 135, + 137, 140, 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, + 148, 150, 87, 81, 78, 75, 80, 85, 92, 
100, 110, 117, 125, 135, 144, 150, + 155, 162, 92, 87, 83, 80, 84, 90, 95, 102, 109, 117, 126, 137, 148, 155, + 162, 168, 97, 92, 88, 86, 85, 91, 97, 104, 112, 121, 130, 140, 150, 162, + 168, 174, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 54, 56, + 59, 64, 65, 71, 74, 80, 82, 83, 87, 90, 92, 95, 97, 100, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 51, 53, 56, 61, 62, 68, + 71, 76, 78, 78, 83, 85, 88, 90, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 35, 38, 39, 42, 45, 45, 50, 52, 55, 60, 61, 67, 69, 74, 76, 77, + 81, 84, 87, 89, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, + 38, 41, 44, 44, 49, 51, 54, 58, 59, 65, 68, 72, 74, 75, 79, 81, 84, 86, + 88, 90, 31, 32, 32, 32, 33, 33, 33, 34, 35, 36, 36, 39, 40, 42, 44, 45, + 49, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 80, 83, 85, 88, 90, 31, 32, + 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 45, 45, 50, 51, 54, 58, + 59, 64, 67, 71, 73, 74, 78, 80, 82, 84, 86, 89, 32, 32, 32, 32, 33, 34, + 35, 36, 37, 38, 38, 40, 41, 42, 45, 46, 49, 51, 53, 57, 58, 63, 65, 69, + 71, 72, 75, 78, 80, 83, 86, 89, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, + 40, 42, 43, 44, 47, 47, 51, 53, 55, 59, 60, 65, 67, 71, 73, 73, 77, 78, + 80, 82, 84, 86, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, + 50, 51, 54, 56, 58, 62, 63, 68, 70, 74, 76, 76, 80, 82, 84, 85, 85, 86, + 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 59, 60, + 62, 66, 67, 72, 74, 78, 79, 80, 83, 84, 85, 87, 90, 92, 36, 35, 35, 34, + 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 60, 61, 64, 67, 68, 73, + 75, 79, 80, 81, 85, 87, 90, 91, 91, 92, 39, 38, 38, 37, 39, 39, 40, 42, + 45, 49, 50, 54, 55, 58, 60, 61, 65, 66, 69, 72, 73, 78, 80, 84, 86, 86, + 90, 91, 91, 92, 95, 97, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55, + 57, 60, 62, 63, 67, 69, 71, 75, 75, 80, 83, 86, 88, 89, 92, 93, 95, 97, + 97, 98, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63, 66, 67, + 71, 
73, 75, 79, 79, 84, 86, 90, 92, 92, 96, 98, 98, 98, 101, 104, 47, + 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79, + 83, 84, 89, 91, 95, 97, 97, 100, 99, 102, 105, 104, 104, 48, 46, 45, 44, + 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 76, 78, 80, 84, 85, 90, + 93, 96, 98, 99, 102, 106, 106, 105, 108, 111, 54, 51, 50, 49, 49, 50, + 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, + 104, 106, 106, 110, 108, 109, 112, 112, 111, 56, 53, 52, 51, 51, 51, 51, + 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86, 89, 93, 94, 100, 102, 106, + 108, 109, 112, 113, 115, 114, 116, 119, 59, 56, 55, 54, 54, 54, 53, 55, + 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, 97, 98, 103, 106, 110, 112, + 113, 117, 118, 117, 121, 121, 119, 64, 61, 60, 58, 58, 58, 57, 59, 62, + 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, 103, 109, 112, 116, 118, + 119, 122, 121, 125, 123, 125, 128, 65, 62, 61, 59, 59, 59, 58, 60, 63, + 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105, 111, 114, 118, 120, + 121, 125, 129, 126, 129, 130, 129, 71, 68, 67, 65, 64, 64, 63, 65, 68, + 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117, 120, 125, 127, + 128, 133, 130, 134, 133, 133, 137, 74, 71, 69, 68, 67, 67, 65, 67, 70, + 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 128, 131, + 131, 135, 137, 137, 138, 140, 137, 80, 76, 74, 72, 71, 71, 69, 71, 74, + 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134, 136, + 137, 142, 141, 142, 143, 143, 147, 82, 78, 76, 74, 73, 73, 71, 73, 76, + 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, + 139, 144, 147, 148, 147, 150, 148, 83, 78, 77, 75, 74, 74, 72, 73, 76, + 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139, + 140, 145, 150, 152, 155, 152, 157, 87, 83, 81, 79, 78, 78, 75, 77, 80, + 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, + 144, 145, 150, 151, 155, 158, 162, 158, 90, 85, 84, 81, 80, 80, 78, 78, + 82, 84, 87, 91, 93, 
98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, + 147, 150, 151, 156, 156, 161, 164, 169, 92, 88, 87, 84, 83, 82, 80, 80, + 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137, + 142, 148, 152, 155, 156, 162, 162, 168, 170, 95, 90, 89, 86, 85, 84, 83, + 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, + 143, 147, 155, 158, 161, 162, 168, 168, 174, 97, 92, 92, 88, 88, 86, 86, + 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, + 140, 143, 150, 152, 162, 164, 168, 168, 174, 175, 100, 95, 95, 90, 90, + 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, + 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181, + /* Size 4x8 */ + 32, 32, 34, 41, 51, 65, 75, 86, 35, 36, 47, 53, 61, 73, 81, 92, 59, 57, + 65, 78, 92, 108, 117, 119, 83, 78, 82, 97, 111, 129, 148, 154, + /* Size 8x4 */ + 32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65, 82, 41, 53, 78, 97, 51, 61, + 92, 111, 65, 73, 108, 129, 75, 81, 117, 148, 86, 92, 119, 154, + /* Size 8x16 */ + 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, 31, 32, + 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, 35, 34, 35, 37, + 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, 44, 41, 42, 42, 48, 54, + 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, 53, 50, 49, 49, 54, 60, 67, 75, + 82, 87, 92, 100, 105, 110, 114, 111, 65, 61, 59, 58, 63, 68, 76, 84, 92, + 98, 105, 113, 120, 125, 132, 128, 82, 76, 73, 71, 76, 80, 88, 97, 106, + 112, 120, 131, 139, 144, 150, 147, 90, 85, 81, 79, 81, 87, 93, 101, 108, + 116, 124, 134, 142, 153, 157, 163, + /* Size 16x8 */ + 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, 85, 31, 33, + 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, 71, 79, 34, 35, 41, 48, + 54, 63, 76, 81, 36, 36, 46, 54, 60, 68, 80, 87, 41, 40, 49, 60, 67, 76, + 88, 93, 47, 44, 53, 66, 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106, + 108, 58, 54, 61, 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120, + 124, 74, 67, 73, 86, 100, 113, 
131, 134, 82, 73, 79, 92, 105, 120, 139, + 142, 87, 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150, + 157, 97, 88, 86, 97, 111, 128, 147, 163, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, + 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65, + 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, 31, 32, 32, 32, 33, 33, 34, 34, + 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, + 78, 81, 83, 85, 88, 91, 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, + 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82, + 85, 87, 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, + 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 36, 35, + 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67, + 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97, 44, 42, 41, 41, 42, 42, + 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, + 92, 92, 96, 97, 97, 97, 97, 97, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, + 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, + 101, 104, 107, 110, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, + 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, + 112, 111, 110, 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78, + 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116, + 119, 123, 126, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, + 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, + 130, 128, 126, 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90, + 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135, + 136, 140, 144, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, + 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, + 151, 147, 144, 88, 83, 82, 79, 79, 78, 76, 76, 81, 
82, 85, 89, 91, 97, + 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153, + 153, 154, 159, 163, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, + 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, + 157, 157, 163, 163, 163, 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91, + 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150, + 161, 162, 166, 167, 173, + /* Size 32x16 */ + 32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, 90, 93, 31, 32, + 32, 32, 35, 35, 42, 45, 51, 59, 62, 75, 78, 83, 86, 88, 31, 32, 32, 32, + 34, 35, 41, 45, 50, 58, 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34, + 41, 44, 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36, 42, 44, + 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, 36, 36, 42, 45, 50, 57, + 59, 71, 73, 78, 80, 82, 32, 33, 34, 35, 37, 38, 42, 45, 49, 56, 58, 69, + 71, 76, 79, 83, 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76, + 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73, 76, 81, 81, 80, + 35, 34, 36, 38, 45, 47, 52, 55, 59, 65, 67, 77, 79, 82, 83, 86, 36, 34, + 36, 38, 46, 48, 54, 56, 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, + 48, 50, 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41, 49, 51, + 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41, 42, 43, 51, 53, 63, 66, + 71, 78, 79, 90, 92, 97, 94, 97, 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, + 84, 95, 97, 98, 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96, + 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, 92, 103, 106, + 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77, 84, 92, 94, 106, 108, + 111, 110, 112, 58, 54, 54, 54, 61, 63, 75, 79, 87, 95, 98, 110, 112, + 117, 116, 113, 63, 58, 58, 57, 65, 67, 78, 83, 91, 100, 103, 116, 118, + 119, 119, 121, 65, 60, 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120, + 127, 124, 122, 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127, + 129, 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128, 131, + 135, 134, 130, 79, 
72, 71, 70, 77, 79, 90, 95, 104, 115, 118, 133, 136, + 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97, 105, 117, 120, 136, 139, + 145, 142, 140, 82, 75, 74, 72, 79, 81, 92, 97, 106, 117, 121, 136, 139, + 148, 150, 149, 87, 79, 78, 76, 83, 85, 96, 100, 110, 120, 125, 141, 144, + 148, 153, 150, 89, 82, 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, + 153, 157, 161, 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, + 153, 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, 151, + 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, 128, 140, 147, + 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, 110, 126, 126, 144, 144, + 163, 163, 173, + /* Size 4x16 */ + 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, 36, 35, + 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, 62, 58, 57, 56, + 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, 88, 82, 79, 76, + 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159, + /* Size 16x4 */ + 31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, 38, 56, 76, 34, 42, + 61, 81, 34, 48, 66, 85, 39, 51, 74, 91, 44, 56, 82, 98, 49, 60, 90, 107, + 54, 63, 95, 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145, + 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, + 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32, + 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, + 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 35, 35, 34, 34, 35, 36, 37, 39, + 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, + 83, 83, 84, 85, 86, 87, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, + 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, + 97, 97, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, + 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, + 110, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 
79, 84, 85, 92, + 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126, + 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, + 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, + 144, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, + 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, + 163, 163, + /* Size 32x8 */ + 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62, 78, 86, 31, 32, + 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, 49, 59, 74, 82, 31, 33, 35, 42, + 49, 59, 73, 81, 32, 33, 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58, + 71, 79, 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63, 76, 81, + 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, 60, 68, 80, 87, 39, 39, + 48, 58, 65, 73, 86, 88, 41, 40, 49, 60, 67, 76, 88, 93, 44, 42, 51, 63, + 71, 79, 92, 94, 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85, + 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72, 84, 94, 108, + 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58, 65, 78, 91, 103, 118, + 119, 65, 59, 66, 79, 92, 105, 120, 124, 71, 64, 71, 84, 97, 111, 127, + 129, 74, 67, 73, 86, 100, 113, 131, 134, 79, 71, 77, 90, 104, 118, 136, + 139, 82, 73, 79, 92, 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, + 150, 87, 78, 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, + 157, 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, 151, + 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, 110, 126, 144, + 163 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97, + /* Size 8x8 */ + 31, 34, 46, 47, 50, 57, 61, 65, 34, 39, 47, 45, 48, 53, 57, 61, 46, 47, + 52, 52, 54, 58, 61, 62, 47, 45, 52, 58, 62, 65, 68, 68, 50, 48, 54, 62, + 68, 73, 77, 76, 57, 53, 58, 65, 73, 82, 86, 86, 61, 57, 61, 68, 77, 86, + 91, 95, 65, 61, 62, 68, 76, 86, 95, 100, + /* Size 16x16 */ + 32, 31, 33, 36, 41, 49, 49, 50, 52, 54, 57, 61, 64, 67, 68, 70, 31, 31, 
+ 34, 39, 42, 47, 46, 47, 49, 51, 53, 57, 60, 62, 64, 66, 33, 34, 37, 42, + 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 36, 39, 42, 47, 47, 48, + 46, 46, 47, 48, 50, 53, 55, 57, 59, 61, 41, 42, 44, 47, 48, 50, 49, 50, + 50, 52, 53, 56, 58, 60, 61, 60, 49, 47, 47, 48, 50, 53, 53, 54, 54, 55, + 56, 59, 61, 63, 64, 64, 49, 46, 46, 46, 49, 53, 55, 57, 59, 60, 61, 64, + 66, 67, 67, 67, 50, 47, 46, 46, 50, 54, 57, 61, 63, 64, 66, 69, 70, 72, + 71, 71, 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 75, 75, + 54, 51, 49, 48, 52, 55, 60, 64, 68, 71, 73, 76, 79, 80, 79, 79, 57, 53, + 51, 50, 53, 56, 61, 66, 70, 73, 76, 80, 82, 84, 83, 84, 61, 57, 55, 53, + 56, 59, 64, 69, 73, 76, 80, 84, 87, 89, 88, 88, 64, 60, 57, 55, 58, 61, + 66, 70, 75, 79, 82, 87, 91, 93, 93, 93, 67, 62, 59, 57, 60, 63, 67, 72, + 77, 80, 84, 89, 93, 95, 96, 97, 68, 64, 61, 59, 61, 64, 67, 71, 75, 79, + 83, 88, 93, 96, 99, 100, 70, 66, 63, 61, 60, 64, 67, 71, 75, 79, 84, 88, + 93, 97, 100, 102, + /* Size 32x32 */ + 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 65, 67, 67, 68, 69, 70, 71, 31, 31, 31, 31, + 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 50, 50, 52, 54, 54, 57, + 58, 60, 61, 61, 63, 64, 65, 65, 66, 67, 31, 31, 31, 31, 34, 35, 39, 40, + 42, 46, 47, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 56, 57, 59, 60, 60, + 62, 63, 64, 65, 66, 67, 30, 31, 31, 32, 34, 35, 40, 41, 42, 45, 46, 45, + 45, 45, 46, 46, 47, 48, 49, 51, 52, 54, 55, 57, 58, 58, 60, 61, 62, 62, + 63, 64, 33, 34, 34, 34, 37, 38, 42, 43, 44, 46, 47, 46, 46, 45, 46, 46, + 47, 48, 49, 51, 51, 53, 55, 56, 57, 57, 59, 60, 61, 62, 63, 64, 33, 34, + 35, 35, 38, 39, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, + 51, 53, 54, 56, 57, 57, 59, 60, 60, 61, 62, 62, 36, 38, 39, 40, 42, 43, + 47, 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 47, 48, 49, 50, 52, 53, 54, + 55, 55, 57, 58, 59, 60, 61, 62, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, + 49, 48, 47, 47, 47, 47, 48, 49, 
49, 51, 51, 53, 54, 55, 56, 56, 58, 58, + 58, 59, 60, 60, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, + 50, 50, 50, 51, 52, 53, 53, 55, 56, 57, 58, 58, 60, 61, 61, 61, 60, 60, + 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 53, 54, + 55, 55, 56, 58, 58, 60, 60, 61, 62, 61, 61, 62, 63, 64, 49, 47, 47, 46, + 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, + 59, 60, 61, 61, 63, 63, 64, 64, 64, 64, 48, 47, 46, 45, 46, 46, 46, 48, + 49, 52, 53, 54, 55, 55, 56, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 64, + 66, 65, 65, 65, 66, 67, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, + 55, 57, 57, 58, 59, 59, 60, 61, 61, 63, 64, 65, 66, 66, 67, 67, 67, 68, + 67, 67, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, 59, 60, + 61, 62, 62, 63, 63, 65, 66, 67, 68, 68, 69, 70, 69, 68, 69, 70, 50, 48, + 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 64, 66, + 66, 68, 69, 70, 70, 71, 72, 70, 71, 72, 71, 70, 50, 48, 47, 46, 46, 46, + 46, 47, 50, 53, 54, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 68, 69, 71, + 71, 71, 73, 74, 73, 72, 73, 74, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, + 54, 57, 59, 61, 63, 63, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 75, + 75, 76, 75, 74, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62, + 64, 64, 67, 68, 69, 71, 71, 73, 74, 76, 77, 77, 78, 78, 78, 76, 77, 78, + 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69, + 71, 73, 73, 75, 76, 78, 79, 79, 80, 80, 79, 80, 79, 78, 56, 54, 53, 51, + 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, 76, 78, + 79, 81, 82, 82, 83, 81, 83, 81, 81, 82, 57, 54, 53, 52, 51, 51, 50, 51, + 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, 79, 80, 82, 82, 83, + 84, 85, 83, 84, 84, 82, 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61, + 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 83, 85, 86, 86, 88, 86, 87, 86, + 85, 86, 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, + 73, 74, 76, 79, 80, 83, 84, 86, 
87, 88, 89, 89, 88, 88, 88, 86, 63, 60, + 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, + 82, 85, 86, 89, 90, 90, 92, 91, 91, 90, 89, 91, 64, 61, 60, 58, 57, 57, + 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90, + 91, 91, 93, 93, 93, 92, 93, 91, 65, 61, 60, 58, 57, 57, 55, 56, 58, 61, + 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91, 93, 94, + 95, 95, 93, 95, 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69, + 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95, 95, 96, 96, 97, 95, + 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78, + 80, 81, 85, 86, 89, 91, 93, 94, 95, 97, 97, 98, 98, 100, 68, 65, 64, 62, + 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87, + 88, 91, 93, 95, 96, 97, 99, 98, 100, 100, 69, 65, 65, 62, 62, 61, 60, + 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, + 95, 96, 98, 98, 100, 100, 101, 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, + 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, + 100, 100, 102, 101, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, + 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101, + 101, 104, + /* Size 4x8 */ + 31, 36, 46, 45, 49, 54, 59, 63, 47, 47, 52, 53, 55, 58, 61, 65, 53, 50, + 55, 63, 71, 77, 81, 80, 63, 59, 61, 70, 77, 86, 94, 95, + /* Size 8x4 */ + 31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70, 49, 55, + 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95, + /* Size 8x16 */ + 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, 33, 34, + 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 45, 45, 46, 47, + 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, 49, 46, 45, 45, 49, 53, + 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, 52, 49, 47, 47, 50, 54, 59, 63, + 66, 68, 70, 73, 75, 77, 77, 75, 57, 53, 51, 50, 53, 57, 61, 66, 71, 73, + 76, 80, 83, 84, 86, 83, 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87, + 91, 
93, 94, 92, 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 83, 87, 91, 95, + 97, 98, + /* Size 16x8 */ + 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, 33, 37, + 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, 42, 44, 49, 49, + 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 57, 59, 61, + 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, + 54, 49, 54, 62, 68, 73, 79, 79, 57, 51, 55, 64, 70, 76, 83, 83, 61, 55, + 58, 66, 73, 80, 87, 87, 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, + 77, 84, 93, 95, 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, + 92, 98, + /* Size 16x32 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 31, 31, 32, 32, + 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54, + 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 33, 34, 34, 35, 37, 38, 43, 43, + 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, + 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59, + 60, 61, 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, + 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 48, 47, + 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, + 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67, 49, 47, 46, 45, 45, 46, + 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, + 68, 68, 69, 69, 68, 68, 67, 67, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, + 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70, + 71, 72, 73, 74, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, + 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, + 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71, + 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, 57, 54, 53, 52, + 51, 51, 50, 51, 53, 
56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, + 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 63, 60, 59, 57, 56, 56, 54, 55, + 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, + 92, 90, 88, 88, 89, 90, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, + 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, + 92, 90, 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73, + 75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98, 68, 64, + 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, + 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, 68, 65, 65, 62, 62, 60, + 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89, + 89, 93, 93, 97, 98, 99, 99, 102, + /* Size 32x16 */ + 32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31, 31, + 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32, 34, 39, + 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35, 40, 44, 46, + 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42, 46, 47, 45, 46, + 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46, 47, 46, 46, 47, 50, + 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47, 45, 46, 47, 49, 50, 54, + 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47, 47, 48, 50, 51, 55, 56, 57, + 58, 59, 42, 43, 44, 47, 49, 50, 49, 50, 50, 53, 53, 57, 58, 60, 60, 59, + 47, 46, 46, 48, 51, 52, 53, 53, 53, 55, 56, 60, 61, 61, 61, 62, 49, 46, + 47, 48, 52, 53, 53, 54, 54, 56, 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, + 51, 53, 56, 56, 57, 59, 60, 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, + 57, 57, 59, 61, 61, 65, 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, + 61, 63, 64, 67, 68, 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, + 66, 70, 71, 70, 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, + 71, 73, 71, 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, + 74, 72, 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, + 54, 50, 49, 49, 54, 55, 62, 65, 68, 72, 
73, 78, 79, 80, 79, 76, 56, 51, + 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51, 50, + 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52, 57, 58, + 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58, 59, 66, 69, + 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60, 67, 70, 75, 80, + 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68, 71, 75, 81, 83, 90, + 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71, 75, 81, 83, 90, 91, 94, + 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77, 82, 84, 92, 93, 94, 95, 93, + 67, 61, 60, 58, 61, 63, 69, 70, 78, 80, 85, 90, 93, 96, 97, 97, 68, 62, + 61, 59, 61, 64, 68, 71, 77, 79, 86, 88, 94, 96, 97, 98, 69, 63, 62, 59, + 61, 65, 68, 72, 76, 80, 85, 88, 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, + 67, 73, 75, 81, 83, 89, 92, 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, + 74, 82, 82, 90, 90, 98, 98, 102, + /* Size 4x16 */ + 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, 48, 46, + 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, 56, 52, 50, 49, + 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, 67, 62, 60, 57, 60, 63, + 66, 70, 75, 80, 85, 89, 93, 94, 96, 97, + /* Size 16x4 */ + 31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, 43, 50, + 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, 48, 54, 70, 75, + 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, 58, 61, 81, 93, 60, 63, + 82, 94, 62, 64, 79, 96, 63, 66, 81, 97, + /* Size 8x32 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 33, 34, 34, 35, + 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 45, 45, 45, 44, 46, 46, 47, 48, + 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, + 61, 61, 61, 61, 61, 61, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, + 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, + 67, 67, 52, 50, 
49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, + 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 57, 54, + 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, + 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 64, 61, 60, 58, 57, 57, + 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, + 91, 91, 93, 93, 94, 94, 92, 90, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, + 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, + 97, 99, 98, 98, + /* Size 32x8 */ + 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, 31, 34, + 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, 33, 37, 46, 45, + 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, 37, 43, 47, 45, 47, 50, + 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, 42, 44, 49, 49, 50, 53, 58, 60, + 47, 46, 51, 53, 53, 56, 61, 61, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, + 51, 56, 57, 60, 64, 64, 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, + 61, 64, 68, 67, 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, + 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75, + 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80, 57, 51, + 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85, 61, 55, 58, 66, + 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89, 64, 57, 60, 68, 75, 83, + 91, 91, 64, 58, 60, 68, 75, 83, 91, 94, 66, 59, 61, 69, 77, 84, 93, 95, + 67, 60, 61, 69, 78, 85, 93, 97, 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, + 61, 68, 76, 85, 94, 99, 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, + 74, 82, 90, 98 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140, + /* Size 8x8 */ + 32, 32, 34, 39, 50, 62, 76, 84, 32, 33, 35, 40, 48, 59, 71, 79, 34, 35, + 39, 46, 53, 63, 74, 81, 39, 40, 46, 56, 65, 75, 86, 92, 50, 48, 53, 65, + 78, 90, 101, 106, 62, 59, 63, 75, 90, 105, 118, 123, 76, 71, 74, 86, + 101, 118, 134, 142, 84, 79, 81, 92, 106, 123, 142, 153, + 
/* Size 16x16 */ + 32, 31, 31, 32, 33, 36, 39, 44, 48, 54, 59, 66, 74, 81, 86, 91, 31, 32, + 32, 32, 33, 35, 38, 42, 46, 51, 56, 63, 70, 77, 81, 86, 31, 32, 32, 33, + 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 77, 82, 32, 32, 33, 34, 36, 37, + 40, 42, 45, 49, 53, 59, 66, 71, 75, 80, 33, 33, 34, 36, 38, 42, 44, 46, + 50, 53, 57, 63, 69, 74, 78, 80, 36, 35, 35, 37, 42, 48, 50, 54, 57, 60, + 64, 69, 75, 80, 84, 85, 39, 38, 38, 40, 44, 50, 54, 58, 61, 65, 69, 74, + 80, 85, 89, 91, 44, 42, 41, 42, 46, 54, 58, 63, 67, 71, 75, 80, 86, 91, + 95, 97, 48, 46, 45, 45, 50, 57, 61, 67, 71, 76, 80, 86, 93, 98, 101, + 104, 54, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 112, + 59, 56, 54, 53, 57, 64, 69, 75, 80, 87, 92, 99, 106, 112, 116, 120, 66, + 63, 60, 59, 63, 69, 74, 80, 86, 93, 99, 107, 115, 121, 125, 129, 74, 70, + 67, 66, 69, 75, 80, 86, 93, 100, 106, 115, 123, 130, 135, 138, 81, 77, + 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, 86, 81, + 77, 75, 78, 84, 89, 95, 101, 109, 116, 125, 135, 142, 147, 153, 91, 86, + 82, 80, 80, 85, 91, 97, 104, 112, 120, 129, 138, 148, 153, 159, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 33, 34, 36, 36, 39, 41, 44, 46, 48, 52, + 54, 58, 59, 65, 66, 71, 74, 80, 81, 83, 86, 89, 91, 93, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 50, 51, 56, 56, 62, + 63, 68, 71, 76, 77, 78, 82, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 35, 35, 38, 39, 42, 44, 46, 49, 51, 55, 56, 61, 63, 67, 70, 75, + 77, 78, 81, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 37, 38, 41, 42, 44, 48, 49, 53, 54, 59, 60, 65, 68, 72, 74, 75, 78, 80, + 82, 84, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, + 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 77, 80, 82, 84, 31, 32, + 32, 32, 33, 33, 33, 34, 35, 35, 36, 36, 39, 40, 42, 44, 45, 48, 50, 53, + 54, 59, 60, 64, 67, 71, 73, 74, 77, 79, 81, 83, 32, 32, 32, 32, 33, 33, + 34, 35, 36, 36, 37, 38, 40, 40, 42, 44, 45, 48, 
49, 53, 53, 58, 59, 63, + 66, 70, 71, 72, 75, 78, 80, 83, 32, 32, 32, 32, 33, 34, 35, 35, 36, 37, + 38, 38, 40, 41, 42, 44, 46, 48, 49, 53, 53, 58, 59, 63, 65, 69, 71, 72, + 74, 77, 79, 80, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, + 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 74, 75, 78, 79, 80, 81, + 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 42, 43, 45, 46, 47, 49, 51, 53, + 54, 58, 58, 63, 64, 68, 70, 74, 75, 76, 79, 81, 84, 86, 36, 35, 35, 34, + 35, 36, 37, 38, 42, 42, 48, 48, 50, 51, 54, 55, 57, 59, 60, 63, 64, 68, + 69, 73, 75, 79, 80, 81, 84, 85, 85, 86, 36, 35, 35, 34, 35, 36, 38, 38, + 42, 43, 48, 49, 51, 52, 54, 55, 57, 59, 60, 64, 64, 68, 69, 73, 75, 79, + 80, 81, 84, 86, 88, 91, 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51, + 54, 55, 58, 59, 61, 64, 65, 68, 69, 73, 74, 78, 80, 84, 85, 86, 89, 91, + 91, 91, 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56, 59, 61, + 63, 65, 67, 70, 70, 75, 76, 80, 82, 86, 87, 88, 91, 92, 94, 96, 44, 42, + 42, 41, 41, 42, 42, 42, 46, 47, 54, 54, 58, 59, 63, 65, 67, 70, 71, 75, + 75, 79, 80, 84, 86, 90, 91, 92, 95, 97, 97, 97, 46, 44, 44, 42, 43, 44, + 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 69, 72, 74, 77, 78, 82, 83, 87, + 89, 93, 94, 95, 98, 98, 100, 103, 48, 46, 46, 44, 45, 45, 45, 46, 50, + 51, 57, 57, 61, 63, 67, 69, 71, 74, 76, 80, 80, 85, 86, 90, 93, 96, 98, + 99, 101, 104, 104, 103, 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, + 64, 65, 70, 72, 74, 78, 80, 84, 85, 90, 91, 95, 97, 101, 103, 104, 106, + 106, 107, 110, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, + 71, 74, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, + 112, 110, 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75, + 77, 80, 84, 86, 91, 91, 97, 98, 103, 105, 110, 111, 112, 115, 114, 115, + 118, 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 75, 78, 80, + 85, 87, 91, 92, 98, 99, 103, 106, 110, 112, 113, 116, 119, 120, 119, 65, + 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 
79, 82, 85, 90, 92, + 97, 98, 105, 106, 111, 114, 118, 120, 121, 124, 123, 123, 126, 66, 63, + 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, + 99, 106, 107, 112, 115, 119, 121, 122, 125, 128, 129, 126, 71, 68, 67, + 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103, + 103, 111, 112, 117, 120, 125, 127, 128, 131, 132, 132, 135, 74, 71, 70, + 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, + 106, 114, 115, 120, 123, 128, 130, 131, 135, 135, 138, 136, 80, 76, 75, + 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110, + 110, 118, 119, 125, 128, 134, 136, 137, 140, 142, 140, 144, 81, 77, 77, + 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, + 112, 120, 121, 127, 130, 136, 137, 139, 142, 145, 148, 144, 83, 78, 78, + 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112, + 113, 121, 122, 128, 131, 137, 139, 140, 144, 148, 150, 155, 86, 82, 81, + 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115, + 116, 124, 125, 131, 135, 140, 142, 144, 147, 149, 153, 155, 89, 84, 84, + 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114, + 119, 123, 128, 132, 135, 142, 145, 148, 149, 153, 154, 159, 91, 86, 86, + 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115, + 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159, 159, 93, 88, 88, + 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, + 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164, + /* Size 4x8 */ + 32, 32, 34, 38, 48, 60, 72, 81, 35, 36, 42, 51, 59, 68, 79, 86, 51, 50, + 54, 67, 80, 92, 104, 112, 77, 72, 75, 87, 103, 119, 135, 144, + /* Size 8x4 */ + 32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87, 48, 59, + 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144, + /* Size 8x16 */ + 32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, 31, 32, + 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 
67, 73, 78, 82, 33, 33, 34, 36, + 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, 40, 39, 38, 40, 44, 51, + 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, 51, 49, 47, 48, 52, 58, 63, 69, + 74, 79, 84, 90, 97, 102, 106, 103, 65, 61, 59, 58, 62, 68, 73, 79, 85, + 92, 98, 106, 113, 120, 124, 119, 79, 74, 71, 69, 72, 78, 84, 90, 96, + 103, 110, 119, 128, 135, 140, 137, 87, 82, 79, 77, 78, 84, 89, 96, 103, + 111, 118, 126, 134, 143, 147, 151, + /* Size 16x8 */ + 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, + 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, 69, 77, 33, 34, 38, 44, + 52, 62, 72, 78, 36, 35, 42, 51, 58, 68, 78, 84, 39, 38, 44, 54, 63, 73, + 84, 89, 44, 41, 46, 59, 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103, + 53, 49, 53, 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66, + 60, 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81, + 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147, 91, + 82, 80, 90, 103, 119, 137, 151, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, + 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60, + 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, 31, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, + 73, 74, 78, 80, 82, 85, 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, + 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77, + 79, 81, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, + 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 36, 35, + 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63, + 63, 68, 69, 73, 75, 79, 80, 81, 84, 86, 88, 90, 40, 39, 39, 38, 38, 39, + 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, + 81, 85, 86, 87, 90, 90, 90, 90, 44, 42, 42, 41, 41, 42, 42, 42, 46, 48, + 54, 54, 58, 
59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92, + 95, 98, 100, 102, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, + 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, + 103, 103, 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, + 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114, + 117, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, + 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, + 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, + 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134, 79, + 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, + 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, + 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, + 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151, + 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, + 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, + 152, 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100, + 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155, + 155, 160, + /* Size 32x16 */ + 32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, 87, 90, 31, 32, + 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, 75, 77, 83, 85, 31, 32, 32, 32, + 33, 35, 39, 42, 49, 51, 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34, + 38, 41, 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35, 38, 41, + 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, 35, 36, 39, 42, 48, 50, + 59, 60, 71, 72, 78, 80, 32, 32, 33, 35, 36, 37, 40, 42, 48, 49, 58, 59, + 69, 71, 77, 80, 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70, + 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63, 72, 74, 78, 78, + 34, 34, 34, 37, 39, 42, 45, 48, 53, 54, 63, 64, 73, 75, 80, 83, 36, 34, + 35, 38, 42, 48, 51, 54, 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, + 
42, 48, 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40, 44, 50, + 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38, 39, 41, 45, 51, 56, 59, + 65, 67, 75, 76, 85, 87, 90, 93, 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, + 79, 80, 90, 91, 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83, + 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, 85, 86, 96, 98, + 103, 100, 52, 48, 48, 49, 52, 59, 65, 70, 78, 80, 90, 91, 101, 103, 105, + 107, 53, 49, 49, 50, 53, 60, 66, 71, 79, 82, 92, 93, 103, 105, 111, 107, + 58, 53, 53, 53, 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58, + 54, 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115, 65, 60, + 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119, 122, 123, 66, 61, + 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, 119, 121, 126, 123, 71, 65, + 65, 63, 67, 73, 79, 84, 94, 97, 111, 112, 125, 127, 131, 132, 74, 68, + 67, 66, 69, 75, 81, 86, 97, 100, 113, 115, 128, 130, 134, 132, 79, 72, + 72, 70, 73, 79, 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74, + 73, 71, 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75, + 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151, 86, 78, + 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142, 147, 151, 88, 81, + 80, 77, 80, 86, 90, 98, 105, 112, 122, 127, 140, 144, 152, 155, 91, 83, + 82, 79, 80, 88, 90, 100, 103, 114, 119, 130, 137, 148, 151, 155, 93, 85, + 85, 81, 81, 90, 90, 102, 103, 117, 117, 134, 134, 151, 152, 160, + /* Size 4x16 */ + 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, 36, 35, + 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, 53, 51, 49, 49, + 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, 81, 76, 73, 71, 74, + 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, + /* Size 16x4 */ + 31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, 37, 49, 71, 33, 41, + 53, 74, 34, 48, 60, 80, 37, 50, 65, 85, 41, 53, 71, 91, 45, 56, 76, 98, + 49, 60, 82, 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130, 
74, + 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, + 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, + 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 33, 33, 33, 33, 34, 35, 36, 36, + 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, + 75, 75, 78, 80, 80, 81, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, + 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, + 90, 90, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, + 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103, + 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, + 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, 79, + 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, + 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, + 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, + 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, + 152, + /* Size 32x8 */ + 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, 75, 83, 31, 32, + 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, 47, 59, 72, 79, 31, 32, 34, 38, + 47, 59, 71, 79, 32, 33, 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58, + 69, 77, 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62, 72, 78, + 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, 58, 68, 78, 84, 36, 35, + 42, 51, 59, 68, 79, 85, 39, 38, 44, 54, 63, 73, 84, 89, 40, 39, 45, 56, + 65, 75, 85, 90, 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82, + 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65, 78, 90, 101, + 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53, 57, 69, 83, 97, 109, 113, + 58, 54, 57, 70, 84, 98, 110, 118, 65, 59, 62, 74, 89, 105, 118, 122, 66, + 60, 63, 75, 90, 106, 119, 126, 71, 65, 67, 79, 94, 111, 125, 
131, 74, + 67, 69, 81, 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, + 73, 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, 86, + 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, 140, 152, 91, + 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90, 103, 117, 134, 152 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91, + /* Size 8x8 */ + 31, 34, 42, 47, 49, 54, 60, 64, 34, 39, 45, 46, 47, 51, 56, 59, 42, 45, + 48, 49, 50, 53, 57, 60, 47, 46, 49, 55, 58, 61, 65, 66, 49, 47, 50, 58, + 65, 69, 73, 74, 54, 51, 53, 61, 69, 76, 82, 83, 60, 56, 57, 65, 73, 82, + 89, 92, 64, 59, 60, 66, 74, 83, 92, 96, + /* Size 16x16 */ + 32, 31, 31, 35, 40, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 31, 31, + 32, 37, 41, 47, 47, 46, 48, 49, 51, 54, 57, 60, 62, 64, 31, 32, 34, 39, + 43, 46, 46, 45, 46, 47, 49, 52, 55, 57, 59, 61, 35, 37, 39, 44, 46, 47, + 46, 45, 46, 47, 48, 51, 53, 56, 57, 59, 40, 41, 43, 46, 48, 50, 49, 48, + 49, 49, 51, 53, 55, 57, 59, 59, 49, 47, 46, 47, 50, 53, 53, 53, 54, 54, + 55, 57, 59, 61, 62, 62, 48, 47, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, + 62, 64, 65, 65, 49, 46, 45, 45, 48, 53, 55, 58, 60, 61, 62, 64, 66, 68, + 69, 69, 50, 48, 46, 46, 49, 54, 56, 60, 61, 63, 65, 67, 69, 71, 72, 72, + 52, 49, 47, 47, 49, 54, 57, 61, 63, 66, 68, 71, 73, 75, 76, 77, 54, 51, + 49, 48, 51, 55, 58, 62, 65, 68, 71, 74, 76, 78, 80, 81, 57, 54, 52, 51, + 53, 57, 60, 64, 67, 71, 74, 77, 80, 83, 84, 85, 61, 57, 55, 53, 55, 59, + 62, 66, 69, 73, 76, 80, 84, 87, 89, 89, 64, 60, 57, 56, 57, 61, 64, 68, + 71, 75, 78, 83, 87, 90, 92, 94, 66, 62, 59, 57, 59, 62, 65, 69, 72, 76, + 80, 84, 89, 92, 94, 96, 68, 64, 61, 59, 59, 62, 65, 69, 72, 77, 81, 85, + 89, 94, 96, 98, + /* Size 32x32 */ + 32, 31, 31, 30, 31, 33, 35, 36, 40, 41, 49, 49, 48, 48, 49, 50, 50, 52, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69, 31, 31, 31, 31, + 32, 34, 37, 38, 41, 42, 47, 47, 47, 47, 47, 47, 48, 49, 50, 52, 52, 54, + 
55, 57, 58, 60, 61, 61, 63, 64, 64, 65, 31, 31, 31, 31, 32, 35, 37, 39, + 41, 42, 47, 47, 47, 46, 46, 47, 48, 49, 49, 51, 51, 54, 54, 56, 57, 59, + 60, 61, 62, 63, 64, 65, 30, 31, 31, 32, 33, 35, 38, 40, 42, 42, 46, 46, + 45, 45, 45, 45, 46, 47, 47, 49, 49, 52, 52, 54, 55, 57, 58, 58, 60, 61, + 61, 62, 31, 32, 32, 33, 34, 37, 39, 41, 43, 43, 46, 46, 46, 45, 45, 46, + 46, 47, 47, 49, 49, 51, 52, 54, 55, 57, 57, 58, 59, 60, 61, 62, 33, 34, + 35, 35, 37, 39, 41, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 47, 49, + 49, 51, 51, 53, 54, 56, 57, 57, 58, 59, 60, 61, 35, 37, 37, 38, 39, 41, + 44, 46, 46, 46, 47, 47, 46, 46, 45, 46, 46, 47, 47, 48, 48, 50, 51, 52, + 53, 55, 56, 56, 57, 58, 59, 61, 36, 38, 39, 40, 41, 43, 46, 47, 47, 47, + 48, 47, 46, 46, 45, 46, 46, 46, 47, 48, 48, 50, 50, 52, 53, 54, 55, 55, + 56, 57, 58, 58, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 49, 49, 49, + 48, 49, 49, 49, 49, 51, 51, 52, 53, 54, 55, 57, 57, 58, 59, 59, 59, 59, + 41, 42, 42, 42, 43, 45, 46, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, + 50, 52, 52, 53, 53, 55, 56, 57, 58, 58, 59, 60, 61, 62, 49, 47, 47, 46, + 46, 47, 47, 48, 50, 50, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, + 57, 58, 59, 60, 61, 61, 62, 62, 62, 62, 49, 47, 47, 46, 46, 47, 47, 47, + 49, 50, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 56, 57, 57, 59, 59, 61, + 61, 62, 63, 63, 64, 65, 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, + 54, 54, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 62, 63, 64, 64, 65, 66, + 65, 65, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, + 57, 58, 58, 59, 60, 61, 61, 63, 63, 65, 65, 65, 66, 66, 67, 68, 49, 47, + 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58, 59, 60, 61, 61, 62, + 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 69, 68, 50, 47, 47, 45, 46, 46, + 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, 60, 62, 62, 63, 64, 65, 65, 67, + 68, 69, 69, 70, 70, 70, 71, 71, 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, + 54, 54, 56, 57, 60, 60, 61, 63, 63, 65, 65, 67, 67, 68, 69, 71, 71, 71, + 
72, 73, 72, 71, 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, + 61, 62, 63, 65, 65, 67, 67, 69, 70, 71, 72, 73, 74, 74, 75, 74, 74, 75, + 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, + 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 77, 75, 54, 52, 51, 49, + 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, 70, 73, + 73, 75, 76, 78, 78, 78, 79, 78, 78, 79, 54, 52, 51, 49, 49, 49, 48, 48, + 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 73, 74, 75, 76, 78, + 78, 79, 80, 81, 81, 79, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57, + 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 83, + 82, 83, 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, + 67, 70, 71, 73, 74, 77, 77, 79, 80, 82, 83, 83, 84, 85, 85, 83, 60, 57, + 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75, + 75, 79, 79, 82, 83, 85, 86, 86, 87, 87, 86, 87, 61, 58, 57, 55, 55, 54, + 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83, + 84, 86, 87, 88, 89, 89, 89, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, + 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 89, 90, + 91, 92, 90, 91, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, + 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90, 91, 92, 93, 94, 91, + 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74, + 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 93, 94, 94, 96, 66, 63, 62, 60, + 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84, + 84, 87, 89, 91, 92, 93, 94, 94, 96, 96, 67, 64, 63, 61, 60, 59, 58, 57, + 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92, + 93, 94, 94, 96, 96, 97, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64, + 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96, + 98, 97, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71, + 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99, + /* Size 
4x8 */ + 31, 36, 43, 45, 47, 52, 57, 61, 47, 47, 50, 53, 54, 56, 60, 63, 50, 47, + 50, 58, 66, 70, 75, 77, 61, 57, 58, 65, 74, 82, 90, 93, + /* Size 8x4 */ + 31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65, 47, 54, + 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93, + /* Size 8x16 */ + 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 32, 33, + 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, 40, 41, 43, 46, + 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, 49, 47, 46, 46, 49, 53, + 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, 51, 49, 47, 47, 49, 54, 57, 61, + 63, 65, 67, 69, 72, 73, 75, 72, 57, 54, 51, 50, 52, 57, 60, 64, 67, 71, + 73, 77, 80, 82, 84, 81, 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82, + 86, 89, 91, 89, 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92, + 94, 95, + /* Size 16x8 */ + 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, 31, 35, + 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, 41, 43, 48, 49, + 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, 48, 46, 49, 54, 57, 60, + 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, 50, 46, 49, 57, 63, 67, 71, 73, + 52, 48, 50, 58, 65, 71, 75, 77, 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, + 53, 61, 69, 77, 82, 85, 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, + 73, 82, 89, 92, 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, + 89, 95, + /* Size 16x32 */ + 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 31, 31, 31, 32, + 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52, + 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, 32, 33, 33, 33, 35, 37, 39, 41, + 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, + 58, 58, 59, 60, 61, 62, 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48, + 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58, + 58, 59, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, + 49, 
49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 48, 47, + 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 55, + 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 49, 47, 47, 45, 46, 46, + 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, + 63, 64, 65, 65, 66, 66, 65, 65, 49, 47, 47, 45, 45, 46, 45, 45, 48, 49, + 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68, + 69, 70, 71, 71, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, + 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, + 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66, + 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, 57, 54, 54, 52, + 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, + 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 58, 55, 54, 52, 52, 52, 51, 50, + 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82, + 83, 83, 85, 85, 86, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, + 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, + 89, 87, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, + 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95, 67, 63, + 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, + 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, 67, 64, 64, 61, 61, 60, + 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86, + 86, 90, 90, 95, 95, 96, 96, 98, + /* Size 32x16 */ + 32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31, 31, + 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31, 33, 38, + 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33, 40, 42, 46, + 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41, 43, 46, 46, 45, + 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44, 47, 46, 46, 47, 47, + 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47, 46, 45, 47, 47, 50, 51, + 55, 56, 58, 60, 37, 40, 
41, 47, 47, 47, 46, 45, 46, 47, 50, 50, 54, 55, + 57, 58, 41, 42, 43, 47, 48, 49, 49, 48, 49, 50, 52, 53, 57, 57, 59, 58, + 42, 43, 43, 47, 48, 50, 49, 49, 50, 50, 53, 54, 57, 58, 60, 61, 49, 46, + 47, 48, 50, 53, 53, 53, 54, 54, 57, 57, 60, 61, 62, 61, 49, 46, 47, 48, + 50, 53, 53, 54, 54, 55, 57, 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, + 54, 56, 57, 57, 60, 60, 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, + 58, 58, 61, 61, 65, 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, + 64, 64, 67, 68, 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, + 69, 69, 70, 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, + 73, 71, 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, + 52, 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50, + 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50, 49, + 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50, 52, 56, + 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53, 57, 61, 64, + 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58, 62, 65, 71, 72, + 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63, 66, 72, 73, 80, 81, + 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67, 73, 75, 82, 82, 89, 90, + 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73, 75, 82, 83, 89, 90, 92, 90, + 64, 59, 58, 56, 58, 61, 65, 68, 74, 75, 83, 83, 90, 91, 94, 95, 66, 60, + 59, 57, 59, 62, 66, 69, 75, 76, 84, 85, 91, 92, 94, 95, 67, 61, 60, 58, + 59, 63, 66, 70, 74, 77, 82, 85, 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, + 65, 71, 72, 78, 81, 86, 89, 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, + 71, 79, 79, 87, 87, 95, 95, 98, + /* Size 4x16 */ + 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 48, 47, + 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, 52, 49, 48, 47, + 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, 64, 60, 57, 56, 57, 61, + 64, 68, 71, 75, 78, 83, 87, 90, 92, 94, + /* Size 16x4 */ + 31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 
56, 42, 49, + 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, 46, 54, 64, 71, + 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, 56, 59, 73, 87, 58, 61, + 75, 90, 60, 62, 76, 92, 62, 64, 78, 94, + /* Size 8x32 */ + 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 32, 33, 33, 33, + 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, + 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 40, 41, 41, 42, 43, 44, 46, 47, + 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, + 58, 58, 59, 59, 59, 59, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, + 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, + 65, 65, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, + 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 57, 54, + 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, + 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 63, 60, 59, 57, 57, 56, + 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, + 86, 89, 89, 90, 91, 91, 89, 87, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, + 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, + 94, 96, 95, 95, + /* Size 32x8 */ + 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, 31, 33, + 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, 31, 35, 43, 46, + 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, 35, 39, 46, 46, 47, 50, + 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, 41, 43, 48, 49, 49, 52, 57, 59, + 42, 43, 48, 49, 50, 53, 57, 60, 49, 47, 50, 53, 54, 57, 60, 62, 49, 47, + 50, 53, 54, 57, 61, 63, 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, + 58, 61, 65, 66, 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, + 69, 70, 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74, + 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78, 54, 50, + 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 
69, 76, 82, 83, 57, 52, 53, 61, + 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87, 61, 55, 56, 63, 72, 80, + 86, 88, 63, 57, 57, 64, 73, 82, 89, 92, 64, 58, 58, 65, 73, 82, 89, 92, + 64, 58, 58, 65, 74, 83, 90, 94, 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, + 59, 66, 74, 82, 91, 96, 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, + 71, 79, 87, 95 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134, + /* Size 8x8 */ + 32, 32, 34, 38, 46, 56, 68, 78, 32, 33, 35, 39, 45, 54, 64, 74, 34, 35, + 39, 45, 51, 58, 68, 76, 38, 39, 45, 54, 61, 69, 78, 86, 46, 45, 51, 61, + 71, 80, 90, 99, 56, 54, 58, 69, 80, 92, 103, 113, 68, 64, 68, 78, 90, + 103, 117, 128, 78, 74, 76, 86, 99, 113, 128, 140, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 34, 36, 39, 44, 48, 54, 59, 65, 71, 80, 83, 31, 32, + 32, 32, 32, 34, 35, 38, 42, 46, 51, 56, 62, 68, 76, 78, 31, 32, 32, 32, + 32, 33, 34, 37, 41, 44, 49, 54, 59, 65, 72, 75, 31, 32, 32, 33, 34, 35, + 36, 39, 42, 45, 50, 54, 59, 64, 71, 74, 32, 32, 32, 34, 35, 37, 38, 40, + 42, 46, 49, 53, 58, 63, 69, 72, 34, 34, 33, 35, 37, 39, 42, 45, 47, 51, + 54, 58, 63, 68, 74, 76, 36, 35, 34, 36, 38, 42, 48, 50, 54, 57, 60, 64, + 68, 73, 79, 81, 39, 38, 37, 39, 40, 45, 50, 54, 58, 61, 65, 69, 73, 78, + 84, 86, 44, 42, 41, 42, 42, 47, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92, + 48, 46, 44, 45, 46, 51, 57, 61, 67, 71, 76, 80, 85, 90, 96, 99, 54, 51, + 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 59, 56, 54, + 54, 53, 58, 64, 69, 75, 80, 87, 92, 98, 103, 110, 113, 65, 62, 59, 59, + 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, 71, 68, 65, 64, 63, + 68, 73, 78, 84, 90, 97, 103, 111, 117, 125, 128, 80, 76, 72, 71, 69, 74, + 79, 84, 90, 96, 104, 110, 118, 125, 134, 137, 83, 78, 75, 74, 72, 76, + 81, 86, 92, 99, 106, 113, 121, 128, 137, 140, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 54, 54, 59, 59, 65, 65, 71, 71, 80, 80, 83, 83, 87, 
31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, + 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, 56, 62, 62, 68, + 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, + 75, 79, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, + 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, 75, 79, 31, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, + 50, 54, 54, 59, 59, 64, 64, 71, 71, 74, 74, 77, 31, 32, 32, 32, 32, 33, + 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59, + 59, 64, 64, 71, 71, 74, 74, 77, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, + 37, 38, 38, 40, 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, + 69, 72, 72, 75, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, + 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72, 75, + 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, + 51, 54, 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 34, 34, 34, 33, + 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, 51, 54, 54, 58, + 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 36, 35, 35, 34, 34, 36, 36, 38, + 38, 42, 42, 48, 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, + 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, 73, 79, 79, 81, + 81, 84, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, + 58, 61, 61, 65, 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 39, 38, + 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, 58, 61, 61, 65, + 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 44, 42, 42, 41, 41, 42, + 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, + 79, 84, 84, 90, 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 
42, 42, 42, 47, + 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, + 90, 92, 92, 96, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, + 61, 67, 67, 71, 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, + 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, + 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 54, 51, 51, 49, + 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, + 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 54, 51, 51, 49, 49, 50, 50, + 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, + 97, 97, 104, 104, 106, 106, 109, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, + 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, + 110, 110, 113, 113, 116, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, + 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, 110, 110, + 113, 113, 116, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, + 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, + 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, + 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, + 124, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, + 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, + 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, + 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, 80, + 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, + 104, 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 80, 76, + 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, + 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 83, 78, 78, + 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, + 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 83, 78, 78, + 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 
99, 106, + 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 87, 83, 83, + 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, + 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149, + /* Size 4x8 */ + 32, 32, 34, 37, 45, 54, 65, 75, 35, 36, 42, 50, 56, 63, 73, 81, 51, 50, + 54, 65, 76, 87, 97, 106, 75, 71, 73, 84, 96, 110, 125, 136, + /* Size 8x4 */ + 32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84, 45, 56, + 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136, + /* Size 8x16 */ + 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, 31, 32, + 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 32, 32, 33, 34, + 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, 36, 35, 34, 36, 38, 42, + 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 44, 42, 41, 42, 42, 48, 54, 58, + 63, 67, 71, 75, 79, 84, 90, 92, 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, + 82, 87, 92, 97, 104, 106, 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92, + 98, 105, 111, 118, 121, 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, + 110, 118, 125, 133, 136, + /* Size 16x8 */ + 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, + 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, + 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, + 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90, + 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, + 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, + 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, + 92, 106, 121, 136, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, + 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 
45, 49, 49, 54, 54, 60, 60, 65, + 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, + 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, + 72, 76, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, + 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35, + 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, + 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, + 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, + 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, + 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, + 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, + 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, + 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, + 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 53, 51, 51, + 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, + 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, 59, + 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, + 105, 111, 111, 118, 118, 121, 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, + 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, + 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69, + 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, + 125, 125, 133, 133, 136, 136, 141, 79, 75, 75, 72, 72, 71, 71, 69, 69, + 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, + 125, 125, 133, 133, 136, 136, 141, 87, 82, 82, 78, 78, 77, 77, 75, 75, + 79, 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124, + 132, 132, 141, 141, 144, 144, 149, + /* Size 32x16 */ + 32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, 79, 87, 31, 32, + 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 
32, 32, + 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34, + 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34, 34, 41, + 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, + 50, 59, 59, 71, 71, 77, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, 50, 59, + 59, 71, 71, 77, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, + 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, 69, 75, + 34, 34, 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 34, 34, + 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, + 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38, 38, 48, + 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37, 37, 40, 40, 50, 50, 58, + 58, 65, 65, 73, 73, 84, 84, 89, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, + 65, 73, 73, 84, 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, + 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90, + 90, 95, 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, + 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49, + 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49, 49, + 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 58, 54, 54, 54, + 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 58, 54, 54, 54, 54, + 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 65, 60, 60, 58, 58, 68, + 68, 79, 79, 92, 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68, + 79, 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73, 73, 84, + 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63, 63, 73, 73, 84, 84, + 97, 97, 111, 111, 125, 125, 132, 79, 72, 72, 70, 70, 79, 79, 90, 90, + 104, 104, 118, 118, 133, 133, 141, 79, 72, 72, 70, 70, 79, 79, 90, 90, + 104, 104, 118, 118, 133, 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92, + 106, 106, 121, 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92, + 106, 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96, 96, 
+ 109, 109, 124, 124, 141, 141, 149, + /* Size 4x16 */ + 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 36, 35, + 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 53, 51, 49, 50, + 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 79, 75, 72, 71, 69, + 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136, + /* Size 16x4 */ + 31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, 33, 38, + 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, 41, 53, 71, 90, + 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, 60, 68, 92, 118, 65, + 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, + 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, + 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, + 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, + 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53, 51, + 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, + 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, + 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, + 105, 105, 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, + 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, + 118, 125, 125, 133, 133, 136, 136, 141, + /* Size 32x8 */ + 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, + 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, 31, 32, 33, 34, + 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 32, 34, 36, 42, 50, + 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, 
32, 33, 35, 38, 42, 49, 58, 69, + 34, 34, 37, 42, 48, 54, 63, 73, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, + 38, 48, 54, 60, 68, 78, 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, + 58, 65, 73, 84, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, + 79, 90, 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, + 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 53, 49, + 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, 58, 54, 54, + 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 65, 60, 58, 68, + 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, 71, 65, 63, 73, 84, + 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 79, 72, 70, 79, 90, + 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136, 82, 75, 72, 81, 92, + 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89, + /* Size 8x8 */ + 31, 34, 42, 47, 48, 52, 57, 61, 34, 39, 45, 46, 46, 49, 53, 57, 42, 45, + 48, 49, 50, 52, 55, 58, 47, 46, 49, 54, 56, 58, 61, 64, 48, 46, 50, 56, + 61, 65, 68, 71, 52, 49, 52, 58, 65, 71, 75, 79, 57, 53, 55, 61, 68, 75, + 82, 86, 61, 57, 58, 64, 71, 79, 86, 91, + /* Size 16x16 */ + 32, 31, 30, 33, 36, 41, 49, 48, 49, 50, 52, 54, 57, 60, 63, 65, 31, 31, + 31, 34, 38, 42, 47, 47, 47, 48, 50, 52, 54, 57, 60, 61, 30, 31, 32, 35, + 40, 42, 46, 45, 45, 46, 47, 49, 52, 54, 57, 58, 33, 34, 35, 39, 43, 45, + 47, 46, 45, 46, 47, 49, 51, 53, 56, 57, 36, 38, 40, 43, 47, 47, 48, 46, + 45, 46, 47, 48, 50, 52, 54, 55, 41, 42, 42, 45, 47, 48, 50, 49, 49, 50, + 50, 52, 53, 55, 57, 58, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 55, + 56, 58, 60, 61, 48, 47, 45, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, 61, + 63, 64, 49, 47, 45, 45, 45, 49, 53, 55, 58, 60, 61, 62, 63, 65, 67, 68, + 50, 48, 46, 46, 46, 50, 54, 56, 60, 61, 63, 65, 67, 68, 71, 71, 52, 50, + 47, 47, 47, 50, 54, 57, 61, 63, 66, 68, 70, 72, 75, 75, 54, 52, 49, 49, + 48, 52, 55, 58, 62, 65, 68, 71, 73, 
75, 78, 79, 57, 54, 52, 51, 50, 53, + 56, 60, 63, 67, 70, 73, 76, 79, 82, 83, 60, 57, 54, 53, 52, 55, 58, 61, + 65, 68, 72, 75, 79, 82, 85, 86, 63, 60, 57, 56, 54, 57, 60, 63, 67, 71, + 75, 78, 82, 85, 89, 90, 65, 61, 58, 57, 55, 58, 61, 64, 68, 71, 75, 79, + 83, 86, 90, 91, + /* Size 32x32 */ + 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 49, 50, + 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 65, 65, 67, 31, 31, 31, 31, + 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, + 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 31, 31, 31, 31, 31, 34, 34, 38, + 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 57, + 57, 60, 60, 61, 61, 63, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, + 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, + 58, 60, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, + 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, 58, 60, 33, 34, + 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, + 47, 49, 49, 51, 51, 53, 53, 56, 56, 57, 57, 59, 33, 34, 34, 35, 35, 39, + 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, + 51, 53, 53, 56, 56, 57, 57, 59, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, + 47, 48, 48, 46, 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, + 54, 55, 55, 57, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, + 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55, 57, + 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, + 50, 50, 50, 52, 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 41, 42, 42, 42, + 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 52, + 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 49, 47, 47, 46, 46, 47, 47, 48, + 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, + 58, 60, 60, 61, 61, 62, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, + 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, + 
61, 62, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, + 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 48, 47, + 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 55, 56, 56, 57, + 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 49, 47, 47, 45, 45, 45, + 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, + 63, 65, 65, 67, 67, 68, 68, 69, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, + 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67, + 67, 68, 68, 69, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, + 56, 60, 60, 61, 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, + 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, + 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 52, 50, 50, 47, + 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, + 68, 70, 70, 72, 72, 75, 75, 75, 75, 76, 52, 50, 50, 47, 47, 47, 47, 47, + 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, 68, 70, 70, 72, + 72, 75, 75, 75, 75, 76, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, + 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, + 79, 80, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, + 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, 79, 80, 57, 54, + 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, + 70, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, + 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, + 76, 79, 79, 82, 82, 83, 83, 84, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, + 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, + 85, 86, 86, 88, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, + 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88, + 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, + 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57, + 
57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, + 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 65, 61, 61, 58, 58, 57, 57, 55, + 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, + 86, 90, 90, 91, 91, 93, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, + 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, + 91, 93, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, + 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95, + /* Size 4x8 */ + 31, 36, 43, 46, 46, 50, 54, 59, 47, 47, 50, 53, 54, 55, 58, 61, 50, 47, + 50, 57, 64, 68, 72, 75, 60, 56, 57, 64, 71, 78, 85, 90, + /* Size 8x4 */ + 31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64, 46, 54, + 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90, + /* Size 8x16 */ + 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, 31, 31, + 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 37, 38, 40, 43, + 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, 48, 47, 46, 47, 47, 50, + 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 49, 47, 45, 46, 45, 49, 53, 56, + 58, 59, 61, 62, 64, 65, 67, 68, 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, + 66, 68, 70, 72, 75, 75, 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73, + 76, 79, 82, 83, 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, + 89, 90, + /* Size 16x8 */ + 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 30, 32, + 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, + 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, + 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67, + 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, + 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, + 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, + 83, 90, + /* Size 16x32 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, + 50, 
52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, + 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 31, 31, 31, 32, 32, 36, 36, 40, + 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, + 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, + 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, + 56, 57, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, + 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 47, + 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, + 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 48, 47, 47, 46, 46, 47, + 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, + 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, + 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, + 67, 68, 68, 69, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, + 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, + 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, + 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 52, 50, 50, 48, + 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, + 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50, + 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, + 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, + 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, + 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, + 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, + 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, + 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 66, 63, 63, 60, 60, 59, + 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84, + 84, 
88, 88, 92, 92, 93, 93, 95, + /* Size 32x16 */ + 32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31, 31, + 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31, 31, 38, + 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32, 40, 40, 46, + 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40, 40, 46, 46, 45, + 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, + 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, 47, 51, + 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, + 54, 57, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, 54, 57, + 42, 43, 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 42, 43, + 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, + 48, 53, 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, + 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, + 56, 57, 57, 60, 60, 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, + 57, 60, 60, 64, 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, + 64, 67, 67, 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, + 67, 69, 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, + 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48, + 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48, 47, + 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49, 49, 55, + 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49, 55, 55, 62, + 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, + 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, 70, 76, + 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, + 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, 85, 88, + 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 63, 57, + 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 64, 59, 59, 56, + 56, 61, 61, 68, 68, 75, 
75, 83, 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, + 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, + 69, 77, 77, 84, 84, 92, 92, 95, + /* Size 4x16 */ + 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 48, 47, + 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 52, 50, 48, 47, + 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 63, 60, 57, 56, 54, 57, + 60, 64, 67, 71, 75, 78, 82, 85, 89, 90, + /* Size 16x4 */ + 31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, 40, 47, + 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, 45, 53, 61, 67, + 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 56, 70, 82, 54, 58, + 72, 85, 57, 60, 75, 89, 59, 61, 75, 90, + /* Size 8x32 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, + 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, + 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, + 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, + 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, + 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, + 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, + 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52, 50, + 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, + 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, + 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, + 76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, + 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, + 89, 90, 90, 92, + /* Size 32x8 */ + 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 31, 31, + 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, 30, 32, 40, 46, + 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 
51, 56, 33, 36, 43, 47, 46, 47, + 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, 37, 40, 47, 47, 45, 47, 50, 54, + 42, 43, 47, 50, 49, 50, 53, 57, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, + 48, 53, 53, 54, 57, 60, 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, + 56, 57, 60, 64, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, + 64, 67, 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, + 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 52, 48, + 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, 54, 50, 49, 55, + 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 57, 52, 50, 56, 64, 70, + 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, 60, 54, 52, 58, 65, 72, 79, 85, + 63, 57, 55, 60, 67, 75, 82, 89, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, + 56, 61, 68, 75, 83, 90, 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, + 69, 77, 84, 92 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108, + /* Size 8x8 */ + 31, 32, 32, 35, 42, 51, 59, 69, 32, 32, 33, 35, 41, 49, 56, 65, 32, 33, + 35, 38, 43, 49, 56, 64, 35, 35, 38, 48, 54, 59, 66, 73, 42, 41, 43, 54, + 63, 71, 77, 85, 51, 49, 49, 59, 71, 81, 89, 97, 59, 56, 56, 66, 77, 89, + 98, 108, 69, 65, 64, 73, 85, 97, 108, 119, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 34, 35, 38, 41, 45, 48, 54, 59, 65, 71, 80, 31, 32, + 32, 32, 32, 34, 35, 37, 40, 43, 46, 51, 56, 62, 68, 76, 31, 32, 32, 32, + 32, 33, 34, 36, 38, 41, 44, 49, 54, 59, 65, 72, 31, 32, 32, 33, 34, 35, + 36, 38, 40, 42, 45, 50, 54, 59, 64, 71, 32, 32, 32, 34, 35, 37, 38, 39, + 41, 43, 46, 49, 53, 58, 63, 69, 34, 34, 33, 35, 37, 39, 42, 44, 46, 48, + 51, 54, 58, 63, 68, 74, 35, 35, 34, 36, 38, 42, 46, 48, 50, 53, 55, 59, + 62, 67, 72, 78, 38, 37, 36, 38, 39, 44, 48, 51, 54, 57, 59, 63, 67, 71, + 76, 82, 41, 40, 38, 40, 41, 46, 50, 54, 57, 60, 63, 67, 71, 75, 80, 86, + 45, 43, 41, 42, 43, 48, 53, 57, 60, 65, 68, 72, 76, 81, 85, 91, 48, 46, + 44, 45, 46, 51, 55, 59, 63, 68, 71, 76, 80, 85, 90, 
96, 54, 51, 49, 50, + 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, 59, 56, 54, 54, 53, 58, + 62, 67, 71, 76, 80, 87, 92, 98, 103, 110, 65, 62, 59, 59, 58, 63, 67, + 71, 75, 81, 85, 92, 98, 105, 111, 118, 71, 68, 65, 64, 63, 68, 72, 76, + 80, 85, 90, 97, 103, 111, 117, 125, 80, 76, 72, 71, 69, 74, 78, 82, 86, + 91, 96, 104, 110, 118, 125, 134, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 45, 48, 48, 53, 54, 57, 59, 62, 65, 67, 71, 72, 80, 80, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, + 52, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, 51, 55, 56, 59, + 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, + 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 36, 37, + 38, 41, 41, 44, 44, 49, 49, 52, 54, 56, 59, 61, 65, 65, 72, 72, 31, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 37, 38, 39, 41, 42, 45, + 45, 49, 49, 52, 54, 56, 59, 61, 64, 65, 72, 72, 31, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 45, 45, 49, 50, 52, + 54, 56, 59, 60, 64, 65, 71, 71, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 35, 35, 36, 37, 38, 39, 40, 42, 43, 45, 45, 49, 49, 52, 54, 56, 59, 60, + 64, 64, 70, 70, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 37, 38, 38, + 39, 40, 41, 42, 43, 46, 46, 49, 49, 52, 53, 55, 58, 59, 63, 63, 69, 69, + 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 41, 41, 43, + 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33, + 33, 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, + 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 34, 34, 34, 33, 33, 34, 35, 35, + 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, 54, 57, 58, 60, + 63, 64, 68, 68, 74, 74, 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42, + 
46, 47, 48, 49, 50, 52, 53, 55, 55, 58, 59, 61, 62, 64, 67, 68, 72, 72, + 78, 78, 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48, 50, 50, + 52, 54, 54, 57, 57, 59, 60, 62, 64, 66, 68, 69, 73, 73, 79, 79, 38, 37, + 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51, 52, 54, 56, 57, 59, + 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82, 39, 38, 38, 38, 37, 38, + 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 55, 58, 58, 61, 61, 64, 65, 67, + 69, 71, 73, 74, 78, 78, 84, 84, 41, 40, 40, 39, 38, 39, 40, 40, 41, 41, + 46, 46, 50, 52, 54, 55, 57, 60, 60, 63, 63, 67, 67, 70, 71, 73, 75, 77, + 80, 81, 86, 86, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, + 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, + 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64, + 65, 68, 68, 72, 72, 75, 76, 78, 81, 82, 85, 86, 91, 91, 48, 46, 46, 45, + 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, + 76, 79, 80, 83, 85, 87, 90, 91, 96, 96, 48, 46, 46, 45, 44, 45, 45, 45, + 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, 76, 79, 80, 83, + 85, 87, 90, 91, 96, 96, 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54, + 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, 81, 85, 86, 89, 91, 93, 97, 97, + 103, 103, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, + 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, + 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74, + 75, 79, 79, 85, 85, 89, 90, 93, 96, 98, 102, 102, 108, 108, 59, 56, 56, + 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, + 86, 87, 90, 92, 95, 98, 99, 103, 104, 110, 110, 62, 59, 59, 57, 56, 56, + 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, + 95, 98, 101, 103, 107, 108, 114, 114, 65, 62, 62, 60, 59, 59, 59, 59, + 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, + 105, 106, 111, 111, 118, 118, 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, + 64, 64, 
68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, + 108, 113, 113, 120, 120, 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, + 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113, + 117, 118, 125, 125, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72, + 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118, + 119, 126, 126, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, + 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, + 134, 134, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, + 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, + 134, 134, + /* Size 4x8 */ + 32, 32, 33, 35, 41, 49, 57, 66, 34, 34, 37, 43, 48, 54, 60, 68, 43, 42, + 44, 54, 64, 71, 78, 86, 62, 59, 58, 68, 79, 91, 101, 111, + /* Size 8x4 */ + 32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68, 41, 48, + 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111, + /* Size 8x16 */ + 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, 31, 32, + 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, 32, 32, 33, 34, + 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, 36, 35, 34, 36, 38, 42, + 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, 44, 42, 41, 42, 42, 48, 52, 56, + 60, 64, 67, 71, 75, 79, 84, 90, 53, 51, 49, 50, 49, 54, 59, 63, 67, 72, + 76, 82, 87, 92, 97, 104, 62, 59, 57, 57, 56, 61, 65, 69, 74, 79, 83, 90, + 95, 102, 108, 115, 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105, + 112, 119, 127, + /* Size 16x8 */ + 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, + 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, 32, 33, 35, 38, + 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, + 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, 41, 39, 41, 51, 60, 67, 74, 81, + 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, + 50, 60, 71, 82, 90, 99, 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68, + 79, 92, 102, 112, 71, 
65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, + 104, 115, 127, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49, + 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, + 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36, + 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64, + 71, 71, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, + 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, + 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51, + 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 36, 35, 35, 35, 34, 35, + 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, + 63, 66, 68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, + 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72, + 76, 76, 82, 82, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, + 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, + 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64, + 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, 53, 51, 51, 50, + 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, + 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 53, 51, 51, 50, 49, 49, 50, + 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, + 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, + 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, + 103, 108, 108, 115, 115, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, + 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111, + 111, 118, 118, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 
74, + 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, + 127, 127, 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81, + 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125, + 133, 133, + /* Size 32x16 */ + 32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, 31, 32, + 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, 31, 32, 32, 32, + 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, 31, 32, 32, 32, 32, 33, + 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, 31, 32, 32, 32, 33, 33, 34, 36, + 41, 41, 49, 49, 57, 59, 66, 72, 31, 32, 32, 33, 33, 34, 35, 37, 41, 42, + 49, 49, 57, 59, 66, 71, 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50, + 57, 59, 65, 71, 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59, + 65, 70, 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69, + 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69, 34, 34, + 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 34, 34, 34, 36, + 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 35, 34, 34, 37, 38, 42, + 47, 48, 52, 53, 59, 59, 65, 67, 73, 77, 36, 35, 34, 37, 38, 43, 48, 49, + 54, 54, 60, 60, 66, 68, 74, 78, 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, + 63, 63, 69, 71, 77, 81, 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, + 71, 73, 79, 84, 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, + 81, 86, 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90, + 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, 48, 45, + 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 48, 45, 45, 46, + 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 53, 49, 49, 49, 49, 54, + 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, 53, 50, 49, 50, 50, 54, 60, 63, + 71, 72, 82, 82, 90, 92, 99, 103, 57, 53, 52, 52, 52, 57, 62, 65, 74, 75, + 85, 85, 94, 96, 103, 108, 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87, + 87, 95, 98, 105, 110, 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89, + 98, 101, 108, 114, 65, 60, 60, 59, 58, 63, 68, 
71, 79, 80, 92, 92, 102, + 105, 112, 118, 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, + 114, 120, 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, + 125, 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, + /* Size 4x16 */ + 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, 34, 34, + 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 44, 43, 41, 43, + 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, 65, 62, 59, 59, 58, 63, + 67, 71, 76, 81, 85, 92, 98, 105, 111, 118, + /* Size 16x4 */ + 31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, 32, 37, + 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, 39, 46, 60, 76, + 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, 54, 58, 76, 98, 60, 63, + 80, 105, 66, 68, 85, 111, 73, 74, 91, 118, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, + 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 33, 33, 34, 34, + 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, + 58, 60, 63, 64, 70, 70, 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, + 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, + 79, 79, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, + 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 53, 51, + 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, + 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, + 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, + 94, 95, 98, 102, 103, 108, 108, 115, 115, 73, 70, 69, 67, 66, 66, 65, + 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 
98, 99, 103, + 105, 108, 112, 114, 119, 119, 127, 127, + /* Size 32x8 */ + 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, 31, 32, + 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, 31, 32, 33, 34, + 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, + 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, 32, 33, 35, 38, 42, 49, 56, 64, + 32, 33, 35, 39, 43, 50, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 34, 34, + 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48, + 54, 60, 66, 74, 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65, + 71, 79, 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85, + 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 48, 45, + 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98, 53, 49, 50, 60, + 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103, 58, 54, 54, 63, 75, 87, + 95, 105, 61, 57, 56, 66, 77, 89, 98, 108, 65, 60, 58, 68, 79, 92, 102, + 112, 67, 61, 60, 69, 81, 94, 103, 114, 71, 65, 63, 73, 84, 97, 108, 119, + 72, 66, 64, 73, 85, 98, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127, 79, + 72, 70, 79, 90, 104, 115, 127 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78, + /* Size 8x8 */ + 31, 32, 38, 48, 47, 50, 53, 57, 32, 35, 42, 47, 45, 47, 50, 54, 38, 42, + 47, 48, 45, 47, 49, 52, 48, 47, 48, 53, 53, 54, 56, 58, 47, 45, 45, 53, + 58, 61, 63, 65, 50, 47, 47, 54, 61, 66, 69, 72, 53, 50, 49, 56, 63, 69, + 73, 77, 57, 54, 52, 58, 65, 72, 77, 82, + /* Size 16x16 */ + 32, 31, 30, 33, 36, 41, 47, 49, 49, 49, 50, 52, 54, 57, 60, 63, 31, 31, + 31, 34, 38, 42, 46, 47, 47, 47, 48, 50, 52, 54, 57, 60, 30, 31, 32, 35, + 40, 42, 45, 46, 45, 45, 46, 47, 49, 52, 54, 57, 33, 34, 35, 39, 43, 45, + 47, 46, 46, 45, 46, 47, 49, 51, 53, 56, 36, 38, 40, 43, 47, 47, 47, 47, + 46, 45, 46, 47, 48, 50, 52, 54, 41, 42, 42, 45, 47, 48, 50, 50, 49, 49, + 50, 50, 52, 53, 55, 57, 47, 46, 45, 47, 47, 50, 52, 52, 52, 52, 53, 53, + 55, 56, 
58, 60, 49, 47, 46, 46, 47, 50, 52, 53, 54, 55, 55, 56, 57, 58, + 60, 62, 49, 47, 45, 46, 46, 49, 52, 54, 55, 57, 58, 59, 60, 61, 63, 65, + 49, 47, 45, 45, 45, 49, 52, 55, 57, 59, 60, 61, 63, 64, 66, 68, 50, 48, + 46, 46, 46, 50, 53, 55, 58, 60, 61, 63, 65, 67, 68, 71, 52, 50, 47, 47, + 47, 50, 53, 56, 59, 61, 63, 66, 68, 70, 72, 75, 54, 52, 49, 49, 48, 52, + 55, 57, 60, 63, 65, 68, 71, 73, 75, 78, 57, 54, 52, 51, 50, 53, 56, 58, + 61, 64, 67, 70, 73, 76, 79, 82, 60, 57, 54, 53, 52, 55, 58, 60, 63, 66, + 68, 72, 75, 79, 82, 85, 63, 60, 57, 56, 54, 57, 60, 62, 65, 68, 71, 75, + 78, 82, 85, 89, + /* Size 32x32 */ + 32, 31, 31, 30, 30, 32, 33, 34, 36, 37, 41, 41, 47, 49, 49, 48, 49, 49, + 49, 50, 50, 52, 52, 54, 54, 56, 57, 58, 60, 60, 63, 63, 31, 31, 31, 31, + 31, 32, 34, 35, 38, 38, 42, 42, 46, 48, 47, 47, 47, 47, 47, 48, 48, 50, + 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 31, 31, 31, 31, 31, 33, 34, 35, + 38, 39, 42, 42, 46, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 51, 52, 53, + 54, 55, 57, 57, 60, 60, 30, 31, 31, 31, 31, 33, 35, 36, 39, 40, 42, 42, + 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, + 58, 58, 30, 31, 31, 31, 32, 33, 35, 36, 40, 40, 42, 42, 45, 46, 46, 45, + 45, 45, 45, 46, 46, 47, 47, 49, 49, 51, 52, 52, 54, 54, 57, 57, 32, 32, + 33, 33, 33, 35, 37, 38, 41, 42, 43, 43, 46, 47, 46, 46, 45, 45, 45, 46, + 46, 47, 47, 49, 49, 50, 51, 52, 54, 54, 57, 57, 33, 34, 34, 35, 35, 37, + 39, 40, 43, 43, 45, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49, + 49, 50, 51, 52, 53, 54, 56, 56, 34, 35, 35, 36, 36, 38, 40, 41, 44, 44, + 45, 45, 47, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 48, 49, 50, 51, 51, + 53, 53, 55, 55, 36, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 47, 48, + 47, 46, 46, 45, 45, 46, 46, 46, 47, 48, 48, 49, 50, 50, 52, 52, 54, 54, + 37, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 45, + 46, 46, 46, 47, 47, 48, 48, 49, 50, 51, 52, 52, 55, 55, 41, 42, 42, 42, + 42, 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 
49, 49, 49, 49, 50, 50, 50, + 50, 51, 52, 52, 53, 54, 55, 55, 57, 57, 41, 42, 42, 42, 42, 43, 45, 45, + 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, + 53, 54, 55, 55, 57, 57, 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50, + 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 54, 55, 55, 56, 56, 58, 58, + 60, 60, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53, 53, 53, + 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 58, 60, 60, 49, 47, + 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, + 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62, 48, 47, 47, 46, 45, 46, + 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, + 58, 59, 60, 60, 61, 62, 63, 63, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, + 49, 49, 52, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, + 63, 63, 65, 65, 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, + 55, 55, 57, 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 67, 67, + 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59, + 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 50, 48, 48, 47, + 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, + 63, 65, 65, 66, 67, 67, 68, 69, 71, 71, 50, 48, 48, 47, 46, 46, 46, 46, + 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, 63, 65, 65, 66, + 67, 67, 68, 69, 71, 71, 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50, + 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 67, 68, 69, 70, 71, 72, 72, + 74, 74, 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, + 59, 61, 61, 63, 63, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 54, 51, + 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65, + 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 77, 54, 52, 52, 50, 49, 49, + 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70, + 71, 72, 73, 74, 75, 76, 78, 78, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, + 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 
69, 69, 71, 72, 73, 75, 75, + 77, 77, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56, + 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76, 77, 79, 79, 82, 82, + 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64, + 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 80, 80, 83, 83, 60, 57, 57, 55, + 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72, + 72, 74, 75, 77, 79, 80, 82, 82, 85, 85, 60, 57, 57, 55, 54, 54, 54, 53, + 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77, + 79, 80, 82, 82, 85, 85, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, + 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, + 89, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, + 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89, + /* Size 4x8 */ + 31, 33, 40, 47, 45, 48, 51, 55, 42, 44, 47, 50, 49, 50, 52, 55, 47, 45, + 46, 54, 59, 61, 63, 66, 54, 51, 50, 57, 64, 70, 75, 79, + /* Size 8x4 */ + 31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, 45, 49, + 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79, + /* Size 8x16 */ + 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, 31, 31, + 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, 37, 38, 40, 43, + 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, 48, 47, 46, 47, 47, 50, + 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, 49, 47, 45, 46, 45, 49, 53, 55, + 57, 58, 59, 61, 62, 64, 65, 67, 52, 50, 48, 47, 47, 50, 53, 56, 59, 62, + 64, 66, 68, 70, 72, 75, 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70, + 72, 75, 77, 80, 61, 57, 55, 54, 52, 56, 58, 61, 63, 66, 69, 73, 76, 79, + 82, 86, + /* Size 16x8 */ + 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, + 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, 37, 40, 47, 47, + 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, + 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, 48, 45, 46, 53, 57, 59, 61, 63, + 49, 
45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, + 47, 54, 61, 66, 70, 73, 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, + 64, 70, 75, 79, 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, + 80, 86, + /* Size 16x32 */ + 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, + 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32, + 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, + 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 31, 31, 31, 32, 32, 34, 36, 37, + 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, + 52, 53, 54, 55, 57, 57, 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46, + 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, + 56, 56, 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, + 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 42, 42, + 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50, + 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 58, 48, 47, 47, 46, 46, 46, + 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, + 55, 56, 56, 57, 58, 59, 60, 60, 48, 47, 47, 46, 46, 46, 46, 47, 47, 47, + 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, + 60, 60, 62, 62, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, + 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59, + 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 52, 50, 50, 48, + 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, + 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 52, 50, 50, 48, 48, 48, 47, 47, + 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, + 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, + 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, + 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 
53, 53, 56, 57, 58, 60, + 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82, 61, 58, + 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, + 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, 63, 60, 60, 58, 57, 57, + 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77, + 78, 80, 82, 83, 85, 85, 89, 89, + /* Size 32x16 */ + 32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31, 31, + 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31, 31, 36, + 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32, 37, 39, 42, + 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37, 40, 42, 46, 46, + 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41, 44, 46, 46, 45, 45, + 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45, 47, 46, 46, 46, 47, 47, + 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47, 47, 45, 46, 47, 47, 50, 51, + 53, 55, 37, 40, 40, 45, 47, 47, 47, 47, 45, 46, 47, 47, 49, 50, 52, 54, + 37, 40, 40, 45, 47, 47, 48, 47, 46, 46, 47, 47, 49, 50, 53, 55, 42, 43, + 43, 46, 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, + 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, + 52, 52, 53, 53, 53, 53, 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, + 53, 54, 54, 54, 56, 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, + 56, 56, 58, 58, 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, + 59, 60, 62, 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, + 63, 65, 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, + 49, 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47, + 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46, 46, + 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47, 47, 50, + 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47, 50, 54, 56, + 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51, 55, 57, 62, 62, + 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55, 57, 62, 63, 68, 
68, + 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58, 63, 63, 69, 69, 74, 75, + 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64, 64, 70, 70, 75, 76, 79, 82, + 58, 53, 53, 51, 51, 54, 57, 59, 64, 65, 71, 71, 76, 77, 80, 83, 60, 55, + 54, 53, 52, 55, 58, 60, 65, 66, 72, 72, 77, 79, 82, 85, 60, 55, 55, 53, + 53, 55, 59, 60, 65, 66, 73, 73, 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, + 60, 62, 67, 68, 75, 75, 80, 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, + 67, 68, 75, 75, 80, 82, 86, 89, + /* Size 4x16 */ + 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, 42, 42, + 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, 49, 47, 45, 46, + 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, 57, 54, 52, 51, 50, 53, + 56, 58, 61, 64, 67, 71, 73, 76, 79, 82, + /* Size 16x4 */ + 31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, 40, 47, + 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, 46, 49, 57, 61, + 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, 50, 52, 63, 73, 52, 53, + 64, 76, 55, 55, 66, 79, 58, 58, 68, 82, + /* Size 8x32 */ + 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, + 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32, + 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, + 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 37, 38, 38, 39, 40, 41, 43, 44, + 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, + 50, 51, 52, 53, 55, 55, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, + 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, + 60, 60, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, + 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 52, 50, + 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, + 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, + 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, + 72, 74, 75, 76, 77, 78, 80, 80, 61, 58, 57, 
56, 55, 54, 54, 53, 52, 53, + 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, + 82, 83, 86, 86, + /* Size 32x8 */ + 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, 31, 31, + 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, 30, 32, 40, 46, + 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, 33, 36, 43, 47, 46, 47, + 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, 37, 40, 47, 47, 45, 47, 49, 52, + 37, 40, 47, 48, 46, 47, 49, 53, 42, 43, 47, 50, 49, 50, 53, 56, 42, 43, + 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, + 53, 54, 56, 59, 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, + 59, 62, 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66, + 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 50, 46, + 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72, 52, 48, 47, 54, + 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75, 54, 50, 49, 55, 62, 68, + 72, 76, 55, 51, 49, 56, 63, 69, 74, 78, 57, 52, 50, 56, 64, 70, 75, 79, + 58, 53, 51, 57, 64, 71, 76, 80, 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, + 53, 59, 65, 73, 78, 83, 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, + 67, 75, 80, 86 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92, + /* Size 8x8 */ + 31, 32, 32, 34, 38, 46, 52, 63, 32, 32, 32, 34, 37, 44, 49, 59, 32, 32, + 35, 37, 40, 45, 49, 58, 34, 34, 37, 42, 47, 52, 56, 65, 38, 37, 40, 47, + 54, 60, 65, 73, 46, 44, 45, 52, 60, 69, 75, 84, 52, 49, 49, 56, 65, 75, + 82, 92, 63, 59, 58, 65, 73, 84, 92, 105, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 54, 58, 61, 65, 31, 32, + 32, 32, 32, 32, 34, 35, 38, 40, 42, 46, 51, 55, 58, 62, 31, 32, 32, 32, + 32, 32, 33, 34, 37, 38, 41, 44, 49, 53, 56, 59, 31, 32, 32, 33, 33, 33, + 35, 36, 38, 40, 42, 45, 49, 53, 56, 59, 32, 32, 32, 33, 34, 34, 36, 37, + 39, 40, 42, 45, 49, 53, 55, 59, 32, 32, 32, 33, 34, 35, 37, 38, 40, 41, + 42, 46, 49, 52, 55, 58, 
34, 34, 33, 35, 36, 37, 39, 42, 44, 46, 47, 51, + 54, 57, 60, 63, 36, 35, 34, 36, 37, 38, 42, 48, 50, 52, 54, 57, 60, 63, + 65, 68, 38, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72, + 41, 40, 38, 40, 40, 41, 46, 52, 54, 57, 60, 63, 67, 70, 73, 75, 44, 42, + 41, 42, 42, 42, 47, 54, 57, 60, 63, 67, 71, 74, 77, 79, 48, 46, 44, 45, + 45, 46, 51, 57, 60, 63, 67, 71, 76, 79, 82, 85, 54, 51, 49, 49, 49, 49, + 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, 58, 55, 53, 53, 53, 52, 57, 63, + 67, 70, 74, 79, 86, 90, 93, 97, 61, 58, 56, 56, 55, 55, 60, 65, 69, 73, + 77, 82, 89, 93, 97, 101, 65, 62, 59, 59, 59, 58, 63, 68, 72, 75, 79, 85, + 92, 97, 101, 105, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 54, 54, 58, 59, 61, 65, 65, 70, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 46, + 47, 49, 52, 52, 56, 57, 59, 63, 63, 67, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 45, 46, 48, 51, 51, + 55, 56, 58, 62, 62, 67, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 35, 35, 37, 38, 39, 42, 42, 45, 45, 47, 50, 50, 54, 55, 57, 61, + 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 37, 37, 38, 41, 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 37, 38, 41, + 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, + 49, 49, 53, 54, 56, 59, 59, 63, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 34, 35, 35, 36, 36, 36, 38, 39, 40, 42, 42, 45, 45, 47, 50, 50, 53, 54, + 56, 59, 59, 63, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, + 37, 37, 39, 39, 40, 42, 42, 45, 45, 47, 49, 49, 53, 54, 55, 59, 59, 63, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, + 41, 42, 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 32, 
32, 32, 32, + 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 42, 42, 45, + 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 33, 33, 33, 33, 33, 33, 34, 35, + 35, 36, 36, 38, 39, 40, 42, 42, 43, 44, 45, 46, 46, 49, 50, 51, 53, 53, + 56, 57, 59, 62, 62, 66, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39, + 39, 41, 42, 42, 44, 45, 46, 47, 47, 50, 51, 52, 54, 54, 57, 58, 60, 63, + 63, 67, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, 45, 45, + 46, 47, 48, 50, 50, 52, 53, 54, 56, 56, 59, 60, 62, 65, 65, 69, 36, 35, + 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, + 54, 56, 57, 58, 60, 60, 63, 64, 65, 68, 68, 72, 36, 35, 35, 35, 34, 34, + 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58, + 60, 60, 63, 64, 65, 68, 68, 72, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, + 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, + 69, 72, 72, 76, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47, + 50, 50, 53, 54, 55, 58, 58, 60, 61, 62, 65, 65, 68, 69, 70, 73, 73, 77, + 41, 40, 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55, + 57, 60, 60, 62, 63, 65, 67, 67, 70, 71, 73, 75, 75, 79, 44, 42, 42, 42, + 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, + 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 42, 41, 41, 42, 42, + 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, + 74, 75, 77, 79, 79, 83, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49, + 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 70, 72, 75, 75, 78, 79, 81, 84, + 84, 88, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, + 60, 61, 63, 67, 67, 70, 71, 73, 76, 76, 79, 80, 82, 85, 85, 89, 50, 49, + 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, + 68, 72, 73, 75, 78, 78, 82, 83, 85, 88, 88, 92, 54, 52, 51, 50, 49, 49, + 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, + 82, 82, 86, 87, 89, 92, 92, 96, 54, 52, 51, 50, 49, 49, 49, 
50, 49, 49, + 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, + 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, + 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, + 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69, + 71, 75, 75, 79, 80, 83, 87, 87, 91, 92, 94, 98, 98, 102, 61, 59, 58, 57, + 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81, + 82, 85, 89, 89, 93, 94, 97, 101, 101, 105, 65, 63, 62, 61, 59, 59, 59, + 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, + 92, 97, 98, 101, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, + 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, + 101, 105, 105, 109, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, + 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109, + 109, 114, + /* Size 4x8 */ + 32, 32, 32, 34, 38, 44, 50, 61, 32, 33, 35, 37, 40, 45, 50, 58, 42, 41, + 42, 50, 58, 66, 71, 79, 56, 53, 52, 59, 68, 78, 86, 97, + /* Size 8x4 */ + 32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, 38, 40, + 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97, + /* Size 8x16 */ + 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65, 31, 32, + 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, 32, 32, 33, 34, + 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 35, 35, 34, 35, 36, 37, + 41, 46, 47, 49, 51, 54, 57, 60, 63, 66, 39, 38, 37, 38, 39, 40, 44, 50, + 52, 54, 57, 60, 64, 67, 69, 72, 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, + 63, 67, 71, 74, 77, 79, 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76, + 82, 86, 89, 92, 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97, + 100, 105, + /* Size 16x8 */ + 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62, 31, 32, + 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, + 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 34, 34, 37, 41, 44, 48, 
+ 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, + 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 48, 45, + 46, 54, 60, 67, 76, 85, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, + 67, 74, 86, 97, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, + 92, 105, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44, + 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, + 53, 54, 56, 60, 60, 64, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, + 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59, + 59, 63, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, + 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 32, 32, + 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, + 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, + 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, + 57, 57, 60, 61, 63, 66, 66, 70, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, + 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63, + 65, 68, 68, 72, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, + 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, + 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, + 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 41, + 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, + 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 51, 49, 49, 48, 47, 47, 48, 48, + 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79, + 83, 84, 86, 89, 89, 93, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, + 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 
76, 78, 82, 82, 86, 87, 89, 92, + 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, + 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 65, 63, + 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, + 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, 65, 63, 62, 61, 59, + 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, + 88, 92, 92, 97, 98, 100, 105, 105, 109, + /* Size 32x16 */ + 32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, 31, 32, + 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, 31, 32, 32, 32, + 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, 31, 32, 32, 32, 32, 32, + 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, 31, 32, 32, 32, 33, 33, 34, 34, + 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, + 41, 47, 49, 53, 59, 59, 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, + 49, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53, + 59, 59, 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58, + 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 32, 32, + 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 33, 33, 33, 35, + 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62, 34, 34, 34, 35, 37, 37, + 41, 42, 44, 48, 48, 53, 54, 57, 63, 63, 34, 34, 34, 35, 37, 37, 43, 44, + 46, 50, 50, 55, 56, 59, 65, 65, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, + 54, 58, 60, 63, 68, 68, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, + 60, 63, 68, 68, 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, + 72, 72, 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73, + 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, 44, 41, + 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 44, 41, 41, 42, + 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 47, 44, 44, 44, 45, 45, + 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, 48, 45, 45, 45, 46, 46, 54, 56, + 60, 67, 67, 74, 76, 79, 85, 85, 50, 47, 46, 47, 47, 
47, 55, 58, 61, 68, + 68, 76, 78, 82, 88, 88, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, + 82, 86, 92, 92, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, + 92, 92, 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97, + 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98, 61, 57, + 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100, 65, 61, 60, + 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 65, 61, 60, 59, + 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 70, 65, 64, 63, 62, + 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109, + /* Size 4x16 */ + 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, 32, 32, + 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 44, 42, 41, 42, + 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 58, 55, 53, 53, 53, 52, + 57, 63, 67, 70, 74, 79, 86, 90, 93, 97, + /* Size 16x4 */ + 31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, 32, 34, + 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, 37, 40, 57, 67, + 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, 50, 50, 71, 86, 54, 53, + 74, 90, 57, 56, 77, 93, 61, 58, 79, 97, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, + 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 32, 32, 32, 32, 33, 33, 34, 34, + 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, + 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, + 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, + 66, 70, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, + 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44, 42, + 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, + 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 53, 52, 51, 50, 49, 49, + 49, 50, 49, 49, 49, 
53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, + 82, 82, 86, 87, 89, 92, 92, 96, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, + 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, + 100, 105, 105, 109, + /* Size 32x8 */ + 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, 31, 32, + 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, 31, 32, 33, 34, + 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, + 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, 32, 32, 34, 36, 39, 42, 49, 58, + 32, 33, 35, 37, 40, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 33, 33, + 36, 40, 43, 46, 53, 62, 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43, + 46, 50, 56, 65, 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54, + 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73, + 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 44, 41, + 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84, 48, 45, 46, 54, + 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88, 53, 49, 50, 57, 64, 71, + 82, 92, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97, + 58, 54, 54, 61, 68, 75, 87, 98, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, + 58, 66, 72, 79, 92, 105, 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, + 70, 76, 83, 96, 109 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 41, 46, 51, 41, 48, 48, 51, 46, 48, 58, 62, 51, 51, 62, 71, + /* Size 8x8 */ + 31, 31, 38, 44, 47, 48, 50, 55, 31, 32, 40, 44, 45, 46, 47, 52, 38, 40, + 47, 47, 46, 46, 47, 50, 44, 44, 47, 50, 51, 51, 52, 54, 47, 45, 46, 51, + 54, 56, 57, 60, 48, 46, 46, 51, 56, 61, 63, 66, 50, 47, 47, 52, 57, 63, + 66, 70, 55, 52, 50, 54, 60, 66, 70, 76, + /* Size 16x16 */ + 32, 31, 30, 33, 34, 36, 41, 49, 48, 49, 49, 50, 52, 54, 55, 57, 31, 31, + 31, 34, 36, 38, 42, 47, 47, 47, 47, 48, 50, 51, 53, 54, 30, 31, 32, 34, + 37, 40, 42, 46, 45, 45, 45, 46, 47, 49, 50, 52, 33, 34, 34, 37, 40, 42, + 44, 47, 46, 46, 45, 46, 47, 49, 50, 51, 34, 36, 37, 40, 42, 45, 46, 47, + 
46, 46, 45, 46, 47, 48, 49, 50, 36, 38, 40, 42, 45, 47, 47, 48, 47, 46, + 45, 46, 47, 48, 49, 50, 41, 42, 42, 44, 46, 47, 48, 50, 50, 49, 49, 50, + 50, 51, 52, 53, 49, 47, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 55, + 56, 56, 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59, + 49, 47, 45, 46, 46, 46, 49, 53, 54, 55, 57, 58, 59, 60, 60, 61, 49, 47, + 45, 45, 45, 45, 49, 53, 55, 57, 58, 60, 61, 62, 63, 63, 50, 48, 46, 46, + 46, 46, 50, 54, 56, 58, 60, 61, 63, 65, 66, 67, 52, 50, 47, 47, 47, 47, + 50, 54, 57, 59, 61, 63, 66, 68, 69, 70, 54, 51, 49, 49, 48, 48, 51, 55, + 58, 60, 62, 65, 68, 70, 71, 73, 55, 53, 50, 50, 49, 49, 52, 56, 58, 60, + 63, 66, 69, 71, 73, 74, 57, 54, 52, 51, 50, 50, 53, 56, 59, 61, 63, 67, + 70, 73, 74, 76, + /* Size 32x32 */ + 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 48, 48, + 49, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31, + 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 47, 47, 47, 47, 47, 48, + 49, 49, 50, 50, 52, 52, 53, 55, 55, 57, 31, 31, 31, 31, 31, 31, 34, 34, + 36, 38, 38, 41, 42, 44, 47, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 50, + 51, 52, 53, 54, 54, 56, 31, 31, 31, 31, 31, 31, 34, 35, 36, 39, 39, 41, + 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, + 53, 55, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, + 45, 45, 45, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 30, 31, + 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 45, 45, 45, 45, + 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 33, 33, 34, 34, 34, 34, + 37, 38, 40, 42, 42, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, + 47, 47, 49, 49, 50, 51, 51, 53, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, + 43, 44, 45, 46, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, + 50, 51, 51, 53, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, + 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, + 36, 38, 38, 39, 40, 40, 42, 43, 45, 
47, 47, 47, 47, 47, 48, 48, 47, 46, + 46, 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 36, 38, 38, 39, + 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 46, + 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 40, 41, 41, 41, 42, 42, 44, 44, + 45, 47, 47, 48, 48, 49, 50, 50, 49, 49, 49, 48, 48, 49, 49, 49, 49, 49, + 51, 51, 51, 52, 52, 54, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48, + 48, 49, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 53, + 53, 55, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 53, 53, 54, 54, 54, 56, 49, 48, + 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, + 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, 49, 48, 47, 47, 46, 46, + 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, + 54, 54, 55, 55, 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, + 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, + 58, 59, 59, 60, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, + 53, 53, 54, 54, 55, 55, 55, 56, 56, 57, 57, 57, 58, 58, 59, 60, 60, 61, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55, + 55, 57, 57, 57, 58, 58, 59, 59, 60, 60, 60, 61, 61, 63, 49, 47, 47, 46, + 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, + 60, 60, 61, 61, 62, 62, 63, 63, 63, 65, 49, 47, 47, 46, 45, 45, 45, 45, + 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, + 62, 62, 63, 63, 63, 65, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, + 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 61, 62, 63, 63, 64, 64, 65, 66, + 66, 67, 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, + 56, 56, 58, 60, 60, 61, 61, 62, 63, 63, 65, 65, 66, 67, 67, 68, 51, 49, + 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60, + 60, 62, 62, 63, 65, 65, 66, 66, 67, 68, 68, 70, 52, 50, 50, 49, 47, 47, + 47, 47, 47, 47, 47, 49, 50, 52, 54, 
54, 57, 57, 59, 61, 61, 63, 63, 65, + 66, 66, 68, 68, 69, 70, 70, 72, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, + 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68, + 69, 70, 70, 72, 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, + 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, + 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58, + 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 72, 73, 73, 75, 55, 53, 53, 52, + 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65, + 66, 67, 69, 69, 71, 72, 73, 74, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, + 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, + 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, + 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, + 76, 78, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58, + 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80, + /* Size 4x8 */ + 31, 32, 39, 44, 46, 47, 48, 53, 38, 40, 47, 47, 47, 46, 47, 50, 47, 45, + 45, 51, 56, 59, 61, 64, 52, 49, 48, 53, 58, 64, 68, 73, + /* Size 8x4 */ + 31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, 46, 47, + 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73, + /* Size 8x16 */ + 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, 31, 31, + 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, 37, 38, 40, 42, + 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 45, 45, 44, 46, 46, 47, + 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, 48, 47, 45, 46, 46, 47, 50, 53, + 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, + 58, 59, 61, 62, 63, 64, 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64, + 66, 68, 69, 70, 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73, + 74, 76, + /* Size 16x8 */ + 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, 30, 32, + 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, 35, 37, 44, 46, 
+ 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 42, 43, 47, 49, 50, 49, + 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, + 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, + 46, 52, 56, 59, 64, 67, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, + 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, + 70, 76, + /* Size 16x32 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, + 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31, + 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, + 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, 31, 31, 31, 32, 32, 32, 35, 36, + 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, + 49, 50, 51, 52, 52, 54, 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, + 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, + 51, 53, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, + 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 37, 38, + 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, + 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, + 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, + 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, + 47, 49, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, + 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, + 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, + 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, + 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 49, 47, 47, 46, + 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, + 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 51, 50, 49, 48, 47, 47, 47, 47, + 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65, + 67, 67, 68, 69, 69, 70, 52, 50, 50, 49, 48, 
48, 47, 47, 47, 47, 47, 50, + 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, + 70, 72, 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, + 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 57, 55, + 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, + 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, + 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, + 71, 71, 73, 73, 74, 76, 76, 78, + /* Size 32x16 */ + 32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31, 31, + 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31, 31, 34, + 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32, 34, 39, 39, + 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35, 40, 40, 44, 46, + 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40, 40, 44, 46, 45, 45, + 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42, 46, 47, 46, 45, 45, 47, + 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46, 47, 46, 46, 46, 47, 47, 49, + 51, 51, 35, 37, 37, 40, 44, 44, 46, 47, 46, 45, 45, 47, 47, 48, 51, 51, + 37, 39, 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 37, 39, + 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, + 47, 47, 49, 49, 49, 48, 48, 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, + 49, 50, 50, 49, 49, 50, 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, + 51, 51, 51, 52, 52, 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, + 53, 54, 54, 55, 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, + 54, 55, 57, 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, + 59, 59, 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, + 48, 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46, + 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45, 45, + 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46, 46, 46, + 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46, 46, 
52, 54, + 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47, 52, 54, 56, 60, + 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, + 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, 66, 68, + 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, + 54, 51, 50, 49, 49, 49, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, 55, 51, + 51, 50, 49, 49, 54, 56, 58, 63, 63, 68, 69, 71, 74, 74, 57, 53, 52, 51, + 50, 50, 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, + 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, + 61, 65, 65, 70, 72, 74, 78, 78, + /* Size 4x16 */ + 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, 37, 38, + 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 49, 47, 45, 45, + 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 54, 51, 49, 49, 48, 48, + 51, 55, 58, 60, 62, 65, 68, 70, 71, 73, + /* Size 16x4 */ + 31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, 37, 44, + 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55, 46, 47, 55, 58, + 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, 48, 47, 61, 68, 50, 48, + 62, 70, 51, 49, 63, 71, 53, 50, 64, 73, + /* Size 8x32 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, + 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 32, + 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, + 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 37, 38, 38, 39, 40, 40, 42, 43, + 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, + 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, + 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, + 55, 57, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, + 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49, 47, + 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, + 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 
64, 65, 52, 50, 50, 49, 48, 48, + 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, + 66, 66, 68, 68, 69, 70, 70, 72, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, + 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, + 74, 76, 76, 78, + /* Size 32x8 */ + 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, 31, 31, + 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, 30, 32, 40, 44, + 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, + 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, 35, 37, 44, 46, 46, 45, 47, 51, + 37, 40, 47, 47, 47, 45, 47, 50, 37, 40, 47, 47, 47, 45, 47, 50, 41, 42, + 47, 49, 49, 48, 50, 52, 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, + 51, 51, 52, 54, 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, + 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60, + 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 49, 45, + 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66, 50, 46, 46, 52, + 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68, 52, 48, 47, 53, 57, 61, + 66, 71, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73, + 54, 50, 49, 54, 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, + 50, 55, 59, 64, 70, 76, 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, + 61, 65, 72, 78 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81, + /* Size 8x8 */ + 31, 32, 32, 34, 35, 41, 47, 53, 32, 32, 32, 33, 34, 40, 44, 50, 32, 32, + 34, 35, 37, 41, 45, 51, 34, 33, 35, 39, 42, 47, 51, 55, 35, 34, 37, 42, + 48, 53, 57, 61, 41, 40, 41, 47, 53, 60, 65, 70, 47, 44, 45, 51, 57, 65, + 71, 77, 53, 50, 51, 55, 61, 70, 77, 85, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 31, 32, + 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 31, 32, 32, 32, + 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 31, 32, 32, 32, 32, 33, + 33, 34, 35, 36, 38, 
41, 44, 45, 49, 54, 31, 32, 32, 32, 33, 34, 34, 35, + 36, 38, 39, 42, 45, 46, 50, 54, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 42, 45, 46, 49, 53, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, + 47, 48, 51, 55, 34, 34, 33, 34, 35, 37, 38, 39, 42, 44, 45, 47, 50, 51, + 54, 58, 36, 35, 34, 35, 36, 38, 40, 42, 48, 50, 50, 54, 56, 57, 60, 64, + 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, 60, 63, 67, 39, 38, + 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, 69, 44, 42, 41, 41, + 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 47, 45, 44, 44, 45, 45, + 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 49, 47, 46, 45, 46, 46, 48, 51, + 57, 60, 62, 68, 71, 73, 77, 81, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, + 65, 71, 75, 77, 82, 87, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, + 79, 81, 87, 92, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 54, 55, 59, 59, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 39, 39, 41, + 43, 43, 46, 47, 48, 51, 52, 53, 57, 57, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 43, 45, 46, + 47, 51, 51, 53, 56, 56, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 42, 45, 46, 47, 51, 51, 52, + 56, 56, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 50, 51, 54, 54, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 36, 37, + 37, 40, 41, 41, 44, 44, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 36, 38, 38, 40, 41, 41, + 44, 45, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 34, 35, 35, 35, 36, 36, 38, 39, 39, 41, 42, 42, 44, 45, 46, 49, + 50, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 36, 36, 36, 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 
50, 51, 54, 54, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, + 32, 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, + 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, 42, 43, 45, 46, + 46, 49, 49, 50, 53, 53, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, + 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 44, 45, 47, 47, 48, 51, 51, 52, + 55, 55, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41, + 42, 42, 44, 45, 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 34, 34, + 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41, 42, 42, 44, 45, + 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 35, 34, 34, 34, 34, 34, + 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, 45, 45, 47, 47, 47, 49, 50, 51, + 53, 53, 54, 57, 57, 58, 61, 61, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, + 38, 38, 40, 42, 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, + 60, 61, 64, 64, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, + 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64, + 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50, + 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 39, 39, 38, 38, + 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, + 58, 58, 60, 61, 62, 64, 65, 66, 69, 69, 39, 39, 38, 38, 37, 37, 38, 39, + 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, 58, 58, 60, 61, + 62, 64, 65, 66, 69, 69, 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42, + 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 61, 62, 64, 65, 66, 69, 69, 70, + 73, 73, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, + 54, 54, 56, 58, 58, 61, 63, 64, 66, 67, 68, 71, 71, 72, 75, 75, 44, 43, + 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 48, 48, 51, 54, 54, 56, 58, + 58, 62, 64, 64, 66, 67, 68, 71, 72, 73, 76, 76, 47, 46, 
45, 45, 44, 44, + 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, + 69, 70, 71, 74, 75, 76, 79, 79, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, + 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 72, 75, + 76, 77, 80, 80, 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51, + 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73, 77, 77, 78, 81, 81, + 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59, + 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 81, 83, 86, 86, 54, 52, 51, 51, + 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, + 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 55, 53, 53, 52, 51, 50, 50, 51, + 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77, + 78, 83, 83, 85, 88, 88, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, + 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, + 92, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, + 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92, + /* Size 4x8 */ + 32, 32, 32, 34, 35, 40, 46, 52, 32, 33, 34, 37, 38, 42, 46, 51, 37, 36, + 38, 44, 49, 55, 59, 64, 52, 49, 49, 54, 60, 69, 76, 83, + /* Size 8x4 */ + 32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, 35, 38, + 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83, + /* Size 8x16 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 31, 32, + 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 32, 32, 32, 33, + 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 32, 33, 33, 33, 34, 36, + 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 36, 35, 35, 35, 36, 38, 40, 42, + 48, 49, 50, 53, 56, 57, 60, 63, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, + 58, 63, 66, 67, 71, 75, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, + 69, 71, 75, 79, 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, + 82, 87, + /* Size 16x8 */ + 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, + 32, 33, 35, 41, 
44, 49, 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 34, 34, + 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, + 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 36, 34, 37, 40, 48, 54, 56, 60, + 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 44, 41, + 42, 45, 53, 63, 66, 71, 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, + 57, 67, 71, 77, 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, + 79, 87, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, + 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, + 46, 49, 49, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51, + 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, + 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 40, 40, + 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, + 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, + 47, 48, 48, 51, 51, 53, 55, 55, 35, 35, 35, 35, 34, 34, 35, 36, 36, 37, + 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58, + 59, 60, 62, 62, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, + 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, + 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49, + 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 44, 43, 42, 42, + 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, + 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 44, 43, 42, 42, 41, 41, 41, 42, + 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 
63, 66, 67, + 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, + 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, + 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, + 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 53, 52, + 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, + 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 59, 57, 56, 56, 54, 54, + 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, + 79, 80, 81, 86, 87, 88, 92, 92, + /* Size 32x16 */ + 32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31, 32, + 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32, 32, 32, + 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, + 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, 33, 34, + 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, + 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33, 33, 35, 35, 36, 41, 41, + 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 45, 49, + 49, 54, 32, 32, 32, 33, 34, 34, 34, 36, 36, 38, 42, 42, 45, 50, 50, 54, + 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 42, 42, 45, 49, 49, 54, 32, 32, + 33, 33, 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, + 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, + 36, 39, 40, 41, 44, 44, 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, + 42, 44, 48, 48, 50, 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, + 48, 48, 50, 54, 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, + 53, 57, 57, 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, + 60, 64, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, + 38, 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38, + 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37, 38, + 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40, 42, 42, + 44, 51, 
52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42, 43, 45, 52, + 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43, 45, 52, 54, 56, + 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47, 54, 56, 58, 66, 66, + 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55, 56, 59, 67, 67, 70, 76, + 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57, 60, 67, 67, 71, 77, 77, 81, + 53, 50, 49, 49, 49, 49, 51, 58, 59, 62, 71, 71, 74, 81, 81, 86, 53, 51, + 49, 49, 50, 50, 51, 59, 60, 63, 71, 71, 75, 82, 82, 87, 55, 52, 51, 51, + 51, 51, 53, 60, 61, 64, 72, 72, 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, + 55, 62, 63, 67, 75, 75, 79, 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, + 63, 67, 75, 75, 79, 87, 87, 92, + /* Size 4x16 */ + 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, 32, 32, + 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, 38, 37, 36, 36, + 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, 53, 51, 49, 49, 50, 49, + 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, + /* Size 16x4 */ + 31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, 32, 34, + 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, 35, 38, 49, 60, + 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, 45, 45, 58, 75, 47, 47, + 60, 77, 51, 50, 63, 82, 55, 54, 67, 87, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, + 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, + 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, + 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, + 55, 55, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, + 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 44, 43, + 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 
54, 56, 58, + 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, + 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, + 69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, + 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, + 82, 83, 87, 87, + /* Size 32x8 */ + 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, 31, 32, + 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, + 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, 31, 32, 33, 33, 35, 41, + 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, 32, 32, 34, 34, 36, 42, 45, 50, + 32, 32, 34, 35, 37, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, + 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, + 42, 48, 50, 54, 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, + 53, 57, 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60, + 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 39, 37, + 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69, 44, 41, 42, 45, + 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72, 47, 44, 45, 47, 56, 66, + 69, 75, 48, 45, 46, 48, 56, 67, 70, 76, 49, 46, 47, 48, 57, 67, 71, 77, + 53, 49, 49, 51, 59, 71, 74, 81, 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, + 51, 53, 61, 72, 76, 83, 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, + 63, 75, 79, 87 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66, + /* Size 8x8 */ + 31, 31, 35, 42, 48, 47, 49, 51, 31, 32, 36, 42, 46, 45, 46, 48, 35, 36, + 41, 45, 47, 45, 46, 48, 42, 42, 45, 48, 50, 49, 50, 51, 48, 46, 47, 50, + 53, 53, 54, 54, 47, 45, 45, 49, 53, 57, 59, 60, 49, 46, 46, 50, 54, 59, + 61, 64, 51, 48, 48, 51, 54, 60, 64, 68, + /* Size 16x16 */ + 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 31, 31, + 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 30, 31, 31, 32, + 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 
31, 32, 32, 33, 36, 40, + 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 33, 34, 35, 36, 39, 43, 44, 45, + 47, 46, 46, 45, 46, 47, 47, 49, 36, 38, 39, 40, 43, 47, 47, 47, 48, 47, + 46, 45, 46, 46, 47, 48, 38, 40, 41, 41, 44, 47, 47, 48, 49, 48, 48, 47, + 47, 47, 48, 49, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, 49, 49, 50, 50, + 50, 52, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, 54, 55, + 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 48, 47, + 46, 45, 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 49, 47, 45, 45, + 45, 45, 47, 49, 53, 55, 55, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, + 47, 50, 54, 55, 56, 59, 61, 61, 63, 64, 51, 48, 47, 46, 47, 46, 47, 50, + 54, 55, 56, 60, 61, 62, 64, 66, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, + 57, 61, 63, 64, 66, 68, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, + 64, 66, 68, 71, + /* Size 32x32 */ + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 36, 36, 38, 41, 41, 45, 49, 49, + 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 31, 31, 31, 34, 34, 35, 38, 38, 39, 42, 42, 45, 48, 48, 47, 47, 47, 47, + 47, 47, 49, 49, 49, 50, 50, 51, 53, 53, 31, 31, 31, 31, 31, 31, 32, 34, + 34, 35, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 50, 50, 52, 52, 31, 31, 31, 31, 31, 31, 32, 34, 34, 36, 38, 38, + 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 46, 47, 48, 48, 48, 49, 49, 50, + 52, 52, 30, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 41, 42, 42, 44, + 46, 46, 46, 46, 46, 45, 45, 45, 46, 47, 47, 48, 48, 48, 50, 50, 30, 31, + 31, 31, 31, 32, 32, 35, 35, 36, 40, 40, 41, 42, 42, 44, 46, 46, 46, 45, + 45, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 31, 31, 32, 32, 32, 32, + 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 45, 45, 45, 45, 45, + 46, 46, 46, 47, 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 35, 38, 38, 40, + 43, 43, 43, 44, 44, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, + 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 36, 38, 39, 40, 43, 43, 44, 45, + 45, 46, 47, 
47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, + 34, 35, 35, 36, 36, 36, 37, 40, 40, 41, 44, 44, 45, 45, 45, 46, 47, 47, + 47, 46, 46, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 36, 38, 38, 38, + 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, + 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 36, 38, 38, 38, 39, 40, 40, 43, + 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 45, 46, 46, + 46, 46, 47, 47, 48, 48, 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47, + 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 48, 48, + 49, 49, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, + 50, 50, 50, 49, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 41, 42, + 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, 50, 50, 50, 49, + 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 45, 45, 45, 45, 44, 44, + 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, + 52, 52, 52, 52, 52, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, + 48, 48, 49, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, + 54, 54, 55, 55, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, + 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, + 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 48, 47, 47, 47, + 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, + 55, 56, 56, 56, 56, 57, 57, 58, 58, 58, 48, 47, 47, 47, 46, 45, 45, 46, + 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, + 56, 57, 57, 58, 58, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, + 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 60, + 61, 61, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, + 53, 53, 55, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, 61, 62, 62, 49, 47, + 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, + 56, 58, 58, 
59, 59, 60, 60, 61, 61, 62, 63, 63, 50, 49, 48, 48, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59, + 61, 61, 61, 63, 63, 63, 64, 64, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, + 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 62, 63, + 63, 64, 65, 65, 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50, + 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62, 64, 64, 64, 66, 66, + 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54, + 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 52, 50, 50, 49, + 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, + 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 53, 51, 50, 50, 48, 48, 48, 48, + 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64, + 64, 67, 67, 68, 69, 69, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, + 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, + 71, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, + 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71, + /* Size 4x8 */ + 31, 31, 36, 42, 47, 46, 48, 50, 38, 40, 44, 47, 48, 46, 46, 48, 47, 46, + 47, 50, 53, 54, 55, 56, 50, 48, 47, 50, 54, 60, 64, 67, + /* Size 8x4 */ + 31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, 47, 48, + 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67, + /* Size 8x16 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 31, 31, + 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 35, 37, 38, 38, + 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 38, 40, 40, 41, 44, 47, + 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 48, 47, 46, 46, 47, 47, 48, 50, + 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, + 56, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 65, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, + 66, 68, + /* Size 16x8 */ + 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 
47, 47, 48, 50, 30, 32, + 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, 33, 36, 41, 44, + 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, + 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 49, 46, 48, 49, 53, 53, 54, 54, + 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, + 45, 47, 53, 58, 59, 61, 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, + 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, + 65, 68, + /* Size 16x32 */ + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, + 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46, + 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, 31, 31, 31, 31, 32, 32, 33, 35, + 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, + 47, 48, 48, 48, 50, 50, 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41, + 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, + 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, + 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 37, 38, + 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47, + 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, + 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, + 48, 48, 48, 48, 48, 49, 50, 50, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, + 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, + 53, 54, 55, 55, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, + 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, + 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 49, 48, 47, 47, + 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, + 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 49, 48, 47, 47, 45, 45, 45, 45, + 46, 45, 45, 45, 47, 
49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, + 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, + 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, + 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, + 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 52, 50, + 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, + 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 54, 53, 52, 52, 50, 50, + 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63, + 65, 65, 66, 68, 68, 69, 71, 71, + /* Size 32x16 */ + 32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31, 31, + 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31, 31, 32, + 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31, 32, 37, 38, + 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32, 38, 39, 40, 45, + 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38, 40, 41, 45, 46, 46, + 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40, 41, 45, 46, 46, 45, 45, + 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43, 46, 47, 46, 45, 45, 46, 47, + 47, 49, 33, 35, 36, 36, 41, 43, 44, 46, 47, 46, 46, 46, 46, 47, 47, 49, + 34, 36, 37, 37, 42, 44, 45, 47, 47, 47, 45, 45, 46, 47, 47, 49, 37, 39, + 40, 41, 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, + 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, + 47, 48, 48, 48, 47, 47, 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, + 50, 50, 49, 49, 50, 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, + 49, 49, 50, 50, 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, + 52, 52, 52, 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, + 54, 55, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, + 48, 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46, + 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46, 46, + 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 
57, 59, 49, 46, 45, 45, 46, 46, + 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45, 46, 47, 52, + 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46, 47, 52, 53, 55, + 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, + 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, 61, 64, + 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54, 55, 60, 60, 61, 64, 64, 66, + 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 52, 49, + 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 53, 50, 48, 48, + 48, 48, 49, 54, 54, 56, 61, 61, 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, + 50, 55, 55, 57, 62, 62, 65, 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, + 55, 57, 62, 62, 65, 68, 68, 71, + /* Size 4x16 */ + 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, 37, 38, + 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, 48, 47, 46, 46, + 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 52, 50, 48, 48, 47, 47, + 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, + /* Size 16x4 */ + 31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48, 35, 43, + 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, 47, 48, 53, 54, + 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, 47, 46, 55, 63, 48, 47, + 55, 64, 49, 47, 56, 66, 51, 49, 57, 68, + /* Size 8x32 */ + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, + 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, + 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, + 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, + 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, + 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, + 50, 50, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, + 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 48, + 47, 47, 45, 45, 
45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, + 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, + 61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, + 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, + 66, 67, 68, 68, + /* Size 32x8 */ + 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, 31, 31, + 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, 38, 40, + 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, + 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, 33, 36, 41, 44, 47, 46, 46, 47, + 34, 37, 42, 45, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 37, 40, + 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, + 50, 49, 50, 50, 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, + 52, 52, 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54, + 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 48, 46, + 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60, 49, 45, 45, 47, + 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61, 50, 46, 46, 48, 54, 59, + 61, 63, 50, 46, 46, 48, 54, 59, 61, 64, 51, 47, 47, 48, 54, 60, 61, 64, + 52, 48, 47, 48, 54, 61, 63, 66, 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, + 48, 49, 54, 61, 63, 67, 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, + 55, 62, 65, 68 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65, + /* Size 8x8 */ + 31, 31, 32, 32, 34, 37, 43, 47, 31, 32, 32, 32, 34, 36, 41, 44, 32, 32, + 33, 34, 35, 38, 42, 45, 32, 32, 34, 35, 37, 39, 42, 46, 34, 34, 35, 37, + 41, 45, 49, 52, 37, 36, 38, 39, 45, 51, 56, 59, 43, 41, 42, 42, 49, 56, + 63, 67, 47, 44, 45, 46, 52, 59, 67, 71, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32, + 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 31, 32, 32, 32, 
+ 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 31, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 31, 32, 32, 32, 33, 33, 33, 34, + 35, 36, 36, 39, 40, 42, 44, 45, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, + 36, 39, 40, 42, 45, 45, 32, 32, 32, 32, 33, 34, 35, 36, 37, 38, 38, 40, + 41, 42, 45, 46, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, 40, 42, 43, 44, + 47, 47, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, 50, 51, + 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 36, 35, + 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 39, 38, 38, 37, + 39, 39, 40, 42, 45, 49, 50, 54, 55, 58, 60, 61, 41, 40, 39, 38, 40, 40, + 41, 43, 46, 50, 52, 55, 57, 60, 62, 63, 44, 42, 42, 41, 42, 42, 42, 44, + 47, 52, 54, 58, 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, + 56, 60, 62, 66, 69, 70, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, + 63, 67, 70, 71, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 45, 47, 48, 48, 51, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, + 39, 39, 40, 43, 43, 44, 46, 47, 47, 50, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, + 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, + 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 34, 35, 35, 36, 38, 38, 39, 42, 42, 42, 45, 45, 45, 48, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, + 34, 36, 37, 37, 38, 41, 41, 41, 44, 44, 44, 47, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, + 38, 41, 41, 41, 44, 44, 44, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39, 41, 41, 42, + 44, 45, 45, 47, 31, 32, 32, 32, 32, 
32, 32, 32, 33, 33, 33, 33, 33, 33, + 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, + 36, 36, 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 38, + 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 39, 40, 40, 41, 42, + 42, 43, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, + 46, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, + 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, 46, 48, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, + 40, 41, 42, 42, 43, 44, 44, 45, 47, 47, 47, 50, 34, 34, 34, 34, 34, 33, + 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, + 46, 47, 47, 48, 50, 51, 51, 53, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, + 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48, + 50, 51, 51, 53, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 38, 40, 40, 41, 43, 44, 44, 45, 46, 46, 47, 49, 49, 49, 51, 52, 52, 54, + 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43, + 46, 47, 47, 48, 49, 49, 50, 52, 52, 53, 55, 55, 55, 57, 36, 35, 35, 35, + 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, + 50, 50, 52, 54, 54, 54, 56, 57, 57, 58, 36, 35, 35, 35, 35, 34, 34, 35, + 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, 50, 50, 52, 54, + 54, 54, 56, 57, 57, 58, 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39, + 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 52, 52, 54, 56, 56, 57, 58, 59, + 59, 61, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, + 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 39, 39, + 38, 38, 38, 37, 37, 38, 39, 39, 39, 
40, 40, 40, 42, 45, 45, 46, 49, 50, + 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 41, 40, 40, 40, 39, 38, + 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55, + 57, 60, 60, 60, 62, 63, 63, 66, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, + 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, + 66, 67, 67, 69, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, + 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, + 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49, + 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 67, 68, 68, 70, 47, 46, 45, 45, + 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, + 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 48, 47, 46, 46, 45, 44, 44, 45, + 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, + 67, 68, 70, 71, 71, 74, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, + 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, + 71, 74, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53, + 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77, + /* Size 4x8 */ + 31, 32, 32, 32, 34, 37, 42, 46, 32, 33, 34, 35, 37, 40, 43, 46, 35, 34, + 36, 38, 43, 49, 53, 56, 43, 41, 42, 42, 49, 56, 63, 67, + /* Size 8x4 */ + 31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, 34, 37, + 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 31, 32, 32, 32, + 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 32, 32, 32, 33, 34, 34, + 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 35, 35, 34, 34, 35, 36, 37, 39, + 41, 45, 46, 48, 49, 51, 53, 54, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, + 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, + 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, + 69, 70, + /* Size 16x8 */ 
+ 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, + 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 33, 34, + 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 33, 34, 35, 37, 38, + 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50, + 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 39, 37, + 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, + 51, 53, 63, 66, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, + 67, 70, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, + 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, + 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, + 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, + 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, + 41, 43, 43, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, + 47, 48, 48, 50, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, + 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, + 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, + 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 36, 35, 35, 35, + 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, + 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 40, 
39, 39, 39, 39, 38, 38, 38, + 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59, + 59, 59, 61, 62, 62, 64, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, + 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, + 67, 69, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, + 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, + 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, + 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 53, 52, 51, 51, 50, 49, + 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65, + 67, 71, 71, 72, 75, 76, 76, 79, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31, 31, + 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32, + 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 38, + 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 35, 35, 38, 41, 41, + 44, 49, 31, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 39, 42, 42, 44, 49, + 32, 32, 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, + 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, + 33, 35, 35, 35, 37, 37, 37, 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, + 35, 36, 37, 38, 38, 41, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, + 37, 38, 38, 41, 42, 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, + 40, 42, 44, 44, 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, + 48, 48, 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, + 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, + 35, 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35, + 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 
34, 34, + 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36, 37, 40, + 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39, 40, 40, 42, + 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40, 40, 42, 48, 50, + 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41, 43, 49, 51, 51, 56, + 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, + 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, 66, 71, + 44, 43, 42, 42, 42, 43, 43, 45, 51, 54, 54, 59, 64, 64, 67, 72, 47, 45, + 44, 44, 44, 45, 45, 47, 53, 56, 56, 61, 66, 66, 69, 75, 48, 46, 45, 45, + 45, 46, 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, + 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, + 56, 58, 58, 64, 69, 69, 73, 79, + /* Size 4x16 */ + 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 32, 32, + 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 36, 35, 35, 34, + 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, + 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, + /* Size 16x4 */ + 31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, 32, 34, + 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, 34, 37, 42, 48, + 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, 40, 41, 51, 60, 42, 43, + 53, 63, 45, 45, 56, 66, 46, 46, 56, 67, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, + 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, + 46, 48, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, + 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 
51, 51, 53, 54, 54, 56, 36, 35, + 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, + 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 44, 43, 42, 42, 41, 41, + 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, + 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, + 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, + 69, 70, 70, 73, + /* Size 32x8 */ + 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, 31, 32, + 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, + 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 34, + 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, 31, 32, 33, 34, 35, 36, 42, 44, + 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, + 33, 35, 37, 37, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, + 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, + 48, 50, 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51, + 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 36, 34, + 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58, 39, 37, 39, 40, + 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, + 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, 44, 41, 42, 43, 51, 53, 63, 66, + 44, 42, 42, 43, 51, 54, 64, 67, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, + 45, 46, 54, 56, 67, 70, 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, + 56, 58, 69, 73 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59, + /* Size 8x8 */ + 31, 31, 34, 37, 43, 48, 47, 49, 31, 32, 35, 40, 43, 46, 45, 46, 34, 35, + 39, 43, 45, 46, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 43, 43, 45, 47, + 49, 50, 50, 50, 48, 46, 46, 47, 50, 53, 55, 55, 47, 45, 45, 45, 50, 55, + 58, 60, 49, 46, 46, 46, 50, 55, 60, 61, + /* Size 16x16 */ + 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 31, 31, + 31, 31, 34, 34, 38, 40, 42, 
46, 47, 47, 47, 47, 48, 48, 31, 31, 31, 31, + 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 30, 31, 31, 32, 34, 35, + 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 33, 34, 34, 34, 37, 38, 42, 43, + 44, 46, 47, 46, 46, 45, 46, 46, 33, 34, 35, 35, 38, 39, 43, 44, 45, 47, + 47, 46, 46, 45, 46, 46, 36, 38, 39, 40, 42, 43, 47, 47, 47, 47, 48, 46, + 46, 45, 46, 46, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, 49, 48, 47, 47, + 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, 50, 50, + 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 49, 47, + 47, 46, 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 48, 47, 46, 45, + 46, 46, 46, 48, 49, 52, 53, 54, 55, 55, 56, 56, 49, 47, 46, 45, 46, 46, + 46, 47, 49, 52, 53, 55, 55, 57, 57, 58, 49, 47, 46, 45, 45, 45, 45, 47, + 49, 52, 53, 55, 57, 58, 59, 60, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, + 54, 56, 57, 59, 61, 61, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, + 58, 60, 61, 61, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 41, 43, + 47, 49, 49, 49, 48, 48, 49, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 42, 43, 47, 48, 48, 48, + 47, 47, 47, 47, 47, 48, 49, 49, 49, 50, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 37, + 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, + 42, 43, 46, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 30, 31, + 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, + 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46, 47, 30, 31, 31, 31, 31, 32, + 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45, + 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, + 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 45, 45, 45, 45, 
45, 45, + 46, 46, 46, 47, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, + 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, + 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, + 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33, 34, 34, 34, + 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, 47, 47, 47, 46, + 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 35, 36, 37, 37, 37, 38, 38, 38, + 41, 41, 41, 44, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 46, 46, 46, 45, + 45, 45, 46, 46, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, + 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, + 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, + 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, 46, 46, 38, 39, + 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, + 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, + 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, + 49, 49, 49, 49, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, + 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, + 50, 50, 50, 50, 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, + 48, 49, 49, 49, 50, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, + 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 50, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 49, 48, 47, 47, + 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 47, 46, 46, 46, + 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, + 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55, + 55, 56, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, + 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 
48, 47, + 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, + 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 49, 47, 47, 47, 46, 45, + 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55, + 55, 57, 57, 57, 57, 58, 58, 58, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, + 59, 60, 60, 60, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60, + 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, + 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 60, 60, 60, 61, 50, 49, 48, 48, + 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, + 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 50, 49, 48, 48, 47, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, + 60, 60, 61, 61, 61, 63, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, + 61, 63, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50, + 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64, + /* Size 4x8 */ + 31, 31, 35, 39, 43, 47, 46, 48, 38, 40, 43, 47, 47, 47, 46, 46, 47, 46, + 47, 47, 50, 53, 53, 54, 48, 45, 46, 45, 50, 55, 58, 59, + /* Size 8x4 */ + 31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, 43, 47, + 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59, + /* Size 8x16 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 31, 31, + 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 33, 34, 34, 35, + 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 37, 38, 39, 40, 42, 43, + 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 45, 45, 45, 44, 46, 46, 47, 48, + 49, 51, 52, 51, 51, 51, 52, 52, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, + 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, + 57, 58, 59, 59, 50, 48, 47, 46, 46, 46, 46, 47, 50, 
53, 54, 56, 57, 59, + 61, 61, + /* Size 16x8 */ + 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, + 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 33, 35, 37, 42, + 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 37, 40, 43, 47, 47, 47, + 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50, + 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, + 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, + 51, 53, 58, 59, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, + 59, 61, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, + 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47, + 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, 31, 31, 31, 31, 32, 32, 32, 33, + 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, + 45, 45, 46, 46, 46, 47, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, + 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, + 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, + 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38, + 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, + 46, 46, 46, 46, 46, 46, 46, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, + 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, + 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, + 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, + 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, + 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 48, 47, 47, 47, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 
53, + 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45, + 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56, + 56, 56, 57, 57, 57, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, + 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, + 59, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, + 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, + 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, + 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 52, 51, 50, 50, 49, 48, + 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, + 59, 61, 61, 62, 63, 64, 64, 65, + /* Size 32x16 */ + 32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31, 31, + 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31, 31, 31, + 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31, 31, 34, 38, + 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32, 34, 39, 39, 40, + 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, + 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, 46, 45, + 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40, 41, 45, 46, 46, 45, 45, 45, + 46, 48, 33, 34, 35, 35, 37, 42, 42, 43, 46, 47, 47, 46, 45, 45, 46, 47, + 33, 35, 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 33, 35, + 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, + 41, 45, 45, 46, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, + 47, 47, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 47, 47, 47, 46, 45, 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, + 49, 49, 50, 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, + 50, 50, 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, + 47, 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47, + 46, 46, 47, 48, 
48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46, 46, + 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46, 46, 47, + 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46, 47, 47, 48, + 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47, 47, 48, 51, 53, + 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46, 47, 51, 53, 53, 55, + 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, + 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, 59, 61, + 49, 47, 45, 45, 45, 46, 46, 47, 52, 53, 53, 56, 58, 58, 60, 62, 50, 48, + 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 63, 50, 48, 46, 46, + 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, + 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, + 52, 54, 54, 58, 60, 60, 62, 65, + /* Size 4x16 */ + 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, 37, 38, + 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 48, 47, 46, 46, + 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, + 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, + /* Size 16x4 */ + 31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, 34, 42, + 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, 42, 47, 50, 49, + 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, 47, 46, 53, 57, 46, 46, + 53, 58, 48, 46, 54, 59, 48, 46, 54, 59, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, + 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, + 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, + 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, + 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, + 46, 47, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 
49, + 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48, 47, + 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, + 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, + 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, + 61, 61, 61, 62, + /* Size 32x8 */ + 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, 31, 31, + 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, 34, 39, + 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 30, 32, 35, 40, 44, 46, + 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, 33, 35, 37, 42, 46, 47, 45, 46, + 33, 36, 38, 43, 46, 47, 46, 46, 33, 36, 38, 43, 46, 47, 46, 46, 35, 38, + 41, 45, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, + 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, + 49, 50, 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50, + 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 49, 46, + 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55, 48, 46, 46, 47, + 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, + 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 51, 53, 58, 59, + 49, 45, 45, 46, 52, 53, 58, 60, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, + 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, + 52, 54, 60, 62 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54, + /* Size 8x8 */ + 31, 31, 32, 32, 33, 34, 37, 41, 31, 32, 32, 32, 33, 34, 36, 39, 32, 32, + 32, 33, 34, 35, 37, 40, 32, 32, 33, 34, 35, 36, 38, 41, 33, 33, 34, 35, + 37, 39, 41, 44, 34, 34, 35, 36, 39, 43, 46, 49, 37, 36, 37, 38, 41, 46, + 51, 54, 41, 39, 40, 41, 44, 49, 54, 58, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 
36, 39, 39, 44, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 36, 36, 39, 39, 42, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, + 36, 39, 39, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, + 40, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42, + 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 34, 34, + 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 36, 35, 35, 34, + 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 36, 35, 35, 34, 34, 36, + 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, 40, + 40, 45, 45, 50, 50, 54, 54, 58, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, + 45, 50, 50, 54, 54, 58, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, + 54, 58, 58, 63, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, + 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, + 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, + 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, + 34, 34, 35, 35, 35, 36, 38, 38, 38, 39, 41, 41, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 
32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, + 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 37, 38, 38, 38, 40, 41, 41, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, + 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, + 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, + 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 35, 36, 36, 36, 36, 37, 37, 37, 38, 40, 40, 40, 41, 42, 42, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, + 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, + 40, 41, 42, 42, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, + 36, 36, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 42, 44, 45, 45, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, + 39, 39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, + 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, + 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, 34, 34, 33, 33, + 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42, 42, 42, 44, + 45, 45, 45, 46, 47, 47, 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, + 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, 45, 46, 47, 47, 47, 49, + 50, 50, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, + 38, 40, 42, 
42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, + 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, + 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, 35, 35, 35, 35, + 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, + 48, 49, 50, 50, 50, 52, 54, 54, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, + 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 52, 52, + 52, 54, 56, 56, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, + 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, + 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, + 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38, 38, + 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, + 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 41, 41, 40, 40, 40, 39, 39, 39, + 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54, + 56, 56, 56, 58, 60, 60, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, + 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, + 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, + 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, + /* Size 4x8 */ + 31, 32, 32, 32, 33, 34, 37, 40, 32, 32, 33, 33, 34, 36, 38, 40, 34, 34, + 34, 36, 38, 41, 44, 46, 39, 38, 38, 40, 42, 47, 52, 56, + /* Size 8x4 */ + 31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, 33, 34, + 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 32, 32, 32, 33, 33, 34, + 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 37, 37, 38, 38, 40, 40, 43, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, + 42, 48, 48, 50, 50, 53, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 
48, 50, 50, 53, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, + 58, 63, + /* Size 16x8 */ + 32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, + 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, + 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, + 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, + 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 36, 34, + 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, + 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, + 53, 63, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, + 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, + 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, + 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, + 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, + 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, 36, 35, 35, 35, + 35, 35, 34, 34, 
34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, + 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, + 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, + 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, + 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, + 53, 53, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, + 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58, 44, 43, + 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, + 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, 44, 43, 42, 42, 42, 41, + 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, + 54, 56, 58, 58, 58, 60, 63, 63, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, + 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, + 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, + 31, 32, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 38, 41, 41, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, + 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, + 34, 36, 37, 37, 37, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, + 38, 38, 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, + 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, + 42, 42, 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, + 34, 34, 34, 34, 34, 35, 37, 37, 37, 
39, 42, 42, 42, 45, 48, 48, 34, 34, + 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34, 34, + 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34, 34, 36, + 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34, 36, 38, 38, + 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, + 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, 48, 48, + 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39, 39, 44, 49, 49, 49, 52, + 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, + 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 39, 38, + 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 41, 40, 39, 39, + 39, 40, 42, 42, 42, 46, 52, 52, 52, 56, 60, 60, 44, 42, 41, 41, 41, 42, + 43, 43, 43, 48, 53, 53, 53, 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, + 43, 48, 53, 53, 53, 58, 63, 63, + /* Size 4x16 */ + 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 34, 34, 34, 33, + 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, 39, 38, 38, 37, 37, 39, + 39, 40, 40, 45, 45, 51, 51, 54, 54, 58, + /* Size 16x4 */ + 31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, 32, 32, + 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, 32, 34, 37, 40, + 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, 35, 36, 43, 51, 38, 39, + 45, 54, 38, 39, 45, 54, 42, 42, 48, 58, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, + 43, 43, 32, 
32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 36, 35, + 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, + 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, + 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, + 48, 49, 50, 50, 50, 52, 53, 53, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, + 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, + 58, 60, 63, 63, + /* Size 32x8 */ + 32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, 31, 32, + 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, + 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, 31, 32, 32, 33, 33, 34, + 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, + 31, 32, 32, 33, 33, 35, 35, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, + 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, + 34, 37, 37, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, + 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45, + 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, + 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50, 36, 34, 34, 38, + 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, + 48, 54, 37, 36, 36, 39, 39, 49, 49, 56, 39, 37, 37, 40, 40, 50, 50, 58, + 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, + 39, 42, 42, 52, 52, 60, 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, + 43, 53, 53, 63 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54, + /* Size 8x8 */ + 31, 31, 32, 35, 39, 45, 48, 48, 31, 31, 33, 37, 41, 44, 46, 46, 32, 33, + 35, 39, 42, 45, 46, 45, 35, 37, 39, 43, 45, 47, 47, 46, 39, 41, 42, 45, + 47, 48, 48, 47, 45, 44, 45, 47, 48, 50, 51, 51, 48, 46, 46, 47, 48, 51, + 53, 54, 48, 46, 45, 46, 47, 51, 54, 56, + /* Size 16x16 */ + 
32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 31, 31, + 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 31, 31, 31, 31, + 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 30, 31, 31, 32, 32, 35, + 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 30, 31, 31, 32, 32, 35, 35, 40, + 40, 42, 42, 46, 46, 45, 45, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, + 45, 47, 47, 46, 46, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, + 47, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, + 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45, + 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 41, 42, + 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 47, 47, 46, + 46, 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 49, 47, 47, 46, 46, 47, + 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, + 46, 49, 49, 53, 53, 54, 54, 55, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, + 49, 53, 53, 54, 54, 55, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, + 53, 55, 55, 58, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 36, 36, 36, 39, + 41, 41, 41, 45, 49, 49, 49, 49, 48, 48, 48, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, + 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, + 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 33, 35, 35, 35, 37, 39, 39, 39, 41, 42, 42, + 42, 44, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 30, 31, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, + 46, 46, 45, 45, 45, 45, 45, 45, 30, 
31, 31, 31, 31, 31, 32, 32, 32, 33, + 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, + 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37, + 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 35, 37, 37, 37, 39, 41, 41, 41, 42, + 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34, + 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, + 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, + 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, + 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, + 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, + 45, 45, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, + 45, 45, 46, 46, 46, 47, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 36, 37, + 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, + 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, + 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46, + 46, 46, 45, 45, 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, + 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, + 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, + 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, + 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, 42, 42, 42, 42, + 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, + 49, 49, 49, 49, 49, 49, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, + 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 49, 48, 47, 47, 47, 47, 46, 
46, 46, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, + 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47, + 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, + 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, + 53, 54, 54, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, + 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, + 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, + 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47, 47, + 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, + 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 49, 48, 47, 47, 47, 46, 45, 45, + 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54, + 55, 55, 55, 56, 57, 57, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, + 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58, + /* Size 4x8 */ + 31, 31, 33, 36, 40, 45, 47, 47, 34, 35, 37, 41, 44, 46, 47, 46, 42, 42, + 44, 46, 48, 49, 50, 49, 48, 46, 46, 46, 48, 51, 54, 55, + /* Size 8x4 */ + 31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, 40, 44, + 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55, + /* Size 8x16 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 31, 31, + 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 37, 38, 38, 40, 40, 43, + 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 37, 38, 38, 40, 40, 43, 43, 47, + 47, 47, 47, 48, 48, 47, 47, 46, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, + 50, 53, 53, 53, 53, 53, 
48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, + 53, 53, 53, 53, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, + 56, 58, + /* Size 16x8 */ + 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, + 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, + 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, + 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, + 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 49, 46, + 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, + 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, + 53, 58, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, + 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, + 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, + 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, + 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, + 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, + 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, + 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 37, 37, 38, 38, 38, 39, + 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, + 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, + 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, + 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48, + 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 
50, 49, 49, 49, 48, 47, 47, 47, + 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, + 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, + 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, + 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, + 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56, 49, 48, + 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, + 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, 49, 48, 47, 47, 47, 46, + 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, + 53, 54, 56, 56, 56, 57, 58, 58, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31, 31, + 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31, 31, 31, + 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, + 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, 38, 38, + 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32, 35, 39, 39, 39, 42, + 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, + 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, + 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, + 32, 33, 34, 34, 34, 37, 41, 41, 41, 44, 46, 46, 46, 46, 45, 45, 33, 34, + 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, + 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, + 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, + 45, 46, 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, + 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, + 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, + 45, 45, 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 
48, 47, 47, + 42, 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, + 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43, 43, + 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44, 44, 46, + 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46, 47, 48, 48, + 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, + 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, 53, 53, + 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47, 47, 50, 53, 53, 53, 54, + 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, + 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, + 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, 45, 45, + 45, 46, 46, 46, 46, 49, 53, 53, 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, + 46, 46, 46, 49, 53, 53, 53, 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, + 46, 49, 53, 53, 53, 56, 58, 58, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47, 33, 34, + 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 42, 42, 42, 42, + 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 48, 47, 47, 45, 45, 46, + 46, 46, 46, 50, 50, 53, 53, 54, 54, 56, + /* Size 16x4 */ + 31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45, 31, 35, + 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, 38, 43, 47, 46, + 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, 48, 47, 50, 53, 47, 46, + 50, 54, 47, 46, 50, 54, 47, 45, 49, 56, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, + 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, + 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, + 46, 46, 46, 45, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, + 43, 45, 47, 47, 47, 47, 47, 47, 47, 
47, 48, 48, 48, 47, 47, 47, 47, 46, + 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 48, 47, + 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, + 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, + 56, 57, 58, 58, + /* Size 32x8 */ + 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, 31, 31, + 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, + 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, 30, 32, 32, 40, 40, 46, + 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, + 32, 34, 34, 41, 41, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, + 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, + 45, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, + 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47, + 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, + 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51, 49, 46, 46, 48, + 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, + 53, 53, 48, 46, 46, 47, 47, 53, 53, 54, 48, 46, 46, 47, 47, 53, 53, 56, + 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, + 45, 46, 46, 53, 53, 57, 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, + 46, 53, 53, 58 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46, + /* Size 8x8 */ + 31, 31, 31, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, + 32, 32, 32, 33, 33, 34, 32, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 34, + 35, 35, 36, 38, 32, 33, 33, 34, 35, 36, 38, 40, 34, 34, 33, 35, 36, 38, + 39, 
42, 35, 35, 34, 36, 38, 40, 42, 48, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 35, 35, 36, 36, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36, + 36, 36, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 33, 33, 33, 33, + 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 34, 34, 34, 34, 33, 33, + 35, 35, 36, 37, 37, 39, 39, 41, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, + 36, 37, 37, 40, 41, 42, 45, 45, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, + 38, 42, 42, 45, 48, 48, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, + 42, 45, 48, 48, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, + 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 33, 33, 34, + 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, + 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 36, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, + 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, + 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, + 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, + 37, 38, 38, 38, 38, 39, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 38, 38, 38, 
38, 39, 40, 40, + 40, 41, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, + 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 34, 34, + 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, + 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, + 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, + 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, + 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, + 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42, 44, 45, 45, 45, 45, + 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, + 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 47, 47, 47, 48, 36, 35, 35, 35, + 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, + 42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, + 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, + 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, + 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, + 48, 49, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, 37, 37, + 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50, + /* Size 4x8 */ + 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 32, 32, + 33, 34, 35, 36, 37, 38, 35, 35, 34, 36, 38, 40, 42, 48, + /* Size 8x4 */ + 31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, 32, 33, + 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 34, 34, + 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 
32, 33, 33, 34, 34, 34, 35, + 35, 36, 37, 37, 38, 38, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, + 41, 43, 46, 46, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, + 48, 48, + /* Size 16x8 */ + 32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, + 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, + 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 34, 34, + 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, + 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 33, 33, + 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, + 37, 37, 43, 44, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, + 46, 48, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, + 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, + 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 
34, 34, 35, + 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, + 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, + 40, 41, 42, 42, 42, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, + 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, + 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, + 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, + 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, + 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, + 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, + 42, 42, 44, 47, 48, 48, 48, 49, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, + 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, + 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, + 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, + 37, 37, 
32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, + 32, 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, + 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, 33, + 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33, 33, 33, + 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33, 33, 35, 36, + 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, + 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, + 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, 41, 42, + 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37, 37, 37, 40, 43, 44, 44, 44, + 35, 35, 34, 34, 34, 34, 36, 37, 38, 38, 38, 41, 45, 47, 47, 47, 36, 35, + 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, + 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, + 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, + 39, 39, 39, 42, 46, 49, 49, 49, + /* Size 4x16 */ + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 36, 35, 35, 35, 34, 34, + 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, + /* Size 16x4 */ + 31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, 32, 32, + 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, 32, 32, 34, 37, + 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, 34, 34, 37, 42, 34, 34, + 37, 44, 35, 34, 38, 48, 35, 34, 38, 48, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, + 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 35, 35, 35, 35, 35, 35, + 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, + 41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, + 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, + 48, 48, 48, 49, + /* Size 32x8 */ + 32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, 31, 32, + 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, + 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, + 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, + 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, + 32, 33, 33, 33, 35, 35, 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, + 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, + 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38, + 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, + 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40, 33, 33, 33, 35, + 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, + 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44, + 35, 34, 34, 36, 38, 38, 45, 47, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, + 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, + 39, 39, 46, 49 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52, + /* Size 8x8 */ + 31, 31, 30, 34, 36, 39, 42, 48, 31, 31, 31, 34, 37, 40, 42, 47, 30, 31, + 32, 35, 39, 41, 42, 46, 34, 34, 35, 39, 42, 44, 45, 47, 36, 37, 39, 42, + 46, 47, 47, 47, 39, 40, 41, 44, 47, 
47, 48, 49, 42, 42, 42, 45, 47, 48, + 48, 50, 48, 47, 46, 47, 47, 49, 50, 53, + /* Size 16x16 */ + 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 31, 31, + 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, 31, 31, + 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 31, 31, 31, 31, 31, 31, + 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 30, 31, 31, 31, 32, 32, 34, 35, + 37, 40, 40, 42, 42, 44, 46, 46, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, + 40, 42, 42, 44, 46, 46, 33, 33, 34, 34, 34, 34, 37, 38, 40, 42, 42, 44, + 44, 45, 47, 47, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, 43, 44, 45, 46, + 47, 47, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, 47, 47, + 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 36, 38, + 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 40, 41, 41, 41, + 42, 42, 44, 44, 45, 47, 47, 48, 48, 49, 50, 50, 41, 42, 42, 42, 42, 42, + 44, 45, 46, 47, 47, 48, 48, 49, 50, 50, 44, 44, 44, 44, 44, 44, 45, 46, + 46, 47, 47, 49, 49, 50, 51, 51, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, + 48, 50, 50, 51, 53, 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, + 50, 51, 53, 53, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 34, 36, + 36, 36, 36, 38, 40, 41, 41, 41, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 35, 36, 37, 37, 37, 39, + 41, 42, 42, 42, 44, 47, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 33, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42, + 44, 46, 48, 48, 48, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, + 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, + 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, + 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 
31, 31, 33, 34, 35, 35, 35, 36, 38, 39, 39, 39, 40, 41, 42, + 42, 42, 44, 46, 47, 47, 47, 47, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 41, 42, 42, 42, 42, 44, 46, + 46, 46, 46, 46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, + 35, 35, 37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, + 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, + 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30, 30, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40, 40, 41, + 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 31, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 34, 36, 37, 37, 37, 38, 40, 41, 41, 41, 42, 43, 43, 43, 43, + 44, 46, 46, 46, 46, 46, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 38, 38, 38, 40, 41, 42, 42, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, + 47, 46, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, + 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, + 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, + 45, 45, 46, 47, 47, 47, 47, 47, 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, + 37, 38, 40, 40, 40, 40, 42, 44, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, + 47, 47, 47, 47, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, + 42, 42, 44, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, 38, 38, 39, 39, + 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 47, 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, + 43, 44, 44, 
44, 45, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 49, 49, + 49, 48, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, + 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 41, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, + 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, + 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, + 50, 50, 50, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, + 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 51, 51, 51, 51, + 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47, + 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 52, 52, 52, 52, 49, 48, 48, 47, + 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, + 50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, + 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, + 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, + /* Size 4x8 */ + 31, 31, 31, 34, 37, 39, 42, 48, 31, 31, 32, 36, 39, 41, 43, 46, 37, 38, + 40, 43, 46, 47, 47, 48, 48, 47, 46, 47, 47, 48, 50, 53, + /* Size 8x4 */ + 31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, 37, 39, + 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53, + /* Size 8x16 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 31, 31, + 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 31, 31, 31, 32, + 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 33, 34, 34, 34, 35, 35, + 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 37, 38, 38, 39, 40, 40, 42, 43, + 
44, 47, 47, 47, 47, 47, 48, 48, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, + 47, 47, 47, 47, 48, 48, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, + 49, 50, 52, 52, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, + 53, 53, + /* Size 16x8 */ + 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, + 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, 30, 32, 32, 35, + 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 33, 34, 35, 37, 42, 42, + 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 41, 42, + 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, + 47, 47, 50, 51, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, + 52, 53, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, + 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39, + 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, + 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, + 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, + 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, + 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, + 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, + 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, + 44, 44, 45, 46, 47, 47, 47, 47, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, + 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47, + 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, + 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, + 37, 37, 38, 38, 
38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 40, 41, 41, 41, 41, 41, 41, 42, + 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, + 49, 49, 50, 50, 50, 49, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, + 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, + 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, + 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47, + 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, + 50, 50, 51, 52, 53, 53, 53, 53, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31, 31, + 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31, 31, 31, + 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, + 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, + 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, 38, 38, + 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32, 34, 37, 39, 39, 39, 41, + 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34, 38, 39, 39, 39, 42, 44, 46, + 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, + 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, + 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, + 33, 33, 36, 39, 41, 41, 41, 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, + 37, 40, 42, 42, 42, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, + 43, 43, 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, + 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, + 46, 47, 47, 47, 35, 36, 37, 37, 37, 
37, 40, 43, 44, 44, 44, 45, 46, 47, + 47, 47, 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, + 37, 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, + 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, + 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, + 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42, 42, 44, 46, + 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, + 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, + 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, 49, 50, + 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47, 47, 47, 49, 50, 51, 51, 51, + 47, 46, 46, 46, 46, 46, 46, 47, 48, 48, 48, 49, 51, 52, 52, 52, 49, 48, + 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, + 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, + 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, + 47, 47, 47, 49, 52, 53, 53, 53, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, + 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 37, 38, 38, 39, + 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 46, 46, 46, + 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, + /* Size 16x4 */ + 31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, 31, 32, + 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47, 36, 37, 44, 47, + 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, 42, 43, 47, 50, 44, 44, + 47, 51, 48, 46, 48, 53, 48, 46, 48, 53, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, + 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, + 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, + 44, 46, 46, 
46, 46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, + 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, + 47, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, + 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, + 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 45, 45, 45, 45, 45, 45, + 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, + 49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, + 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, + 53, 53, 53, 53, + /* Size 32x8 */ + 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, 31, 31, + 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, + 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, + 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, + 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 31, 33, + 33, 36, 41, 41, 45, 46, 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, + 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, + 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, + 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48, 41, 42, 42, 44, + 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, + 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51, + 47, 46, 46, 46, 48, 48, 51, 52, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, + 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, + 47, 47, 52, 53 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35, + /* Size 8x8 */ + 31, 31, 31, 31, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 
32, 32, 32, 32, + 33, 33, 34, 35, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 34, 34, + 35, 36, 33, 33, 33, 33, 35, 35, 36, 38, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 35, 36, 36, 36, 38, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, + 37, 37, 38, 39, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 
33, 33, 34, 34, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, + 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, + 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, + 35, 35, 36, 36, 36, 36, 36, 37, 38, 38, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 36, 37, 38, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, + 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, + /* Size 4x8 */ + 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 33, 34, 34, 35, 32, 32, 32, 33, 34, 34, 35, 36, + /* Size 8x4 */ + 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, + 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 
32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, + 36, 38, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, + 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, + 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, + 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, + 37, 38, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 
33, 34, 34, 34, 34, 34, 34, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, + 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, + 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, + 37, 37, 37, 37, 38, 38, 39, 39, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 
33, 33, 33, 33, + 33, 33, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, + 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, + 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, + 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, + 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 36, 38, 33, 33, 33, 33, + 33, 33, 33, 34, 34, 35, 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, + 34, 34, 35, 36, 37, 37, 37, 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, + 35, 36, 37, 37, 37, 37, 38, 39, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, + /* Size 16x4 */ + 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, 32, 32, 33, 34, + 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, 32, 33, 34, 35, 32, 33, + 34, 35, 33, 33, 35, 36, 34, 34, 36, 37, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, + 36, 37, 38, 38, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, + 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, + 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, + 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, + 33, 34, 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, + 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, + 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36, 32, 32, 33, 33, 34, 35, + 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, + 32, 32, 33, 33, 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, + 33, 33, 34, 36, 36, 37, 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, + 35, 37, 37, 38 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47, + /* Size 8x8 */ + 31, 31, 31, 30, 34, 35, 37, 40, 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, + 31, 31, 
35, 36, 39, 41, 30, 31, 31, 32, 35, 36, 40, 42, 34, 34, 35, 35, + 39, 40, 43, 44, 35, 35, 36, 36, 40, 41, 44, 45, 37, 38, 39, 40, 43, 44, + 47, 47, 40, 41, 41, 42, 44, 45, 47, 48, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 35, 35, 37, 39, 39, 40, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, + 35, 38, 40, 40, 41, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, + 40, 40, 41, 42, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, + 41, 43, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, 43, 44, + 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 33, 34, + 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 35, 36, 37, 37, + 37, 38, 38, 38, 41, 41, 41, 44, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, 36, 37, 38, 38, 39, 40, 40, 40, + 42, 43, 43, 46, 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, + 44, 46, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, + 47, 47, 48, 48, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 41, 41, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 32, 33, 34, 34, 34, 34, 35, + 36, 37, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37, + 37, 38, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 34, 34, 34, 34, 34, 35, 36, 38, 38, 38, 38, 38, 40, 41, + 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, + 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 
34, 34, 34, + 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, + 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 38, 39, + 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, + 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, + 35, 35, 35, 36, 37, 39, 39, 39, 39, 40, 40, 41, 42, 42, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, + 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, + 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, + 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, + 36, 37, 38, 40, 40, 40, 40, 41, 41, 42, 43, 43, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 38, 39, 41, + 41, 41, 41, 42, 42, 43, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 42, 42, 43, + 43, 44, 44, 44, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, + 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, + 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, + 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, + 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 
43, 43, 43, + 43, 43, 44, 44, 45, 45, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, + 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 44, 44, 44, 44, 44, 45, 45, + 45, 45, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, + 41, 41, 41, 41, 41, 42, 44, 45, 46, 46, 46, 46, 46, 46, 46, 46, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43, + 43, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, + 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, + 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, + 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, + 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, + 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 40, 40, 40, 41, 41, 41, 41, 41, + 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47, + 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, + 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, + 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, + /* Size 4x8 */ + 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, 32, 32, 36, 37, 40, 42, 35, 36, + 37, 37, 40, 42, 45, 45, 37, 38, 39, 40, 43, 44, 47, 47, + /* Size 8x4 */ + 31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, 34, 36, + 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 
31, 31, 31, 31, 32, 32, + 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34, 34, 35, 35, 35, + 37, 38, 38, 41, 43, 43, 43, 44, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, + 43, 45, 47, 47, 47, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, + 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, + 47, 48, + /* Size 16x8 */ + 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, + 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, + 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, + 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 33, 34, 35, 35, 37, 42, 42, 43, + 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 35, 37, + 38, 38, 41, 45, 45, 46, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, + 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, + 47, 48, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, + 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, + 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42, + 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, + 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, + 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, + 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41, + 42, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 
35, 35, + 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, + 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40, + 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, 37, 37, 38, 38, + 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, + 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, + 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, + 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, + 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, + 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, + 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47, + 47, 47, 47, 47, 48, 48, 48, 48, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31, 31, 31, + 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, + 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, + 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, + 40, 42, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 39, 39, 39, 39, 40, 42, + 30, 31, 31, 32, 32, 32, 32, 32, 34, 37, 39, 39, 39, 39, 40, 42, 30, 31, + 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, + 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, + 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, + 35, 37, 40, 40, 40, 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, + 40, 40, 40, 
40, 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, + 41, 41, 42, 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, + 43, 44, 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, + 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, + 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, 35, + 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37, 37, 37, + 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38, 38, 38, 39, + 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39, 39, 40, 42, 44, + 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, + 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, + 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, + 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, 39, 39, + 40, 41, 41, 41, 41, 42, 43, 45, 47, 47, 47, 47, 47, 48, 40, 41, 41, 42, + 42, 42, 42, 42, 44, 45, 47, 47, 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, + 43, 43, 44, 46, 47, 47, 47, 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, + 44, 46, 47, 47, 47, 47, 48, 48, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, 31, 31, + 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 35, 35, 36, 36, + 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, 37, 38, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, + /* Size 16x4 */ + 31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, 31, 32, + 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, 33, 35, 40, 42, + 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, 38, 40, 45, 47, 38, 40, + 45, 47, 39, 41, 45, 47, 42, 43, 46, 47, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, + 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 
31, 31, + 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, + 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, + 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, + 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 37, 37, + 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, + 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, + 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, + 47, 47, 48, 48, + /* Size 32x8 */ + 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, 31, 31, + 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, + 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, + 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, 34, 39, 39, 40, + 30, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, + 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, + 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, + 41, 42, 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, + 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, + 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45, 35, 37, 38, 38, + 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47, 37, 39, 40, 40, 43, 47, + 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 37, 39, 40, 40, 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, + 42, 42, 44, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, + 44, 47, 47, 48 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 32, 32, 
31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 
31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + /* Size 4x8 */ + 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + /* Size 8x4 */ + 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, + 33, 34, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 
31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 34, 34, 34, 34, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 
31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + /* Size 16x4 */ + 31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, + 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, + 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, + 32, 32, 33, 34 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39, + /* 
Size 8x8 */ + 31, 31, 31, 31, 30, 31, 33, 33, 31, 31, 31, 31, 31, 32, 34, 34, 31, 31, + 31, 31, 31, 32, 34, 34, 31, 31, 31, 31, 31, 32, 35, 35, 30, 31, 31, 31, + 32, 32, 35, 35, 31, 32, 32, 32, 32, 33, 36, 36, 33, 34, 34, 35, 35, 36, + 39, 39, 33, 34, 34, 35, 35, 36, 39, 39, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, + 34, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, + 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, + 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 30, 30, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 34, 36, 37, 37, 37, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, + 38, 39, 39, 39, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, + 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 32, 32, 33, 34, + 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, + 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 33, 33, 
34, 34, 34, 34, 34, 34, 35, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, + 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, + 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 35, + 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, + 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, + 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 35, 35, 35, 35, 35, 35, 
36, 30, 30, 30, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, + 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 36, 36, 36, 36, 36, + 36, 37, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 33, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, + 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, + 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, + 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, + 39, 40, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, + /* Size 4x8 */ + 31, 31, 31, 31, 31, 31, 34, 34, 31, 31, 31, 32, 32, 33, 36, 36, 31, 31, + 31, 32, 32, 33, 36, 36, 34, 35, 35, 36, 36, 37, 40, 40, + /* Size 8x4 */ + 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, + 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31, + 31, 31, 31, 31, 31, 31, 
31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 33, 35, 36, 36, 36, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, + 37, 38, 38, 38, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, + 41, 41, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, + 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, + 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, + 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 31, 32, + 33, 33, 33, 33, 36, 39, 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, + 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, + 38, 41, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, + 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, + 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 33, 33, 34, 35, 36, 36, 36, + 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37, + 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, + 38, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, 35, 35, + 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, + 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, 37, 37, 37, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, + 42, 43, 43, 43, 43, 43, 43, 44, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, + 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, + 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 33, 34, 36, 37, 39, 31, 31, 31, 31, 31, 
32, 32, 32, + 32, 32, 32, 33, 34, 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 33, 34, 36, 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, + 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, + 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32, 32, 33, + 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33, 33, 33, 33, + 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, + 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, + 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, + 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, + 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, + 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, + 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, + 36, 36, 36, 38, 39, 40, 42, 44, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 34, 34, 35, 35, 35, 35, + 36, 36, 36, 36, 36, 37, 39, 40, 40, 40, + /* Size 16x4 */ + 31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 31, + 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, + 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37, 33, 35, 35, 39, 34, 36, + 36, 40, 34, 36, 36, 40, 34, 36, 36, 40, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, + 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, + 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, + 41, 41, 41, 42, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, 31, 31, + 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, + 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, + 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, + 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, + 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, + 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, + 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38, 31, 32, 33, 33, + 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40, 33, 34, 34, 35, 35, 35, + 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, + 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, + 36, 36, 39, 42 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 31, 31, 
31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x8 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 
31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32 }, + { 
/* Chroma */ + /* Size 4x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + /* Size 4x8 */ + 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32, + /* Size 8x16 */ 
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, + 32, 32, 31, 31, 32, 32, 30, 31, 32, 32, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 
31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, + 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 
32, 30, 31, 31, 31, + 32, 32, 32, 32 }, + }, +}; + +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { + { + { /* Luma */ + /* Size 4x4 */ + 32, 24, 14, 11, 24, 15, 11, 9, 14, 11, 7, 7, 11, 9, 7, 5, + /* Size 8x8 */ + 32, 32, 27, 20, 15, 12, 11, 9, 32, 29, 26, 21, 16, 13, 12, 10, 27, 26, + 19, 16, 13, 11, 10, 10, 20, 21, 16, 12, 11, 9, 9, 8, 15, 16, 13, 11, 9, + 8, 7, 7, 12, 13, 11, 9, 8, 7, 6, 6, 11, 12, 10, 9, 7, 6, 6, 5, 9, 10, + 10, 8, 7, 6, 5, 5, + /* Size 16x16 */ + 32, 33, 33, 30, 28, 23, 21, 17, 16, 13, 12, 11, 11, 10, 9, 9, 33, 32, + 32, 31, 30, 25, 23, 19, 17, 14, 14, 12, 11, 11, 10, 9, 33, 32, 31, 29, + 28, 24, 23, 19, 17, 14, 14, 13, 12, 11, 10, 10, 30, 31, 29, 26, 24, 22, + 20, 18, 16, 14, 13, 13, 12, 11, 11, 10, 28, 30, 28, 24, 21, 19, 18, 16, + 15, 13, 13, 12, 11, 11, 10, 10, 23, 25, 24, 22, 19, 16, 15, 14, 13, 11, + 11, 11, 10, 10, 9, 9, 21, 23, 23, 20, 18, 15, 14, 13, 12, 11, 10, 10, 9, + 9, 9, 9, 17, 19, 19, 18, 16, 14, 13, 11, 10, 9, 9, 9, 9, 8, 8, 8, 16, + 17, 17, 16, 15, 13, 12, 10, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, + 11, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 7, + 7, 7, 7, 7, 6, 6, 11, 12, 13, 13, 12, 11, 10, 9, 8, 7, 7, 6, 6, 6, 6, 6, + 11, 11, 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 10, 11, 11, 11, + 11, 10, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 8, 7, + 6, 6, 5, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, 5, 5, 4, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 30, 29, 28, 26, 23, 22, 21, 19, 17, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32, + 32, 30, 30, 29, 27, 24, 23, 22, 20, 18, 17, 17, 15, 13, 13, 13, 12, 12, + 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, 32, 31, 30, 30, 28, + 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, 12, 11, 11, 11, 10, + 10, 9, 9, 9, 33, 32, 32, 32, 31, 31, 30, 29, 29, 27, 25, 24, 23, 21, 19, + 18, 17, 16, 14, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, + 32, 
32, 31, 31, 30, 29, 28, 28, 26, 24, 23, 23, 20, 19, 18, 17, 16, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 32, 32, 32, 31, 30, + 29, 28, 28, 27, 26, 24, 23, 22, 21, 19, 19, 18, 16, 15, 15, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 10, 10, 9, 29, 30, 30, 29, 28, 28, 25, 24, 23, 22, 20, 20, 19, + 18, 17, 16, 16, 15, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 26, 27, 28, + 27, 26, 26, 23, 22, 20, 19, 18, 17, 17, 16, 15, 14, 14, 13, 12, 12, 12, + 11, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 23, 24, 25, 25, 24, 24, 22, + 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 22, 23, 24, 24, 23, 23, 21, 20, 19, 17, 16, 15, + 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 8, 21, 22, 23, 23, 23, 22, 20, 19, 18, 17, 15, 15, 14, 13, 13, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 19, 20, 21, 21, 20, + 21, 19, 18, 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, + 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, + 13, 12, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 17, + 17, 18, 18, 18, 19, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 9, + 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 16, 17, 17, 17, 17, 18, 16, 16, + 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 7, 7, 7, 14, 15, 16, 16, 16, 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, + 9, 9, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, + 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 13, 13, 12, 11, 11, 11, 10, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 13, 14, 14, + 14, 14, 13, 13, 
13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 6, 6, 6, 6, 6, 12, 12, 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, + 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, + 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, + 12, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 11, 11, 10, 10, 10, 9, + 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 10, 11, 11, + 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, + 5, 5, 5, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, + 10, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, + 5, 5, 5, 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 4, 9, 9, 9, 10, 10, 10, 10, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, + 4, 4, 8, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, + 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4, + /* Size 4x8 */ + 32, 31, 28, 21, 16, 13, 11, 10, 24, 24, 18, 14, 12, 11, 10, 9, 14, 15, + 12, 10, 8, 7, 7, 7, 11, 12, 11, 9, 8, 7, 6, 5, + /* Size 8x4 */ + 32, 24, 14, 11, 31, 24, 15, 12, 28, 18, 12, 11, 21, 14, 10, 9, 16, 12, + 8, 8, 13, 11, 7, 7, 11, 10, 7, 6, 10, 9, 7, 5, + /* Size 8x16 */ + 32, 33, 32, 30, 28, 23, 21, 18, 16, 13, 12, 11, 11, 10, 9, 9, 32, 31, + 30, 28, 27, 24, 22, 19, 18, 15, 14, 13, 12, 11, 10, 10, 28, 30, 28, 24, + 21, 19, 18, 16, 15, 13, 13, 12, 11, 10, 10, 10, 19, 21, 20, 19, 17, 14, + 13, 12, 11, 10, 10, 10, 10, 9, 9, 9, 16, 17, 17, 16, 15, 13, 12, 10, 10, + 9, 8, 8, 8, 8, 7, 8, 12, 13, 13, 13, 12, 11, 10, 9, 8, 7, 7, 7, 7, 7, 6, + 7, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 6, 6, 6, 6, 
6, 10, 11, 12, + 12, 11, 11, 10, 9, 8, 8, 7, 6, 6, 6, 5, 5, + /* Size 16x8 */ + 32, 32, 28, 19, 16, 12, 11, 10, 33, 31, 30, 21, 17, 13, 12, 11, 32, 30, + 28, 20, 17, 13, 12, 12, 30, 28, 24, 19, 16, 13, 13, 12, 28, 27, 21, 17, + 15, 12, 12, 11, 23, 24, 19, 14, 13, 11, 11, 11, 21, 22, 18, 13, 12, 10, + 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 16, 18, 15, 11, 10, 8, 8, 8, 13, + 15, 13, 10, 9, 7, 8, 8, 12, 14, 13, 10, 8, 7, 7, 7, 11, 13, 12, 10, 8, + 7, 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 6, 9, 10, + 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, + /* Size 16x32 */ + 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32, + 31, 30, 30, 30, 28, 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, + 12, 12, 11, 11, 11, 10, 10, 9, 9, 32, 32, 31, 31, 30, 29, 28, 28, 27, + 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, 23, 21, 21, 20, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 23, 24, 25, + 25, 24, 24, 21, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 19, 18, + 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, + 9, 9, 9, 9, 9, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, 13, 12, + 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 9, 8, 8, 8, 8, 16, 17, 17, + 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, + 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 13, 14, 14, 14, 14, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 12, 12, 13, 13, 13, 14, 13, 12, 12, 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 11, 12, 12, 13, 13, 13, 13, 12, 12, + 12, 11, 10, 10, 
10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, + 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, + 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, + 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, + 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, + 5, 5, 5, 5, + /* Size 32x16 */ + 32, 33, 32, 30, 28, 23, 19, 17, 16, 13, 12, 11, 11, 11, 10, 10, 33, 32, + 32, 30, 29, 24, 20, 18, 17, 14, 12, 12, 12, 11, 11, 11, 33, 32, 31, 31, + 30, 25, 21, 19, 17, 14, 13, 12, 12, 11, 11, 11, 33, 32, 31, 30, 29, 25, + 21, 19, 17, 14, 13, 13, 12, 12, 11, 11, 32, 32, 30, 29, 28, 24, 20, 19, + 17, 14, 13, 13, 12, 12, 12, 11, 32, 31, 29, 28, 27, 24, 21, 19, 18, 15, + 14, 13, 12, 12, 12, 11, 30, 30, 28, 26, 24, 21, 19, 18, 16, 14, 13, 13, + 13, 12, 12, 11, 29, 30, 28, 25, 23, 20, 18, 17, 16, 13, 12, 12, 12, 12, + 12, 11, 28, 30, 27, 24, 21, 19, 17, 16, 15, 13, 12, 12, 12, 12, 11, 11, + 26, 28, 26, 23, 20, 18, 16, 15, 14, 12, 12, 12, 11, 11, 11, 11, 23, 25, + 24, 21, 19, 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 11, 22, 24, 23, 21, + 19, 16, 14, 13, 12, 11, 10, 10, 10, 10, 10, 10, 21, 23, 22, 20, 18, 15, + 13, 13, 12, 11, 10, 10, 10, 10, 10, 10, 19, 21, 20, 19, 17, 14, 12, 12, + 11, 10, 9, 10, 10, 9, 10, 9, 18, 19, 19, 18, 16, 14, 12, 11, 10, 9, 9, + 9, 9, 9, 9, 9, 17, 18, 18, 17, 16, 13, 12, 11, 10, 9, 9, 9, 9, 9, 9, 9, + 16, 17, 18, 16, 15, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 15, + 14, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 10, 9, + 9, 8, 7, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 11, 10, 9, 9, 8, 7, 7, 7, 7, + 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, + 13, 12, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 10, 10, 9, + 8, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, 12, 11, 10, 10, 9, 8, 7, 
7, 6, 6, 6, + 6, 6, 11, 12, 12, 12, 11, 10, 10, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 12, + 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, + 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, + 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 10, 10, 10, + 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 8, 7, 7, + 6, 6, 5, 5, 5, 8, 9, 9, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, + /* Size 4x16 */ + 33, 32, 32, 30, 30, 25, 23, 19, 17, 14, 14, 12, 12, 11, 10, 9, 23, 25, + 24, 21, 19, 16, 15, 14, 13, 11, 11, 10, 10, 10, 9, 9, 13, 14, 14, 14, + 13, 11, 11, 9, 9, 8, 8, 7, 7, 7, 7, 7, 11, 11, 12, 12, 12, 11, 10, 9, 8, + 8, 7, 6, 6, 6, 5, 5, + /* Size 16x4 */ + 33, 23, 13, 11, 32, 25, 14, 11, 32, 24, 14, 12, 30, 21, 14, 12, 30, 19, + 13, 12, 25, 16, 11, 11, 23, 15, 11, 10, 19, 14, 9, 9, 17, 13, 9, 8, 14, + 11, 8, 8, 14, 11, 8, 7, 12, 10, 7, 6, 12, 10, 7, 6, 11, 10, 7, 6, 10, 9, + 7, 5, 9, 9, 7, 5, + /* Size 8x32 */ + 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 32, 32, 31, 31, 30, + 29, 28, 28, 27, 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 28, 29, 30, 29, 28, 27, 24, 23, 21, + 20, 19, 19, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 10, 10, 19, 20, 21, 21, 20, 21, 19, 18, 17, 16, 14, 14, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 16, + 17, 17, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 12, 12, 13, 13, 13, 14, 13, 12, 12, + 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, + 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, + 5, 5, + /* Size 32x8 */ + 32, 32, 28, 19, 
16, 12, 11, 10, 33, 32, 29, 20, 17, 12, 12, 11, 33, 31, + 30, 21, 17, 13, 12, 11, 33, 31, 29, 21, 17, 13, 12, 11, 32, 30, 28, 20, + 17, 13, 12, 12, 32, 29, 27, 21, 18, 14, 12, 12, 30, 28, 24, 19, 16, 13, + 13, 12, 29, 28, 23, 18, 16, 12, 12, 12, 28, 27, 21, 17, 15, 12, 12, 11, + 26, 26, 20, 16, 14, 12, 11, 11, 23, 24, 19, 14, 13, 11, 11, 11, 22, 23, + 19, 14, 12, 10, 10, 10, 21, 22, 18, 13, 12, 10, 10, 10, 19, 20, 17, 12, + 11, 9, 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 17, 18, 16, 12, 10, 9, 9, 9, + 16, 18, 15, 11, 10, 8, 8, 8, 14, 16, 14, 11, 9, 8, 8, 8, 13, 15, 13, 10, + 9, 7, 8, 8, 13, 14, 13, 10, 9, 7, 7, 7, 12, 14, 13, 10, 8, 7, 7, 7, 12, + 13, 12, 9, 8, 7, 7, 7, 11, 13, 12, 10, 8, 7, 6, 6, 11, 12, 11, 10, 8, 7, + 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, 10, + 9, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, 7, 6, 6, 5, 9, + 10, 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 8, 9, 10, 9, 8, 7, 6, + 5 }, + { /* Chroma */ + /* Size 4x4 */ + 29, 22, 18, 16, 22, 17, 15, 14, 18, 15, 11, 11, 16, 14, 11, 9, + /* Size 8x8 */ + 33, 27, 22, 20, 18, 16, 15, 14, 27, 22, 22, 22, 20, 18, 17, 15, 22, 22, + 19, 18, 17, 16, 15, 15, 20, 22, 18, 16, 14, 13, 14, 14, 18, 20, 17, 14, + 12, 12, 12, 12, 16, 18, 16, 13, 12, 11, 11, 11, 15, 17, 15, 14, 12, 11, + 10, 10, 14, 15, 15, 14, 12, 11, 10, 9, + /* Size 16x16 */ + 32, 34, 31, 25, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 34, 32, + 29, 24, 22, 23, 22, 21, 20, 18, 18, 17, 16, 15, 15, 14, 31, 29, 26, 23, + 22, 23, 22, 21, 20, 18, 18, 17, 17, 16, 16, 15, 25, 24, 23, 21, 20, 21, + 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 21, 22, 22, 20, 19, 19, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 23, 21, 19, 18, 17, 17, 16, 15, + 15, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, 17, 16, 15, 14, 14, 14, + 14, 14, 14, 14, 19, 21, 21, 20, 19, 17, 16, 14, 14, 13, 13, 13, 13, 13, + 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, + 16, 18, 18, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 
12, 12, 12, 16, 18, + 18, 18, 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, + 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, 10, 15, 16, 17, 17, 16, 15, + 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, + 12, 12, 11, 10, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 12, 12, + 11, 10, 10, 9, 9, 9, 13, 14, 15, 15, 16, 15, 14, 13, 12, 12, 11, 10, 10, + 9, 9, 9, + /* Size 32x32 */ + 32, 33, 34, 32, 31, 28, 25, 23, 21, 21, 21, 20, 20, 20, 19, 18, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 33, 31, + 30, 27, 24, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 34, 33, 32, 31, 29, 26, 24, 23, + 22, 23, 23, 23, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 15, 14, 14, 14, 32, 31, 31, 29, 28, 25, 24, 23, 22, 22, 23, 22, + 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, + 15, 15, 31, 30, 29, 28, 26, 24, 23, 22, 22, 22, 23, 22, 22, 22, 21, 20, + 20, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 28, 27, + 26, 25, 24, 22, 22, 22, 21, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 25, 24, 24, 24, 23, 22, + 21, 21, 20, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, + 17, 17, 17, 16, 16, 16, 15, 15, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, + 16, 16, 16, 16, 21, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 21, 22, 23, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, + 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 21, 22, 23, 23, + 23, 23, 21, 20, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 22, 23, 22, 22, 22, 21, 20, + 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15, + 14, 14, 14, 14, 14, 14, 
20, 21, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 20, 20, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, + 15, 14, 14, 14, 14, 13, 14, 14, 13, 14, 14, 13, 14, 14, 13, 13, 19, 20, + 21, 21, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 20, 20, 20, 21, + 20, 19, 18, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, + 13, 13, 13, 13, 13, 13, 13, 12, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 17, 18, 19, 19, 19, 20, 19, 18, 18, 17, 16, 15, 15, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 18, 18, 18, 19, 18, 17, + 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, + 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 15, 16, + 16, 17, 17, 17, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 16, 16, 15, + 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 16, 16, 15, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 10, 14, + 15, 15, 16, 16, 16, 16, 
16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, + 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 14, 14, 14, 15, 15, 15, 16, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 9, 9, 13, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 13, 14, 14, + 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, + 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + /* Size 4x8 */ + 33, 26, 22, 21, 19, 17, 16, 15, 22, 23, 18, 17, 16, 15, 15, 14, 17, 19, + 16, 14, 12, 11, 11, 12, 16, 17, 16, 14, 12, 11, 10, 10, + /* Size 8x4 */ + 33, 22, 17, 16, 26, 23, 19, 17, 22, 18, 16, 16, 21, 17, 14, 14, 19, 16, + 12, 12, 17, 15, 11, 11, 16, 15, 11, 10, 15, 14, 12, 10, + /* Size 8x16 */ + 32, 34, 31, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 28, 26, + 24, 22, 21, 22, 22, 21, 20, 19, 18, 17, 17, 16, 15, 15, 21, 22, 22, 20, + 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 16, 15, 20, 21, 22, 20, 19, 17, + 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 18, 20, 20, 19, 18, 16, 15, 14, + 13, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 17, 17, 15, 14, 13, 12, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 14, 13, 13, 12, 11, 10, + 10, 10, 10, 10, 14, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, + 10, 9, + /* Size 16x8 */ + 32, 28, 21, 20, 18, 16, 15, 14, 34, 26, 22, 21, 20, 17, 16, 16, 31, 24, + 22, 22, 20, 17, 17, 16, 24, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, + 18, 17, 17, 17, 21, 22, 19, 17, 16, 15, 16, 16, 20, 22, 19, 16, 15, 14, + 14, 15, 19, 21, 19, 15, 14, 13, 13, 14, 18, 20, 18, 15, 13, 12, 13, 13, + 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, 17, 14, 12, 11, 11, 12, 15, 17, + 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, + 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 13, 15, 15, 14, 12, 11, + 10, 9, + /* Size 
16x32 */ + 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 32, 30, + 28, 26, 24, 23, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 28, 27, 26, 25, 24, 22, 22, 22, + 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 15, 15, 15, 15, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, + 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, + 16, 15, 21, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 16, 16, 16, 16, 15, 15, 21, 22, + 23, 23, 22, 23, 21, 20, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, + 15, 15, 14, 15, 15, 15, 15, 15, 15, 14, 14, 14, 20, 20, 21, 21, 22, 22, + 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 19, 20, 20, 21, 21, 21, 20, 19, 19, 17, + 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, + 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, + 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 12, 12, 11, 12, 12, 12, 12, 12, 11, 16, 16, 17, 17, + 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 11, 10, + 10, 10, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, + 16, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 14, 15, 15, 16, 16, 
16, 16, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, + 10, 10, 10, 10, 9, 9, 9, + /* Size 32x16 */ + 32, 33, 28, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 14, 14, 33, 33, + 27, 24, 22, 22, 20, 20, 19, 17, 16, 16, 16, 16, 15, 15, 34, 32, 26, 24, + 22, 23, 21, 20, 20, 18, 17, 17, 16, 16, 16, 15, 32, 30, 25, 23, 22, 23, + 21, 21, 20, 18, 17, 17, 17, 16, 16, 16, 31, 28, 24, 23, 22, 22, 22, 21, + 20, 18, 17, 17, 17, 17, 16, 16, 28, 26, 22, 22, 22, 23, 22, 21, 20, 19, + 18, 18, 17, 17, 17, 16, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 17, 18, + 17, 17, 17, 16, 23, 23, 22, 21, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, + 17, 17, 21, 22, 21, 20, 19, 19, 19, 19, 18, 17, 17, 16, 17, 16, 17, 17, + 21, 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 16, 21, 23, + 22, 21, 19, 18, 17, 17, 16, 15, 15, 15, 16, 16, 16, 16, 21, 22, 22, 21, + 19, 17, 17, 16, 16, 15, 14, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, + 16, 16, 15, 14, 14, 14, 14, 15, 15, 15, 20, 21, 22, 20, 19, 17, 16, 15, + 14, 14, 13, 14, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 15, 14, 14, 13, + 13, 13, 13, 14, 14, 14, 19, 20, 21, 20, 18, 16, 15, 14, 14, 13, 12, 13, + 13, 13, 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 13, 13, + 13, 13, 17, 19, 20, 19, 18, 16, 14, 14, 13, 12, 12, 12, 12, 12, 13, 13, + 16, 18, 19, 18, 17, 15, 14, 13, 12, 12, 11, 12, 12, 12, 12, 13, 16, 18, + 19, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 17, 18, 18, + 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 12, 12, 15, 17, 18, 17, 16, 15, + 13, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 14, 14, 13, + 12, 11, 11, 11, 10, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 12, 12, + 11, 10, 10, 10, 11, 11, 15, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, + 10, 10, 10, 11, 14, 16, 16, 17, 15, 15, 14, 13, 12, 11, 11, 10, 10, 10, + 10, 10, 14, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, + 14, 16, 16, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, + 15, 16, 16, 15, 14, 
13, 12, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, + 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 16, 15, 14, + 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 15, 15, 14, 14, 13, + 13, 11, 11, 10, 10, 9, 9, 9, + /* Size 4x16 */ + 33, 32, 28, 24, 22, 23, 22, 20, 20, 18, 17, 17, 16, 16, 15, 15, 21, 23, + 22, 21, 19, 18, 17, 17, 16, 15, 15, 14, 15, 15, 15, 14, 16, 18, 18, 18, + 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 15, 16, 17, 17, 16, 16, + 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, + /* Size 16x4 */ + 33, 21, 16, 15, 32, 23, 18, 16, 28, 22, 18, 17, 24, 21, 18, 17, 22, 19, + 17, 16, 23, 18, 15, 16, 22, 17, 14, 15, 20, 17, 13, 14, 20, 16, 12, 13, + 18, 15, 12, 12, 17, 15, 11, 11, 17, 14, 11, 11, 16, 15, 12, 10, 16, 15, + 12, 10, 15, 15, 12, 10, 15, 14, 12, 10, + /* Size 8x32 */ + 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 28, 27, 26, 25, + 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 22, 22, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, + 16, 16, 16, 16, 15, 15, 20, 20, 21, 21, 22, 22, 20, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 16, 16, + 17, 17, 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, + 17, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, + 10, 9, 9, 9, + /* Size 32x8 */ + 32, 28, 21, 20, 18, 16, 15, 14, 33, 27, 22, 20, 19, 16, 16, 15, 34, 26, + 22, 21, 20, 17, 16, 16, 32, 25, 22, 21, 20, 
17, 17, 16, 31, 24, 22, 22, + 20, 17, 17, 16, 28, 22, 22, 22, 20, 18, 17, 17, 24, 22, 20, 20, 19, 17, + 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17, + 21, 22, 19, 18, 17, 16, 16, 16, 21, 22, 19, 17, 16, 15, 16, 16, 21, 22, + 19, 17, 16, 14, 15, 15, 20, 22, 19, 16, 15, 14, 14, 15, 20, 22, 19, 16, + 14, 13, 14, 14, 19, 21, 19, 15, 14, 13, 13, 14, 19, 21, 18, 15, 14, 12, + 13, 13, 18, 20, 18, 15, 13, 12, 13, 13, 17, 20, 18, 14, 13, 12, 12, 13, + 16, 19, 17, 14, 12, 11, 12, 12, 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, + 17, 14, 12, 11, 11, 12, 15, 18, 16, 13, 12, 11, 11, 11, 15, 17, 16, 14, + 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, + 10, 10, 14, 16, 15, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, + 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 14, 15, + 16, 14, 12, 11, 10, 9, 13, 15, 15, 14, 12, 11, 10, 9, 13, 15, 15, 14, + 13, 11, 10, 9 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 25, 15, 11, 25, 16, 12, 10, 15, 12, 8, 7, 11, 10, 7, 6, + /* Size 8x8 */ + 32, 32, 28, 22, 17, 13, 11, 10, 32, 29, 26, 22, 18, 14, 12, 11, 28, 26, + 20, 17, 14, 12, 11, 10, 22, 22, 17, 14, 12, 10, 10, 9, 17, 18, 14, 12, + 10, 8, 8, 8, 13, 14, 12, 10, 8, 7, 7, 7, 11, 12, 11, 10, 8, 7, 6, 6, 10, + 11, 10, 9, 8, 7, 6, 5, + /* Size 16x16 */ + 32, 33, 33, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 33, 32, + 32, 31, 30, 28, 23, 20, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, 31, 30, + 28, 26, 23, 20, 18, 16, 14, 13, 12, 12, 11, 10, 32, 31, 30, 28, 26, 24, + 22, 20, 18, 16, 14, 13, 13, 12, 11, 10, 28, 30, 28, 26, 21, 20, 18, 17, + 16, 14, 13, 12, 12, 11, 11, 10, 26, 28, 26, 24, 20, 19, 17, 16, 15, 13, + 12, 12, 11, 11, 10, 10, 22, 23, 23, 22, 18, 17, 15, 14, 13, 12, 11, 10, + 10, 10, 9, 9, 19, 20, 20, 20, 17, 16, 14, 12, 12, 11, 10, 9, 9, 9, 9, 8, + 17, 18, 18, 18, 16, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, + 16, 14, 13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, + 10, 9, 8, 8, 
7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 10, 9, 9, 8, 7, 7, + 7, 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, + 11, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, 10, 10, 11, 11, 11, + 10, 9, 9, 8, 8, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 10, 9, 8, 8, 7, 7, + 6, 6, 5, 5, 5, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, + 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 18, 17, 15, 14, 13, 13, 12, + 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 32, 31, 31, 30, + 28, 28, 25, 23, 22, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 31, 31, 30, 29, 28, 27, 25, 23, + 23, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 10, 33, 32, 32, 32, 31, 30, 30, 29, 28, 27, 26, 24, 23, 22, 20, 19, 18, + 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 32, 32, 32, + 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 21, 19, 19, 18, 16, 16, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, + 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, + 22, 20, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, + 11, 11, 10, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, + 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 27, + 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 15, 14, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 26, 27, 28, 27, 26, + 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 10, 10, 10, 10, 10, 23, 24, 25, 25, 24, 24, 23, 22, 19, + 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 
10, 9, 9, 9, 9, + 21, 22, 22, 23, 22, 22, 21, 20, 18, 17, 17, 15, 14, 14, 13, 13, 12, 12, + 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 20, 21, 20, + 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, + 9, 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, + 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 17, 18, 18, 18, 18, 19, 18, 17, 16, 15, 15, 13, 13, 12, 12, 11, 11, 10, + 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, 16, 16, 15, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 13, 14, 15, 15, + 15, 16, 15, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, + 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 14, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, + 13, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 12, 13, 13, 13, 14, 13, 13, 12, 12, + 12, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 7, 6, 6, 6, 6, 6, + 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 13, 13, 12, + 12, 11, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, + 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 10, 11, 11, 12, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + 5, 5, 5, 10, 11, 11, 11, 11, 11, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, + 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, + 5, 5, 5, 9, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, 10, 
10, + 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, + 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, + /* Size 4x8 */ + 32, 31, 28, 22, 17, 14, 12, 10, 24, 24, 18, 15, 13, 11, 11, 10, 15, 16, + 13, 11, 9, 8, 8, 8, 12, 12, 12, 10, 8, 7, 6, 6, + /* Size 8x4 */ + 32, 24, 15, 12, 31, 24, 16, 12, 28, 18, 13, 12, 22, 15, 11, 10, 17, 13, + 9, 8, 14, 11, 8, 7, 12, 11, 8, 6, 10, 10, 8, 6, + /* Size 8x16 */ + 32, 33, 32, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 32, 32, + 30, 29, 28, 26, 23, 20, 18, 16, 15, 13, 12, 12, 11, 10, 28, 29, 28, 26, + 21, 20, 18, 17, 16, 14, 13, 12, 12, 11, 11, 10, 22, 23, 23, 22, 18, 17, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 17, 15, 14, 12, 11, 10, + 9, 9, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, + 7, 7, 11, 12, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 6, 11, 11, 12, + 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, + /* Size 16x8 */ + 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 32, 30, + 28, 23, 17, 14, 13, 12, 32, 29, 26, 22, 17, 14, 13, 12, 28, 28, 21, 18, + 15, 13, 12, 12, 26, 26, 20, 17, 14, 12, 11, 11, 22, 23, 18, 15, 12, 11, + 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 17, 18, 16, 13, 10, 9, 9, 9, 14, + 16, 14, 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, + 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, + 11, 9, 8, 7, 6, 6, 9, 10, 10, 9, 8, 7, 6, 5, + /* Size 16x32 */ + 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 33, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 18, 17, 16, 15, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 32, 32, 32, 31, 30, 29, 29, 28, + 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, + 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, + 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 
13, 12, 12, 12, 11, 11, + 11, 10, 28, 29, 29, 29, 28, 27, 26, 24, 21, 21, 20, 19, 18, 18, 17, 16, + 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 23, 24, + 25, 25, 24, 24, 23, 21, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, + 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, + 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, + 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, + 8, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 7, 7, 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 11, 11, + 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, + 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, + 7, 7, 7, 6, 6, 7, 7, 6, 6, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, + 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, 12, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, + 5, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, + 8, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, + /* Size 32x16 */ + 32, 33, 32, 32, 28, 23, 22, 19, 16, 14, 13, 12, 11, 11, 11, 10, 33, 32, + 32, 31, 29, 24, 23, 20, 17, 15, 14, 12, 12, 12, 11, 11, 33, 32, 32, 31, + 29, 25, 23, 21, 17, 15, 14, 13, 12, 12, 11, 11, 33, 32, 31, 31, 29, 25, + 23, 21, 17, 16, 14, 13, 12, 12, 12, 11, 32, 32, 30, 30, 28, 24, 23, 20, + 17, 16, 14, 13, 13, 12, 12, 11, 32, 31, 29, 28, 27, 24, 23, 21, 18, 16, + 15, 13, 13, 12, 12, 12, 32, 31, 29, 28, 26, 23, 22, 20, 17, 16, 14, 13, + 13, 13, 12, 12, 30, 30, 28, 27, 24, 21, 20, 19, 16, 15, 14, 13, 12, 13, + 12, 
12, 28, 30, 28, 26, 21, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, + 27, 28, 26, 25, 21, 18, 18, 16, 14, 13, 13, 12, 12, 12, 11, 11, 26, 28, + 26, 24, 20, 18, 17, 16, 14, 13, 12, 11, 11, 11, 11, 11, 23, 25, 24, 23, + 19, 16, 16, 14, 13, 12, 11, 11, 11, 11, 11, 10, 22, 23, 23, 22, 18, 16, + 15, 14, 12, 11, 11, 10, 10, 10, 10, 10, 21, 22, 22, 21, 18, 15, 14, 13, + 12, 11, 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 14, 14, 12, 11, 10, + 10, 9, 9, 10, 9, 10, 18, 19, 19, 19, 16, 14, 13, 12, 10, 10, 9, 9, 9, 9, + 9, 9, 17, 18, 18, 18, 16, 13, 13, 12, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, + 17, 17, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 12, + 12, 11, 9, 9, 8, 8, 8, 8, 8, 8, 13, 15, 15, 15, 13, 12, 11, 10, 9, 8, 8, + 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 11, 10, 9, 8, 8, 7, 7, 7, 7, 8, + 12, 14, 14, 14, 13, 11, 11, 10, 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, + 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 10, 9, 8, + 8, 7, 7, 7, 7, 7, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, + 6, 11, 12, 12, 12, 11, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, 10, 12, 12, + 12, 11, 11, 9, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 9, 9, + 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, + 6, 6, 10, 10, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 9, 10, 10, + 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 9, 9, 8, 8, + 7, 7, 6, 6, 5, 5, 5, + /* Size 4x16 */ + 33, 32, 32, 31, 30, 28, 23, 21, 18, 16, 14, 13, 12, 12, 11, 10, 23, 25, + 24, 23, 19, 18, 16, 14, 13, 12, 11, 11, 11, 11, 10, 9, 14, 15, 16, 16, + 14, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 7, 11, 12, 12, 13, 12, 11, 10, 10, + 9, 8, 7, 7, 6, 6, 6, 6, + /* Size 16x4 */ + 33, 23, 14, 11, 32, 25, 15, 12, 32, 24, 16, 12, 31, 23, 16, 13, 30, 19, + 14, 12, 28, 18, 13, 11, 23, 16, 11, 10, 21, 14, 10, 10, 18, 13, 10, 9, + 16, 12, 9, 8, 14, 11, 8, 7, 13, 11, 8, 7, 12, 11, 8, 6, 12, 11, 8, 6, + 11, 10, 8, 6, 10, 9, 7, 6, + /* Size 8x32 */ + 32, 33, 33, 33, 32, 32, 
32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 32, 32, 32, 31, + 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 28, 29, 29, 29, 28, 27, 26, 24, + 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 11, 10, 10, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 9, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, + 14, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, + 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, + 6, 6, 6, 6, 5, 5, 5, + /* Size 32x8 */ + 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 33, 32, + 29, 23, 17, 14, 12, 11, 33, 31, 29, 23, 17, 14, 12, 12, 32, 30, 28, 23, + 17, 14, 13, 12, 32, 29, 27, 23, 18, 15, 13, 12, 32, 29, 26, 22, 17, 14, + 13, 12, 30, 28, 24, 20, 16, 14, 12, 12, 28, 28, 21, 18, 15, 13, 12, 12, + 27, 26, 21, 18, 14, 13, 12, 11, 26, 26, 20, 17, 14, 12, 11, 11, 23, 24, + 19, 16, 13, 11, 11, 11, 22, 23, 18, 15, 12, 11, 10, 10, 21, 22, 18, 14, + 12, 11, 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 18, 19, 16, 13, 10, 9, 9, + 9, 17, 18, 16, 13, 10, 9, 9, 9, 16, 17, 15, 12, 10, 9, 8, 8, 14, 16, 14, + 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, + 12, 14, 13, 11, 8, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 7, 12, 13, 12, 10, + 8, 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 11, 12, 11, 10, 9, 7, 6, 6, 10, + 12, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, + 6, 6, 10, 11, 11, 9, 8, 7, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, + 8, 7, 6, 5 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 23, 18, 16, 
23, 18, 16, 15, 18, 16, 12, 12, 16, 15, 12, 10, + /* Size 8x8 */ + 33, 27, 22, 21, 19, 17, 16, 15, 27, 22, 22, 22, 20, 19, 17, 16, 22, 22, + 19, 19, 18, 16, 16, 16, 21, 22, 19, 17, 15, 14, 14, 14, 19, 20, 18, 15, + 13, 12, 12, 12, 17, 19, 16, 14, 12, 11, 11, 11, 16, 17, 16, 14, 12, 11, + 10, 10, 15, 16, 16, 14, 12, 11, 10, 9, + /* Size 16x16 */ + 32, 34, 31, 27, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 34, 33, + 29, 25, 22, 22, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 31, 29, 26, 23, + 22, 22, 22, 22, 20, 19, 18, 18, 17, 17, 16, 15, 27, 25, 23, 22, 21, 21, + 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 21, 22, 22, 21, 19, 19, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 21, 22, 22, 21, 19, 19, 18, 18, 17, 17, + 16, 16, 15, 16, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 14, 20, 21, 22, 21, 19, 18, 16, 16, 15, 14, 14, 13, 14, 13, + 13, 13, 19, 20, 20, 20, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, + 17, 19, 19, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, + 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 12, 11, 16, 17, 18, 18, + 17, 16, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 15, + 14, 14, 13, 12, 11, 11, 10, 10, 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, + 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, 13, 12, + 12, 11, 10, 10, 10, 9, 14, 15, 15, 16, 16, 15, 14, 13, 13, 12, 11, 11, + 10, 10, 9, 9, + /* Size 32x32 */ + 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 33, 32, + 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 17, 17, 17, + 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 34, 33, 33, 32, 29, 26, 25, 24, + 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 14, 33, 32, 32, 31, 28, 26, 25, 24, 22, 22, 23, 23, + 22, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, + 15, 15, 31, 30, 29, 28, 26, 24, 23, 23, 22, 22, 22, 23, 22, 
22, 22, 21, + 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 28, 27, + 26, 26, 24, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 27, 26, 25, 25, 23, 22, + 22, 21, 21, 21, 21, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, + 21, 21, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, + 17, 16, 16, 16, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 17, 17, 16, 16, 16, 16, 16, 16, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, + 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, 22, 23, + 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 16, 16, 15, 15, 15, 15, 15, 21, 22, 23, 23, 23, 23, 22, 21, + 19, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, + 21, 22, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, + 14, 14, 13, 13, 14, 13, 13, 14, 13, 13, 13, 14, 19, 20, 20, 21, 21, 21, + 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 19, 19, 20, 20, 20, 21, 20, 20, 18, 18, + 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 17, 18, 19, 19, 19, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 17, 18, 18, + 19, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, 12, 
12, 12, 12, + 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, + 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 12, 11, 11, 12, 16, 17, 17, 18, 18, 19, 18, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 16, 16, 17, 17, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, + 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, + 17, 17, 17, 18, 18, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 11, 10, 10, 11, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, + 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, + 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 14, 15, 15, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, 15, 16, + 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 14, 15, 15, 16, 16, 16, 16, 16, + 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 9, 9, 14, 15, 15, 15, 15, 16, 16, 16, 16, 15, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 14, 14, 14, 15, 15, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, + 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + /* Size 4x8 */ + 33, 26, 22, 22, 20, 17, 16, 16, 22, 23, 19, 17, 16, 15, 16, 15, 18, 20, + 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 11, 10, 10, + /* Size 8x4 */ + 33, 22, 18, 16, 26, 23, 20, 17, 22, 19, 17, 16, 22, 17, 15, 14, 20, 16, + 13, 13, 17, 15, 12, 11, 16, 16, 12, 10, 16, 15, 12, 10, + /* Size 8x16 */ + 32, 34, 31, 26, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 
29, 27, + 25, 22, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 16, 15, 21, 22, 22, 21, + 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 19, 18, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 18, 20, 20, 20, 18, 17, 16, 14, + 14, 13, 12, 12, 13, 13, 12, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, + 12, 11, 11, 12, 11, 12, 15, 16, 17, 18, 17, 16, 14, 14, 13, 12, 11, 11, + 11, 10, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, + 10, 10, + /* Size 16x8 */ + 32, 29, 21, 20, 18, 16, 15, 15, 34, 27, 22, 22, 20, 18, 16, 16, 31, 25, + 22, 22, 20, 18, 17, 16, 26, 22, 21, 22, 20, 19, 18, 17, 21, 21, 19, 19, + 18, 17, 17, 17, 21, 22, 19, 18, 17, 16, 16, 16, 20, 22, 19, 17, 16, 15, + 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 18, 16, 14, 13, 13, 13, + 17, 19, 18, 15, 13, 12, 12, 12, 16, 19, 17, 15, 12, 12, 11, 12, 16, 18, + 17, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, 11, 11, 15, 17, 16, 14, + 13, 12, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 13, 12, + 10, 10, + /* Size 16x32 */ + 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 32, 31, + 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 29, 28, 27, 27, 25, 23, 22, 22, + 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 15, 15, 27, 26, 26, 25, 23, 22, 22, 21, 21, 21, 21, 22, + 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, + 16, 16, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, + 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 15, 16, 16, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, + 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, + 18, 17, 
16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 14, + 14, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, + 17, 18, 19, 19, 19, 20, 19, 18, 17, 17, 17, 16, 15, 15, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 17, 17, 17, 18, 18, 17, + 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, + 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 17, + 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 10, 10, 10, 10, 10, 10, 9, + /* Size 32x16 */ + 32, 33, 29, 27, 21, 21, 20, 20, 18, 17, 16, 15, 15, 15, 15, 14, 33, 33, + 28, 26, 22, 22, 21, 20, 19, 18, 17, 16, 16, 16, 16, 15, 34, 32, 27, 26, + 22, 23, 22, 21, 20, 19, 18, 17, 16, 16, 16, 15, 33, 31, 27, 25, 22, 23, + 22, 21, 20, 19, 18, 17, 17, 17, 16, 16, 31, 28, 25, 23, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 17, 16, 16, 28, 26, 23, 22, 22, 23, 22, 22, 20, 20, + 19, 18, 17, 17, 17, 17, 26, 25, 22, 22, 21, 22, 22, 21, 20, 19, 19, 18, + 18, 17, 17, 17, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 18, 17, 17, 17, + 17, 17, 21, 22, 21, 21, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 17, 17, + 21, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 22, + 22, 21, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, + 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 16, 15, 20, 22, 22, 21, 19, 17, + 17, 16, 16, 15, 15, 14, 14, 15, 
15, 15, 20, 22, 22, 21, 19, 17, 17, 16, + 15, 15, 14, 14, 14, 14, 15, 14, 20, 21, 22, 21, 19, 17, 16, 16, 14, 14, + 14, 13, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 16, 15, 14, 13, 13, 13, + 13, 13, 14, 14, 19, 20, 21, 20, 18, 16, 16, 15, 14, 13, 13, 13, 13, 13, + 13, 14, 18, 20, 20, 20, 18, 16, 16, 15, 13, 13, 12, 12, 12, 13, 13, 13, + 17, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 13, 17, 18, + 19, 19, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, 19, 18, + 17, 15, 15, 14, 12, 12, 12, 11, 11, 12, 12, 12, 16, 17, 18, 18, 17, 15, + 14, 14, 12, 12, 11, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 15, 14, 13, + 12, 12, 11, 11, 11, 11, 11, 12, 15, 17, 17, 18, 16, 15, 14, 13, 12, 12, + 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 13, 12, 11, 11, + 11, 10, 11, 11, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 11, 11, 10, 10, + 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, + 14, 16, 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, + 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, 16, 16, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 15, + 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 14, 14, 13, + 13, 12, 12, 11, 11, 10, 10, 9, + /* Size 4x16 */ + 33, 32, 28, 25, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 15, 21, 23, + 22, 22, 19, 18, 17, 17, 16, 16, 15, 15, 15, 16, 15, 15, 17, 19, 19, 19, + 17, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 16, + 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, + /* Size 16x4 */ + 33, 21, 17, 15, 32, 23, 19, 16, 28, 22, 19, 17, 25, 22, 19, 17, 22, 19, + 17, 17, 22, 18, 17, 16, 22, 17, 15, 15, 21, 17, 14, 14, 20, 16, 13, 13, + 19, 16, 12, 12, 18, 15, 12, 12, 17, 15, 12, 11, 17, 15, 12, 10, 16, 16, + 12, 10, 16, 15, 12, 10, 15, 15, 12, 10, + /* Size 8x32 */ + 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 29, 28, 27, 27, + 25, 23, 
22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 21, 22, 22, 22, 22, 22, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, + 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, 16, 17, + 18, 18, 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, + 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 11, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, + 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 10, 10, 10, + /* Size 32x8 */ + 32, 29, 21, 20, 18, 16, 15, 15, 33, 28, 22, 21, 19, 17, 16, 16, 34, 27, + 22, 22, 20, 18, 16, 16, 33, 27, 22, 22, 20, 18, 17, 16, 31, 25, 22, 22, + 20, 18, 17, 16, 28, 23, 22, 22, 20, 19, 17, 17, 26, 22, 21, 22, 20, 19, + 18, 17, 24, 22, 20, 20, 19, 18, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17, + 21, 22, 19, 19, 18, 17, 16, 16, 21, 22, 19, 18, 17, 16, 16, 16, 21, 23, + 19, 17, 16, 15, 15, 16, 20, 22, 19, 17, 16, 15, 14, 15, 20, 22, 19, 17, + 15, 14, 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 19, 16, 14, 13, + 13, 14, 19, 21, 18, 16, 14, 13, 13, 13, 18, 20, 18, 16, 13, 12, 12, 13, + 17, 19, 18, 15, 13, 12, 12, 12, 17, 19, 17, 15, 13, 12, 12, 12, 16, 19, + 17, 15, 12, 12, 11, 12, 16, 18, 17, 14, 12, 11, 11, 12, 16, 18, 17, 14, + 12, 11, 11, 11, 15, 17, 16, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, + 11, 11, 15, 17, 16, 14, 13, 11, 10, 10, 15, 17, 16, 14, 13, 12, 10, 10, + 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, + 16, 14, 13, 11, 10, 10, 14, 15, 16, 14, 13, 12, 10, 10, 14, 15, 16, 14, + 13, 12, 11, 10 }, + }, + { + { /* Luma */ + /* 
Size 4x4 */ + 32, 27, 16, 12, 27, 18, 13, 11, 16, 13, 9, 8, 12, 11, 8, 6, + /* Size 8x8 */ + 32, 32, 29, 22, 18, 13, 12, 11, 32, 30, 28, 23, 19, 15, 13, 11, 29, 28, + 21, 18, 16, 13, 12, 11, 22, 23, 18, 15, 13, 11, 10, 10, 18, 19, 16, 13, + 11, 9, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 6, 11, + 11, 11, 10, 8, 7, 6, 6, + /* Size 16x16 */ + 32, 33, 33, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, + 32, 32, 30, 27, 25, 22, 19, 17, 16, 14, 13, 12, 11, 10, 33, 32, 31, 30, + 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 32, 32, 30, 29, 28, 26, + 24, 22, 20, 18, 16, 14, 14, 13, 12, 11, 30, 30, 28, 28, 24, 22, 20, 19, + 17, 16, 15, 13, 12, 12, 12, 11, 26, 27, 26, 26, 22, 19, 18, 17, 15, 14, + 13, 12, 11, 11, 11, 10, 23, 25, 24, 24, 20, 18, 16, 15, 14, 13, 12, 11, + 11, 10, 10, 10, 21, 22, 22, 22, 19, 17, 15, 14, 13, 12, 11, 10, 10, 10, + 9, 9, 18, 19, 19, 20, 17, 15, 14, 13, 11, 11, 10, 9, 9, 9, 9, 8, 16, 17, + 17, 18, 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, 16, 15, + 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, + 9, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 12, 11, 11, 10, 9, 8, 8, 7, 7, 7, + 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 10, 11, + 12, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 6, 6, 6, 10, 10, 11, 11, 11, 10, + 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 32, 30, 30, 28, 26, 25, 23, 21, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32, + 32, 32, 32, 30, 30, 29, 27, 26, 24, 22, 22, 20, 19, 18, 17, 16, 15, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 27, 26, 25, 23, 22, 20, 19, 19, 17, 16, 16, 14, 14, 13, 13, 12, + 12, 12, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 28, 27, + 25, 23, 23, 21, 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 33, 32, 32, 32, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, + 19, 19, 17, 16, 16, 14, 14, 14, 13, 
13, 12, 12, 12, 11, 11, 11, 32, 32, + 32, 32, 31, 30, 30, 28, 28, 28, 26, 26, 24, 23, 22, 21, 19, 19, 18, 17, + 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 30, 30, + 29, 28, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, + 14, 13, 13, 12, 12, 12, 11, 11, 30, 30, 31, 31, 29, 28, 28, 26, 25, 24, + 23, 22, 22, 20, 20, 19, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 12, 11, 11, 30, 30, 30, 30, 28, 28, 28, 25, 24, 23, 22, 21, 20, 19, + 19, 18, 17, 17, 16, 15, 15, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, + 28, 29, 30, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 26, 27, 27, 28, + 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, + 12, 12, 11, 12, 11, 11, 11, 11, 10, 10, 25, 26, 26, 27, 26, 26, 25, 22, + 21, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, 24, 22, 20, 19, 18, 17, + 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 21, 22, 23, 23, 23, 23, 23, 20, 19, 18, 17, 17, 16, 15, 14, 13, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 21, 22, + 22, 23, 22, 22, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 20, 21, 20, 21, 21, + 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, + 9, 9, 9, 9, 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, + 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 9, 17, 18, + 19, 19, 19, 19, 19, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 8, 9, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 18, 16, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 15, 16, 16, 16, 16, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, + 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, + 16, 16, 15, 15, 14, 13, 13, 12, 11, 11, 
11, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 8, 8, 7, 8, 7, 7, 7, 13, 13, 14, 14, 14, 15, 15, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, + 14, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 13, 13, 13, 12, + 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, + 12, 13, 13, 13, 13, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, + 12, 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, + 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 6, 6, 10, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, + 11, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, + 6, 6, 6, 6, 6, 5, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 10, 10, 10, 11, + 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, + 6, 6, 6, 6, 5, 5, 5, + /* Size 4x8 */ + 32, 32, 30, 23, 19, 14, 12, 11, 27, 26, 20, 17, 15, 12, 12, 10, 17, 18, + 15, 12, 10, 9, 8, 8, 12, 13, 12, 10, 9, 8, 7, 6, + /* Size 8x4 */ + 32, 27, 17, 12, 32, 26, 18, 13, 30, 20, 15, 12, 23, 17, 12, 10, 19, 15, + 10, 9, 14, 12, 9, 8, 12, 12, 8, 7, 11, 10, 8, 6, + /* Size 8x16 */ + 32, 33, 32, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 32, 32, + 31, 30, 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 28, 29, 28, 27, + 23, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 11, 23, 25, 24, 24, 20, 18, + 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 18, 19, 19, 20, 17, 15, 14, 13, + 11, 11, 10, 9, 9, 9, 8, 9, 13, 14, 14, 15, 14, 12, 11, 11, 9, 9, 8, 8, + 7, 8, 7, 7, 12, 
13, 13, 13, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, 11, + 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 16x8 */ + 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 25, 19, 14, 13, 12, 32, 31, + 28, 24, 19, 14, 13, 12, 32, 30, 27, 24, 20, 15, 13, 12, 30, 28, 23, 20, + 17, 14, 13, 12, 26, 26, 20, 18, 15, 12, 12, 11, 23, 24, 19, 16, 14, 11, + 11, 11, 21, 22, 18, 15, 13, 11, 10, 10, 18, 19, 16, 14, 11, 9, 9, 9, 16, + 17, 15, 13, 11, 9, 8, 8, 14, 16, 14, 12, 10, 8, 8, 8, 13, 14, 13, 11, 9, + 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 12, 12, 10, 9, 8, 7, 6, 10, 12, + 12, 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, + /* Size 16x32 */ + 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32, + 32, 32, 31, 30, 30, 30, 28, 27, 25, 23, 22, 21, 19, 19, 17, 16, 16, 14, + 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 31, 30, 30, 29, + 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 26, 25, + 24, 22, 22, 20, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, + 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 26, 27, + 27, 28, 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, + 13, 12, 12, 12, 11, 12, 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, + 24, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 10, 19, 20, 20, 21, 20, 21, 21, 19, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 10, 10, 9, + 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 16, 17, 17, 17, 17, + 18, 18, 16, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 12, 12, + 11, 11, 11, 10, 9, 
9, 9, 8, 8, 8, 8, 8, 7, 8, 8, 7, 7, 7, 7, 8, 13, 13, + 14, 14, 14, 14, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, + 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, + 11, 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, + 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 9, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, + /* Size 32x16 */ + 32, 33, 32, 32, 28, 26, 23, 19, 18, 16, 13, 13, 12, 11, 11, 11, 33, 32, + 32, 32, 29, 27, 24, 20, 19, 17, 14, 13, 12, 12, 12, 11, 33, 32, 32, 32, + 29, 27, 25, 20, 19, 17, 14, 14, 13, 12, 12, 11, 33, 32, 32, 31, 30, 28, + 25, 21, 19, 17, 14, 14, 13, 12, 12, 12, 32, 32, 31, 30, 28, 26, 24, 20, + 19, 17, 14, 14, 13, 13, 12, 12, 32, 32, 30, 30, 28, 26, 24, 21, 19, 18, + 15, 14, 13, 13, 12, 12, 32, 31, 30, 29, 27, 26, 24, 21, 20, 18, 15, 15, + 13, 13, 12, 12, 30, 30, 29, 28, 24, 23, 21, 19, 18, 16, 14, 14, 13, 13, + 13, 12, 30, 30, 28, 28, 23, 22, 20, 18, 17, 16, 14, 13, 13, 12, 12, 12, + 28, 30, 28, 27, 21, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 12, 26, 28, + 26, 26, 20, 19, 18, 16, 15, 14, 12, 12, 12, 12, 11, 12, 26, 27, 26, 25, + 20, 19, 17, 15, 15, 14, 12, 12, 11, 11, 11, 11, 23, 25, 24, 24, 19, 18, + 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 22, 23, 23, 22, 18, 17, 16, 14, + 13, 12, 11, 11, 10, 10, 10, 10, 21, 22, 22, 22, 18, 17, 15, 13, 13, 12, + 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 16, 14, 12, 12, 11, 10, 10, + 9, 9, 10, 9, 18, 19, 19, 19, 16, 15, 14, 12, 11, 11, 9, 9, 9, 9, 9, 9, + 17, 19, 19, 19, 16, 15, 14, 12, 11, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, + 18, 15, 14, 13, 11, 11, 10, 9, 9, 8, 8, 8, 9, 15, 16, 17, 17, 14, 13, + 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 13, 12, 11, 10, 9, + 8, 8, 8, 8, 8, 8, 13, 14, 14, 15, 
13, 12, 11, 10, 9, 9, 8, 8, 7, 8, 8, + 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 7, 12, 14, 14, + 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 11, + 9, 9, 8, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 12, 10, 9, 9, 8, 8, 7, 7, + 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 11, 12, + 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 7, 6, 6, 6, 10, 12, 12, 12, 12, 11, + 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 10, 9, 9, 8, 7, + 7, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, + 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, + /* Size 4x16 */ + 33, 32, 32, 31, 30, 28, 25, 22, 19, 17, 16, 14, 13, 12, 12, 11, 26, 27, + 26, 26, 22, 19, 18, 17, 15, 14, 13, 12, 11, 11, 11, 10, 16, 17, 17, 18, + 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 11, 12, 13, 13, 12, 12, 11, + 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 16x4 */ + 33, 26, 16, 11, 32, 27, 17, 12, 32, 26, 17, 13, 31, 26, 18, 13, 30, 22, + 16, 12, 28, 19, 14, 12, 25, 18, 13, 11, 22, 17, 12, 10, 19, 15, 11, 9, + 17, 14, 10, 8, 16, 13, 9, 8, 14, 12, 9, 7, 13, 11, 8, 7, 12, 11, 8, 6, + 12, 11, 8, 6, 11, 10, 8, 6, + /* Size 8x32 */ + 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 32, 32, 32, 32, + 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, + 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, 24, 21, 20, 19, 18, 17, + 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 13, 14, 14, 14, 14, + 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, + 8, 8, 7, 7, 7, 7, 8, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, + 10, 10, 9, 9, 9, 
8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, + 12, 12, 12, 12, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, + 7, 7, 6, 6, 6, 6, 6, 6, 6, + /* Size 32x8 */ + 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 24, 19, 14, 12, 12, 33, 32, + 29, 25, 19, 14, 13, 12, 33, 32, 30, 25, 19, 14, 13, 12, 32, 31, 28, 24, + 19, 14, 13, 12, 32, 30, 28, 24, 19, 15, 13, 12, 32, 30, 27, 24, 20, 15, + 13, 12, 30, 29, 24, 21, 18, 14, 13, 13, 30, 28, 23, 20, 17, 14, 13, 12, + 28, 28, 21, 19, 16, 13, 12, 12, 26, 26, 20, 18, 15, 12, 12, 11, 26, 26, + 20, 17, 15, 12, 11, 11, 23, 24, 19, 16, 14, 11, 11, 11, 22, 23, 18, 16, + 13, 11, 10, 10, 21, 22, 18, 15, 13, 11, 10, 10, 19, 20, 17, 14, 12, 10, + 9, 10, 18, 19, 16, 14, 11, 9, 9, 9, 17, 19, 16, 14, 11, 9, 9, 9, 16, 17, + 15, 13, 11, 9, 8, 8, 15, 17, 14, 12, 10, 8, 8, 8, 14, 16, 14, 12, 10, 8, + 8, 8, 13, 14, 13, 11, 9, 8, 7, 8, 13, 14, 13, 11, 9, 8, 7, 7, 12, 14, + 13, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 13, 12, 10, 9, 8, 7, + 6, 11, 12, 12, 10, 9, 8, 7, 6, 11, 12, 12, 10, 9, 7, 7, 6, 10, 12, 12, + 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, + 10, 11, 11, 10, 9, 8, 7, 6 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 23, 19, 16, 23, 19, 17, 15, 19, 17, 13, 12, 16, 15, 12, 10, + /* Size 8x8 */ + 33, 28, 22, 21, 20, 17, 16, 15, 28, 24, 22, 22, 21, 19, 17, 16, 22, 22, + 19, 19, 19, 17, 16, 16, 21, 22, 19, 17, 16, 15, 14, 14, 20, 21, 19, 16, + 14, 13, 13, 13, 17, 19, 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 12, + 11, 10, 15, 16, 16, 14, 13, 12, 10, 10, + /* Size 16x16 */ + 32, 34, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 15, 15, 15, 14, 34, 33, + 29, 26, 23, 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 15, 31, 29, 26, 24, + 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 28, 26, 24, 22, 22, 22, + 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 23, 23, 22, 22, 20, 20, 20, 20, + 19, 19, 18, 17, 17, 17, 16, 17, 21, 22, 22, 22, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 16, 16, 16, 21, 22, 23, 23, 20, 19, 18, 17, 
17, 16, 16, 15, + 15, 15, 15, 15, 20, 22, 22, 22, 20, 18, 17, 17, 16, 15, 15, 14, 14, 14, + 14, 14, 19, 20, 21, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, + 18, 19, 20, 20, 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 19, + 19, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 16, 17, 18, 19, + 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, 16, 17, 16, 16, 15, 14, 13, 12, + 12, 11, 11, 10, 10, 10, 14, 15, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, + 11, 10, 10, 10, + /* Size 32x32 */ + 32, 33, 34, 34, 31, 29, 28, 25, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 33, 33, 33, 33, + 30, 28, 27, 24, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 34, 33, 33, 33, 29, 28, 26, 24, + 23, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 34, 33, 33, 32, 29, 28, 26, 24, 23, 22, 23, 23, + 23, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 31, 30, 29, 29, 26, 25, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, + 21, 21, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 29, 28, + 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 28, 27, 26, 26, 24, 23, + 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 25, 24, 24, 24, 23, 22, 22, 21, 21, 20, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 17, 17, 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 17, 17, 17, + 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, + 22, 22, 22, 21, 20, 
19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21, + 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 21, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 20, 21, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 21, + 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, + 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, + 22, 20, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, + 13, 13, 14, 13, 13, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, + 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 20, 19, 19, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 17, 18, 19, 19, + 19, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 19, 19, 19, 19, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 12, 11, 12, 11, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, + 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, + 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, + 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 
13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, + 10, 10, 11, 10, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, + 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 15, 15, 16, 16, + 16, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, + 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, + /* Size 4x8 */ + 33, 27, 22, 22, 20, 18, 17, 16, 22, 22, 19, 18, 17, 16, 16, 15, 19, 20, + 18, 16, 14, 12, 12, 12, 16, 17, 17, 14, 13, 12, 11, 10, + /* Size 8x4 */ + 33, 22, 19, 16, 27, 22, 20, 17, 22, 19, 18, 17, 22, 18, 16, 14, 20, 17, + 14, 13, 18, 16, 12, 12, 17, 16, 12, 11, 16, 15, 12, 10, + /* Size 8x16 */ + 32, 33, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 30, 28, + 26, 23, 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, 22, 22, + 20, 19, 19, 19, 19, 18, 18, 17, 16, 16, 16, 16, 21, 22, 22, 23, 20, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 15, 15, 19, 20, 21, 21, 19, 18, 17, 16, + 15, 14, 14, 13, 13, 13, 13, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, + 12, 12, 11, 12, 12, 12, 15, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 10, + 10, 10, + /* Size 16x8 */ + 32, 30, 21, 21, 19, 16, 15, 15, 33, 28, 22, 22, 20, 18, 17, 16, 31, 26, + 22, 22, 21, 18, 17, 17, 28, 23, 22, 23, 21, 19, 18, 17, 23, 22, 20, 20, + 19, 17, 17, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 23, 19, 18, 17, 15, + 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 
15, 13, 13, 13, + 18, 20, 18, 16, 14, 12, 12, 13, 17, 19, 18, 16, 14, 12, 12, 12, 16, 18, + 17, 15, 13, 12, 11, 12, 16, 17, 16, 15, 13, 11, 11, 11, 15, 17, 16, 14, + 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, + 11, 10, + /* Size 16x32 */ + 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 33, 33, 32, 32, + 28, 27, 26, 24, 23, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 30, 29, 28, 28, 26, 24, 23, 23, + 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, + 17, 17, 16, 16, 16, 16, 28, 27, 26, 26, 24, 23, 22, 22, 22, 21, 22, 22, + 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 17, 16, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, + 22, 23, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 21, 22, 22, 23, 22, 23, + 23, 21, 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, + 15, 15, 14, 14, 15, 15, 15, 15, 20, 20, 21, 21, 22, 22, 22, 20, 20, 19, + 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, 14, 14, + 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, + 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, + 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 15, 16, 16, 17, 17, 
17, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 15, 16, + 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 10, 10, 10, + /* Size 32x16 */ + 32, 33, 30, 28, 21, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 15, 33, 33, + 29, 27, 22, 22, 22, 20, 20, 19, 17, 17, 16, 16, 16, 16, 33, 32, 28, 26, + 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 16, 34, 32, 28, 26, 22, 23, + 23, 21, 21, 20, 18, 18, 17, 17, 17, 16, 31, 28, 26, 24, 22, 22, 22, 22, + 21, 20, 18, 18, 17, 17, 17, 16, 29, 27, 24, 23, 22, 22, 23, 22, 21, 20, + 19, 18, 18, 17, 17, 17, 28, 26, 23, 22, 22, 22, 23, 22, 21, 20, 19, 19, + 18, 18, 17, 17, 24, 24, 23, 22, 20, 20, 21, 20, 20, 19, 18, 18, 17, 18, + 17, 17, 23, 23, 22, 22, 20, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, 17, + 21, 22, 22, 21, 19, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 16, 21, 22, + 22, 22, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 16, 16, 21, 23, 22, 22, + 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, 19, 18, + 18, 17, 17, 16, 15, 15, 15, 15, 15, 16, 20, 22, 22, 22, 19, 18, 17, 16, + 16, 16, 15, 14, 15, 14, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, + 14, 14, 14, 14, 14, 15, 20, 21, 22, 22, 19, 18, 17, 16, 15, 14, 14, 14, + 13, 14, 14, 14, 19, 21, 21, 21, 19, 18, 17, 15, 15, 14, 13, 13, 13, 13, + 13, 14, 19, 20, 21, 21, 19, 17, 17, 15, 15, 14, 13, 13, 13, 13, 13, 13, + 18, 20, 20, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 13, 13, 17, 19, + 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 17, 19, 19, 20, + 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 16, 18, 18, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 16, 15, 14, 13, 12, + 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 16, 16, 
15, 13, 13, 12, 11, 11, + 11, 11, 11, 11, 15, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, + 11, 11, 15, 17, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, 10, 11, + 15, 16, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, + 16, 17, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, + 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, 16, 15, + 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 16, 16, 15, 15, 13, + 13, 12, 12, 11, 11, 10, 10, 10, + /* Size 4x16 */ + 33, 32, 28, 26, 23, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, + 22, 22, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 18, 19, 20, 20, + 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 13, 12, 15, 16, 17, 18, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 11, 10, 10, + /* Size 16x4 */ + 33, 21, 18, 15, 32, 22, 19, 16, 28, 22, 20, 17, 26, 22, 20, 18, 23, 20, + 19, 17, 22, 19, 17, 16, 23, 18, 16, 15, 22, 18, 15, 14, 21, 18, 14, 13, + 20, 17, 13, 12, 19, 17, 13, 12, 18, 16, 12, 11, 17, 16, 12, 11, 17, 16, + 12, 11, 16, 16, 13, 10, 16, 15, 12, 10, + /* Size 8x32 */ + 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 30, 29, 28, 28, + 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, + 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 23, 23, 21, 20, 19, 18, 18, + 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15, + 15, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, + 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 16, 17, + 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 18, + 18, 17, 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 
11, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 10, 10, + /* Size 32x8 */ + 32, 30, 21, 21, 19, 16, 15, 15, 33, 29, 22, 22, 20, 17, 16, 16, 33, 28, + 22, 22, 20, 18, 17, 16, 34, 28, 22, 23, 21, 18, 17, 17, 31, 26, 22, 22, + 21, 18, 17, 17, 29, 24, 22, 23, 21, 19, 18, 17, 28, 23, 22, 23, 21, 19, + 18, 17, 24, 23, 20, 21, 20, 18, 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, + 21, 22, 19, 19, 19, 17, 16, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 22, + 19, 18, 17, 16, 16, 16, 21, 23, 19, 18, 17, 15, 15, 15, 20, 22, 19, 17, + 16, 15, 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 20, 22, 19, 17, 15, 14, + 13, 14, 19, 21, 19, 17, 15, 13, 13, 13, 19, 21, 19, 17, 15, 13, 13, 13, + 18, 20, 18, 16, 14, 12, 12, 13, 17, 20, 18, 16, 14, 12, 12, 12, 17, 19, + 18, 16, 14, 12, 12, 12, 16, 18, 17, 15, 13, 12, 11, 12, 16, 18, 17, 15, + 13, 12, 11, 12, 16, 18, 17, 15, 13, 11, 11, 11, 16, 17, 16, 15, 13, 11, + 11, 11, 15, 17, 16, 15, 13, 12, 11, 11, 15, 17, 16, 14, 13, 12, 11, 10, + 15, 17, 16, 14, 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, + 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, + 13, 12, 11, 10 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 28, 18, 13, 28, 19, 14, 11, 18, 14, 10, 8, 13, 11, 8, 7, + /* Size 8x8 */ + 32, 32, 29, 24, 19, 15, 13, 11, 32, 31, 28, 24, 20, 16, 14, 12, 29, 28, + 22, 20, 17, 14, 13, 12, 24, 24, 20, 16, 14, 12, 11, 10, 19, 20, 17, 14, + 12, 10, 9, 9, 15, 16, 14, 12, 10, 9, 8, 8, 13, 14, 13, 11, 9, 8, 7, 7, + 11, 12, 12, 10, 9, 8, 7, 6, + /* Size 16x16 */ + 32, 33, 33, 32, 30, 28, 25, 22, 19, 17, 16, 14, 12, 12, 11, 11, 33, 32, + 32, 32, 30, 29, 26, 23, 20, 19, 17, 15, 13, 13, 12, 11, 33, 32, 31, 31, + 29, 28, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 32, 32, 31, 29, 28, 27, + 25, 23, 21, 19, 18, 16, 14, 14, 13, 12, 30, 30, 29, 28, 26, 24, 22, 20, + 19, 18, 16, 15, 13, 13, 12, 12, 28, 29, 28, 27, 24, 21, 20, 18, 17, 16, + 15, 14, 13, 12, 11, 
11, 25, 26, 26, 25, 22, 20, 18, 17, 15, 14, 14, 12, + 12, 11, 11, 11, 22, 23, 23, 23, 20, 18, 17, 15, 14, 13, 12, 11, 11, 10, + 10, 10, 19, 20, 21, 21, 19, 17, 15, 14, 12, 12, 11, 10, 10, 9, 9, 9, 17, + 19, 19, 19, 18, 16, 14, 13, 12, 11, 10, 10, 9, 9, 9, 8, 16, 17, 17, 18, + 16, 15, 14, 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 12, + 11, 10, 10, 9, 8, 8, 8, 7, 7, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, + 8, 7, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, + 11, 12, 12, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 11, 11, 12, 12, + 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 18, + 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 11, 11, 10, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 19, 18, 17, 17, 15, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 29, 27, 26, 24, 23, 23, 20, 20, 19, 17, 17, 15, 15, 14, 13, 13, + 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, + 27, 25, 23, 23, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, + 12, 11, 33, 32, 32, 32, 31, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 23, + 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 33, 32, + 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 32, 32, 32, 32, 31, 30, + 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 18, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, + 26, 24, 24, 23, 22, 22, 20, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, + 13, 12, 12, 12, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, + 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, 12, + 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 17, 17, + 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 28, 29, 29, 30, + 28, 28, 27, 26, 24, 22, 21, 20, 20, 
19, 18, 18, 17, 17, 16, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 26, 27, 27, 28, 26, 26, 26, 24, + 23, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 11, 11, 25, 26, 26, 27, 26, 26, 25, 24, 22, 20, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 23, 24, 24, 25, 24, 24, 24, 23, 22, 20, 19, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 22, 23, + 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 21, 22, 23, 23, 23, 23, + 22, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, + 10, 10, 10, 10, 10, 10, 9, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, + 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, + 9, 9, 9, 18, 19, 20, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 17, 18, 19, 19, + 19, 19, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 16, 17, 17, 18, 18, 18, 18, 17, 17, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, + 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, + 16, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 8, 8, 8, 8, 7, 14, 14, 15, 15, 15, 15, 16, 15, 15, 14, 14, 13, + 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 13, + 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, + 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, + 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, + 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, + 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 
9, 9, 8, 8, 8, 8, 7, 7, 7, 7, + 7, 7, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, + 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, + 12, 12, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 7, 7, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, + 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + /* Size 4x8 */ + 32, 32, 30, 25, 20, 16, 14, 12, 29, 28, 22, 19, 17, 14, 13, 11, 17, 18, + 16, 13, 11, 9, 9, 9, 12, 13, 12, 11, 9, 8, 7, 7, + /* Size 8x4 */ + 32, 29, 17, 12, 32, 28, 18, 13, 30, 22, 16, 12, 25, 19, 13, 11, 20, 17, + 11, 9, 16, 14, 9, 8, 14, 13, 9, 7, 12, 11, 9, 7, + /* Size 8x16 */ + 32, 33, 33, 32, 30, 28, 25, 22, 19, 18, 16, 14, 12, 12, 11, 11, 33, 32, + 31, 30, 29, 28, 26, 23, 20, 19, 17, 15, 14, 13, 12, 12, 29, 30, 29, 28, + 25, 22, 21, 19, 18, 17, 16, 14, 13, 12, 12, 12, 23, 25, 24, 24, 21, 19, + 17, 16, 14, 14, 13, 12, 11, 11, 11, 11, 19, 20, 21, 21, 19, 17, 15, 14, + 12, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 18, 16, 15, 13, 12, 11, 10, 10, + 9, 9, 8, 8, 8, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, + 11, 12, 13, 13, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, + /* Size 16x8 */ + 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 30, 25, 20, 17, 13, 12, 33, 31, + 29, 24, 21, 17, 14, 13, 32, 30, 28, 24, 21, 18, 14, 13, 30, 29, 25, 21, + 19, 16, 13, 13, 28, 28, 22, 19, 17, 15, 13, 12, 25, 26, 21, 17, 15, 13, + 12, 11, 22, 23, 19, 16, 14, 12, 11, 10, 19, 20, 18, 14, 12, 11, 10, 9, + 18, 19, 17, 14, 12, 10, 9, 9, 16, 17, 16, 13, 11, 10, 9, 8, 14, 15, 14, + 12, 10, 9, 8, 8, 12, 14, 13, 11, 10, 9, 7, 7, 12, 13, 12, 11, 9, 8, 7, + 7, 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 
30, 29, 28, 26, 25, 23, 22, 21, 19, 19, + 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 21, 20, 19, 18, 17, 16, + 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 33, 32, 32, 32, 31, 31, 30, 30, + 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, + 25, 24, 23, 22, 20, 20, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 29, 29, 30, 30, 29, 28, 28, 26, 25, 23, 22, 21, 21, 20, 19, 19, + 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 28, 29, + 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, + 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, + 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 17, + 17, 18, 18, 18, 18, 18, 18, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, + 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 8, 8, 8, 8, 13, 14, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 8, 8, 7, 7, 12, 13, 13, 14, + 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 11, 12, + 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, + 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, + /* Size 32x16 */ + 32, 33, 33, 
32, 29, 28, 23, 22, 19, 17, 16, 13, 12, 12, 11, 11, 33, 32, + 32, 32, 29, 29, 24, 23, 20, 17, 17, 14, 13, 12, 12, 12, 33, 32, 32, 32, + 30, 29, 25, 23, 20, 18, 17, 14, 13, 12, 12, 12, 33, 32, 32, 31, 30, 30, + 25, 23, 21, 18, 17, 14, 14, 13, 12, 12, 33, 32, 31, 30, 29, 28, 24, 23, + 21, 18, 17, 14, 14, 13, 13, 12, 32, 32, 31, 30, 28, 28, 24, 23, 20, 18, + 17, 14, 14, 13, 13, 12, 32, 31, 30, 29, 28, 27, 24, 23, 21, 18, 18, 15, + 14, 13, 13, 12, 32, 31, 30, 28, 26, 26, 23, 22, 20, 18, 17, 14, 14, 13, + 13, 13, 30, 30, 29, 28, 25, 24, 21, 20, 19, 17, 16, 14, 13, 13, 13, 13, + 29, 30, 28, 27, 23, 22, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 28, 30, + 28, 27, 22, 21, 19, 18, 17, 16, 15, 13, 13, 12, 12, 12, 26, 28, 26, 26, + 21, 20, 18, 17, 16, 14, 14, 12, 12, 12, 12, 11, 25, 26, 26, 25, 21, 20, + 17, 17, 15, 14, 13, 12, 12, 11, 11, 11, 23, 25, 24, 24, 20, 19, 16, 16, + 14, 13, 13, 11, 11, 11, 11, 11, 22, 23, 23, 23, 19, 18, 16, 15, 14, 12, + 12, 11, 11, 10, 10, 10, 21, 23, 23, 22, 19, 18, 15, 15, 13, 12, 12, 11, + 10, 10, 10, 10, 19, 21, 20, 20, 18, 17, 14, 14, 12, 11, 11, 10, 10, 10, + 9, 10, 19, 20, 20, 20, 17, 17, 14, 13, 12, 11, 11, 10, 9, 9, 9, 9, 18, + 19, 19, 19, 17, 16, 14, 13, 12, 11, 10, 9, 9, 9, 9, 9, 16, 18, 18, 18, + 16, 15, 13, 12, 11, 10, 10, 9, 9, 9, 9, 8, 16, 17, 17, 18, 16, 15, 13, + 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 16, 16, 16, 14, 14, 12, 12, 11, 9, 9, + 8, 8, 8, 8, 8, 14, 15, 15, 16, 14, 14, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, + 13, 14, 14, 15, 13, 13, 11, 11, 10, 9, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, + 13, 13, 11, 11, 10, 9, 9, 8, 7, 7, 7, 7, 12, 14, 14, 14, 13, 13, 11, 11, + 10, 9, 8, 8, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, + 7, 7, 7, 12, 12, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 6, 11, 12, + 12, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, 11, 12, 12, 12, 12, 11, + 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 11, 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, + 7, 7, 6, 6, 6, 10, 11, 11, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 4x16 
*/ + 33, 32, 32, 31, 30, 30, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 28, 29, + 28, 27, 24, 21, 20, 18, 17, 16, 15, 14, 13, 12, 12, 11, 17, 18, 18, 18, + 17, 16, 14, 12, 11, 11, 10, 9, 9, 9, 9, 8, 12, 12, 13, 13, 13, 12, 11, + 10, 10, 9, 8, 8, 7, 7, 7, 6, + /* Size 16x4 */ + 33, 28, 17, 12, 32, 29, 18, 12, 32, 28, 18, 13, 31, 27, 18, 13, 30, 24, + 17, 13, 30, 21, 16, 12, 26, 20, 14, 11, 23, 18, 12, 10, 21, 17, 11, 10, + 19, 16, 11, 9, 17, 15, 10, 8, 15, 14, 9, 8, 14, 13, 9, 7, 13, 12, 9, 7, + 12, 12, 9, 7, 12, 11, 8, 6, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19, + 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32, + 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 29, 29, 30, 30, 29, 28, 28, 26, + 25, 23, 22, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, + 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, 14, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, + 17, 17, 17, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, + 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, + 11, 12, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, + 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, + /* Size 32x8 */ + 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 29, 24, 20, 17, 13, 12, 33, 32, + 30, 25, 20, 17, 13, 12, 33, 32, 30, 25, 21, 17, 14, 12, 33, 31, 29, 24, + 21, 17, 14, 13, 32, 31, 28, 24, 20, 17, 14, 13, 32, 30, 28, 24, 21, 18, + 14, 13, 32, 30, 26, 23, 20, 17, 14, 13, 30, 29, 25, 21, 19, 16, 13, 13, + 29, 28, 23, 20, 17, 15, 13, 12, 28, 28, 22, 19, 17, 15, 13, 12, 26, 26, + 21, 18, 16, 14, 12, 12, 25, 26, 21, 17, 15, 13, 12, 11, 23, 
24, 20, 16, + 14, 13, 11, 11, 22, 23, 19, 16, 14, 12, 11, 10, 21, 23, 19, 15, 13, 12, + 10, 10, 19, 20, 18, 14, 12, 11, 10, 9, 19, 20, 17, 14, 12, 11, 9, 9, 18, + 19, 17, 14, 12, 10, 9, 9, 16, 18, 16, 13, 11, 10, 9, 9, 16, 17, 16, 13, + 11, 10, 9, 8, 14, 16, 14, 12, 11, 9, 8, 8, 14, 15, 14, 12, 10, 9, 8, 8, + 13, 14, 13, 11, 10, 9, 8, 7, 12, 14, 13, 11, 10, 9, 7, 7, 12, 14, 13, + 11, 10, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, + 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, 11, 12, 12, 11, + 9, 8, 7, 6, 10, 11, 12, 11, 9, 8, 7, 6 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 23, 20, 17, 23, 19, 17, 16, 20, 17, 14, 13, 17, 16, 13, 11, + /* Size 8x8 */ + 33, 30, 22, 22, 20, 18, 17, 16, 30, 26, 22, 23, 21, 19, 18, 17, 22, 22, + 20, 20, 19, 18, 17, 17, 22, 23, 20, 18, 17, 16, 15, 15, 20, 21, 19, 17, + 15, 14, 13, 13, 18, 19, 18, 16, 14, 12, 12, 12, 17, 18, 17, 15, 13, 12, + 11, 11, 16, 17, 17, 15, 13, 12, 11, 10, + /* Size 16x16 */ + 32, 33, 31, 28, 25, 21, 21, 20, 20, 19, 18, 17, 16, 15, 15, 15, 33, 33, + 30, 26, 24, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 31, 30, 28, 24, + 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 28, 26, 24, 22, 22, 21, + 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 25, 24, 23, 22, 21, 20, 21, 20, + 20, 20, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 20, 19, 19, 19, 19, 19, + 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, + 16, 15, 15, 15, 20, 22, 22, 22, 20, 19, 18, 17, 16, 16, 16, 15, 15, 14, + 14, 14, 20, 21, 22, 22, 20, 19, 17, 16, 16, 15, 15, 14, 14, 13, 14, 14, + 19, 20, 21, 21, 20, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 19, + 20, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, + 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, + 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 15, 14, + 13, 13, 12, 12, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, + 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 14, 
13, 12, 12, + 11, 11, 10, 10, + /* Size 32x32 */ + 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 33, 33, 33, 33, + 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 33, 33, 33, 33, 30, 29, 26, 26, + 24, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 34, 33, 33, 32, 30, 29, 26, 25, 24, 23, 22, 23, + 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 16, 16, 31, 30, 30, 30, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 31, 30, + 29, 29, 27, 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 28, 27, 26, 26, 24, 24, + 22, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, + 19, 19, 18, 18, 17, 17, 17, 17, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, + 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 18, + 18, 17, 17, 17, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, + 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, + 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, + 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 22, 22, 22, 22, 21, 20, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, + 15, 15, 21, 22, 22, 23, 23, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, + 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 
14, 15, 14, 14, 14, 15, 20, 21, 22, 22, 22, 22, + 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, 22, 21, 20, 19, + 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, + 14, 13, 14, 14, 19, 20, 20, 21, 21, 21, 22, 21, 20, 19, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 19, 20, 20, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, + 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, + 13, 13, 12, 12, 12, 13, 12, 13, 13, 12, 18, 19, 19, 20, 20, 20, 20, 20, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 20, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, + 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 12, 11, 16, 17, 17, 18, 18, 18, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, + 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, + 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 15, 16, 16, 17, 17, 17, 17, 17, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 
15, 16, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 10, 14, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, + 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + /* Size 4x8 */ + 33, 28, 22, 23, 21, 19, 17, 16, 22, 22, 20, 19, 19, 18, 17, 16, 19, 20, + 19, 16, 14, 13, 13, 13, 16, 17, 17, 15, 13, 12, 11, 11, + /* Size 8x4 */ + 33, 22, 19, 16, 28, 22, 20, 17, 22, 20, 19, 17, 23, 19, 16, 15, 21, 19, + 14, 13, 19, 18, 13, 12, 17, 17, 13, 11, 16, 16, 13, 11, + /* Size 8x16 */ + 32, 33, 31, 28, 24, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 15, 31, 30, + 28, 24, 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 23, 23, 22, 22, + 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 21, 22, 23, 23, 21, 19, + 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 20, 21, 22, 22, 20, 19, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 14, 18, 19, 20, 20, 19, 18, 17, 16, 14, 14, + 13, 13, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, 16, 14, 14, 13, 12, 12, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, + 11, 10, + /* Size 16x8 */ + 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 21, 19, 17, 16, 31, 28, + 22, 23, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, 19, 17, 24, 23, 21, 21, + 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, 20, 18, 17, 17, + 16, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, + 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 12, 17, 19, + 18, 16, 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, + 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 16, 17, 15, 14, 12, + 11, 10, + /* Size 16x32 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 33, 33, 32, 32, + 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 19, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 31, 30, 30, 29, 28, 27, 24, 24, + 23, 22, 22, 22, 22, 23, 22, 
22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 28, 27, 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, + 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 23, 22, + 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 15, + 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, + 18, 19, 20, 20, 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, + 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, 13, 13, 13, 12, 18, 19, 19, 20, + 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, + 11, 11, 12, 12, 12, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, 17, 16, + 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10, 15, 16, + 16, 17, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, 17, 17, + 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, + /* Size 32x16 */ + 32, 33, 31, 28, 23, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 15, 33, 33, + 30, 27, 23, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 16, 33, 32, 30, 26, + 23, 22, 22, 22, 21, 20, 19, 17, 17, 17, 16, 16, 
34, 32, 29, 26, 23, 22, + 23, 22, 21, 20, 20, 18, 18, 17, 17, 17, 31, 29, 28, 24, 22, 22, 23, 22, + 22, 20, 20, 18, 18, 17, 17, 17, 31, 28, 27, 24, 22, 22, 22, 22, 22, 20, + 20, 18, 18, 17, 17, 17, 28, 26, 24, 22, 22, 22, 23, 22, 22, 21, 20, 19, + 19, 18, 17, 17, 26, 25, 24, 22, 21, 21, 22, 22, 21, 20, 20, 19, 18, 18, + 18, 17, 24, 24, 23, 22, 21, 20, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 18, 17, 17, 17, 17, 17, 21, 22, + 22, 21, 20, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 17, 21, 22, 22, 22, + 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 22, 22, 20, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 16, 21, 23, 23, 22, 20, 19, 18, 17, + 17, 16, 16, 15, 15, 15, 15, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, + 16, 15, 14, 15, 14, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, 15, 14, + 14, 14, 14, 14, 20, 21, 22, 22, 19, 19, 17, 16, 16, 15, 14, 14, 14, 14, + 14, 14, 19, 21, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 14, 13, + 19, 20, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 20, + 20, 20, 19, 18, 16, 16, 15, 14, 13, 13, 12, 13, 13, 13, 18, 20, 20, 20, + 19, 18, 16, 16, 15, 14, 13, 12, 12, 12, 12, 13, 17, 19, 19, 20, 18, 18, + 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 15, + 14, 13, 13, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 17, 15, 15, 14, 13, + 12, 12, 11, 11, 12, 12, 16, 18, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, + 11, 11, 11, 12, 16, 17, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, 11, 11, + 11, 11, 16, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, + 15, 17, 17, 18, 17, 16, 15, 15, 13, 13, 12, 11, 11, 11, 11, 11, 15, 17, + 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 15, 16, 17, 17, + 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 15, 15, 14, + 14, 12, 12, 11, 11, 10, 10, 10, + /* Size 4x16 */ + 33, 32, 29, 26, 24, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 16, 21, 22, 
+ 22, 22, 20, 19, 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 18, 20, 20, 21, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 13, 13, 15, 17, 17, 18, 17, 16, + 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, + /* Size 16x4 */ + 33, 21, 18, 15, 32, 22, 20, 17, 29, 22, 20, 17, 26, 22, 21, 18, 24, 20, + 19, 17, 22, 19, 18, 16, 23, 19, 17, 16, 22, 19, 16, 15, 21, 19, 15, 14, + 20, 19, 14, 13, 20, 18, 14, 12, 18, 17, 13, 12, 18, 17, 13, 11, 17, 16, + 12, 11, 17, 16, 13, 11, 16, 16, 13, 11, + /* Size 8x32 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 31, 30, 30, 29, + 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 23, 23, 23, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, 18, 19, + 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, + 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 10, 10, + /* Size 32x8 */ + 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 20, 19, 17, 16, 33, 30, + 23, 22, 21, 19, 17, 16, 34, 29, 23, 23, 21, 20, 18, 17, 31, 28, 22, 23, + 22, 20, 18, 17, 31, 27, 22, 22, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, + 19, 17, 26, 24, 21, 22, 21, 20, 18, 18, 24, 23, 21, 21, 20, 19, 18, 17, + 22, 22, 20, 19, 19, 18, 17, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, + 20, 18, 18, 17, 16, 16, 21, 22, 20, 18, 17, 17, 16, 15, 21, 23, 
20, 18, + 17, 16, 15, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 20, 17, 16, 15, + 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 14, 13, 14, + 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 13, 18, 20, + 19, 16, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 17, 19, 18, 16, + 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 12, 16, 18, 17, 15, 14, 12, + 11, 11, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, 13, 12, 11, 11, + 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, + 17, 15, 13, 12, 11, 10, 15, 16, 17, 15, 14, 12, 11, 10, 15, 16, 17, 15, + 14, 12, 11, 10 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 30, 19, 14, 30, 21, 16, 13, 19, 16, 11, 9, 14, 13, 9, 7, + /* Size 8x8 */ + 32, 32, 30, 26, 20, 17, 13, 12, 32, 31, 29, 26, 21, 17, 14, 13, 30, 29, + 26, 22, 19, 16, 14, 13, 26, 26, 22, 18, 16, 14, 12, 11, 20, 21, 19, 16, + 13, 11, 10, 10, 17, 17, 16, 14, 11, 10, 9, 8, 13, 14, 14, 12, 10, 9, 8, + 7, 12, 13, 13, 11, 10, 8, 7, 7, + /* Size 16x16 */ + 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 11, 33, 32, + 32, 32, 31, 29, 27, 24, 22, 20, 18, 16, 15, 13, 13, 12, 33, 32, 32, 31, + 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 32, 32, 31, 30, 28, 28, + 26, 24, 23, 21, 19, 17, 16, 14, 14, 13, 31, 31, 30, 28, 27, 24, 23, 22, + 20, 19, 18, 16, 15, 14, 13, 13, 28, 29, 29, 28, 24, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 12, 26, 27, 27, 26, 23, 20, 19, 18, 17, 16, 15, 14, + 13, 12, 12, 11, 23, 24, 25, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, + 11, 11, 21, 22, 23, 23, 20, 18, 17, 15, 14, 13, 13, 12, 11, 10, 10, 10, + 19, 20, 21, 21, 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 17, 18, + 19, 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 16, 17, 17, 16, + 15, 14, 13, 12, 11, 10, 10, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 13, 12, + 11, 10, 10, 9, 8, 8, 8, 7, 13, 13, 14, 14, 14, 13, 12, 11, 10, 10, 9, 8, + 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 11, + 12, 12, 13, 13, 
12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 31, 30, 28, 28, 26, 25, 23, 22, 21, 20, + 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 20, 18, 18, 17, + 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 29, 29, 27, 26, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 14, + 13, 13, 13, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, + 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, + 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, + 32, 32, 31, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 23, 23, 21, 20, 19, + 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 13, 12, 32, 32, 32, 32, 31, 31, + 30, 29, 28, 28, 28, 27, 26, 26, 24, 23, 23, 21, 21, 19, 19, 18, 17, 16, + 16, 15, 14, 14, 14, 13, 13, 12, 32, 32, 32, 32, 31, 30, 29, 29, 28, 28, + 27, 27, 26, 25, 24, 23, 22, 21, 21, 19, 19, 18, 17, 16, 16, 15, 14, 14, + 14, 13, 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, + 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, + 30, 30, 30, 31, 30, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 21, 20, 19, + 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 28, 29, 29, 30, + 29, 28, 28, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 28, 29, 29, 30, 29, 28, 27, 27, + 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 26, 27, 27, 28, 27, 26, 26, 26, 23, 23, 20, 20, + 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 25, 26, 26, 27, 26, 26, 26, 25, 23, 22, 20, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 23, 24, + 24, 25, 25, 24, 24, 24, 22, 22, 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 12, 
12, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 24, 24, 23, + 23, 23, 21, 21, 19, 19, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, 21, 22, 22, 23, 23, 23, 23, 22, 20, 20, + 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 10, 10, 20, 20, 21, 21, 21, 21, 21, 21, 20, 19, 17, 17, 16, 16, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, + 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 18, 18, 19, 19, 19, + 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 9, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 9, 9, 16, 17, 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, + 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 16, 16, 16, 17, + 17, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 8, 8, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 12, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 8, 13, 13, 14, 14, 14, + 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 7, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 13, 13, + 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 12, + 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, + 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 11, 12, 12, 12, 12, + 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 7, 7, 6, 6, 11, 
12, 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, + 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, + /* Size 4x8 */ + 32, 32, 30, 27, 21, 17, 14, 13, 29, 28, 24, 20, 17, 15, 13, 12, 20, 20, + 19, 15, 13, 11, 10, 9, 13, 14, 14, 12, 10, 9, 8, 7, + /* Size 8x4 */ + 32, 29, 20, 13, 32, 28, 20, 14, 30, 24, 19, 14, 27, 20, 15, 12, 21, 17, + 13, 10, 17, 15, 11, 9, 14, 13, 10, 8, 13, 12, 9, 7, + /* Size 8x16 */ + 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 11, 33, 32, + 32, 31, 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 31, 31, 30, 28, + 27, 24, 23, 22, 20, 19, 18, 16, 15, 14, 13, 13, 26, 26, 27, 26, 23, 20, + 19, 17, 17, 16, 15, 14, 13, 12, 11, 11, 20, 21, 22, 21, 20, 18, 16, 15, + 14, 13, 12, 11, 11, 10, 10, 10, 16, 17, 17, 18, 17, 15, 14, 13, 12, 11, + 10, 10, 9, 9, 8, 9, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, + 7, 7, 12, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, + /* Size 16x8 */ + 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32, + 30, 27, 22, 17, 14, 13, 32, 31, 28, 26, 21, 18, 15, 13, 31, 30, 27, 23, + 20, 17, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 26, 27, 23, 19, 16, 14, + 12, 12, 23, 25, 22, 17, 15, 13, 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, + 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 10, 9, 9, 16, 17, 16, + 14, 11, 10, 9, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, 14, 12, 10, 9, 8, + 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, 13, 11, 10, 9, 7, 7, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20, + 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, + 14, 14, 13, 13, 12, 12, 32, 32, 32, 31, 31, 30, 29, 29, 28, 28, 27, 27, + 26, 25, 24, 23, 22, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, + 13, 
13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, 22, 21, + 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 28, 29, + 29, 30, 29, 28, 28, 27, 25, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 26, 26, 26, 27, 27, 26, + 26, 25, 23, 23, 20, 20, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 22, 21, + 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, + 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, + 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 16, 17, 17, 17, 17, + 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 9, 9, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, + 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, + 8, 13, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 8, 13, 13, 13, 14, 14, 14, + 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, + 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, + 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, + /* Size 32x16 */ + 32, 33, 33, 32, 31, 28, 26, 23, 20, 19, 16, 16, 13, 13, 12, 11, 33, 32, + 32, 32, 31, 29, 26, 24, 21, 20, 17, 16, 14, 13, 12, 12, 33, 32, 32, 32, + 31, 29, 26, 24, 21, 20, 17, 17, 14, 13, 12, 12, 33, 32, 32, 31, 31, 30, + 27, 25, 22, 21, 17, 17, 14, 14, 13, 13, 33, 32, 32, 31, 30, 29, 27, 25, + 22, 21, 17, 17, 14, 14, 13, 13, 32, 32, 31, 30, 29, 28, 26, 24, 21, 20, + 17, 17, 14, 14, 13, 13, 32, 32, 31, 29, 28, 28, 26, 24, 21, 21, 18, 17, + 15, 14, 13, 13, 32, 31, 31, 
29, 28, 27, 25, 24, 21, 21, 18, 17, 15, 15, + 14, 13, 31, 31, 30, 28, 27, 25, 23, 22, 20, 19, 17, 16, 14, 14, 13, 13, + 30, 30, 30, 28, 26, 24, 23, 21, 19, 19, 16, 16, 14, 14, 13, 12, 28, 30, + 29, 27, 24, 21, 20, 19, 18, 17, 15, 15, 13, 13, 12, 12, 28, 29, 29, 27, + 24, 21, 20, 19, 17, 17, 15, 15, 13, 13, 12, 12, 26, 28, 27, 26, 23, 20, + 19, 18, 16, 16, 14, 14, 12, 12, 12, 12, 26, 27, 26, 25, 23, 20, 18, 17, + 16, 15, 14, 13, 12, 12, 11, 11, 23, 25, 25, 24, 22, 19, 17, 16, 15, 14, + 13, 13, 11, 11, 11, 11, 22, 24, 24, 23, 21, 19, 17, 16, 14, 14, 12, 12, + 11, 11, 11, 10, 21, 23, 23, 22, 20, 18, 17, 15, 14, 13, 12, 12, 11, 10, + 10, 10, 20, 21, 21, 21, 20, 17, 16, 15, 13, 13, 11, 11, 10, 10, 10, 10, + 19, 21, 21, 20, 19, 17, 16, 14, 13, 12, 11, 11, 10, 10, 9, 10, 18, 19, + 19, 19, 18, 16, 15, 14, 12, 12, 11, 10, 9, 9, 9, 9, 18, 19, 19, 19, 18, + 16, 15, 14, 12, 12, 10, 10, 9, 9, 9, 9, 16, 17, 17, 18, 17, 15, 14, 13, + 12, 11, 10, 10, 9, 9, 8, 8, 16, 17, 17, 17, 16, 15, 14, 13, 11, 11, 10, + 10, 9, 8, 8, 8, 14, 16, 16, 16, 15, 14, 13, 12, 11, 11, 9, 9, 8, 8, 8, + 8, 14, 15, 15, 16, 15, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 13, 14, 14, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 13, 14, 14, 14, 14, 13, + 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, 14, 13, 12, 11, 10, + 10, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 7, + 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 7, 11, 12, + 12, 13, 13, 12, 11, 10, 10, 9, 9, 8, 7, 7, 7, 7, 11, 12, 12, 13, 13, 11, + 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, + /* Size 4x16 */ + 33, 32, 32, 32, 31, 30, 28, 25, 23, 21, 19, 17, 15, 14, 13, 12, 28, 29, + 29, 28, 25, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 19, 20, 21, 21, + 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 13, 13, 14, 14, 14, 13, + 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, + /* Size 16x4 */ + 33, 28, 19, 13, 32, 29, 20, 13, 32, 29, 21, 14, 32, 28, 21, 14, 31, 25, + 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, 23, 18, 13, 10, + 
21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 8, 15, 14, 10, 8, 14, 13, 10, + 7, 13, 12, 9, 7, 12, 12, 9, 7, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20, + 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 31, 31, 31, 31, 30, 29, 28, 28, + 27, 26, 24, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, + 14, 14, 13, 13, 13, 13, 26, 26, 26, 27, 27, 26, 26, 25, 23, 23, 20, 20, + 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, 15, 14, + 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 16, 17, + 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, + 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 9, 9, 13, 14, 14, 14, 14, 14, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 7, 7, 7, 8, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, + /* Size 32x8 */ + 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32, + 31, 26, 21, 17, 14, 12, 33, 32, 31, 27, 22, 17, 14, 13, 33, 32, 30, 27, + 22, 17, 14, 13, 32, 31, 29, 26, 21, 17, 14, 13, 32, 31, 28, 26, 21, 18, + 15, 13, 32, 31, 28, 25, 21, 18, 15, 14, 31, 30, 27, 23, 20, 17, 14, 13, + 30, 30, 26, 23, 19, 16, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 28, 29, + 24, 20, 17, 15, 13, 12, 26, 27, 23, 19, 16, 14, 12, 12, 26, 26, 23, 18, + 16, 14, 12, 11, 23, 25, 22, 17, 15, 13, 11, 11, 22, 24, 21, 17, 14, 12, + 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, 20, 21, 20, 16, 13, 11, 10, 10, + 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 11, 9, 9, 18, 19, 18, + 15, 12, 10, 9, 9, 16, 17, 17, 14, 12, 10, 9, 8, 16, 17, 16, 14, 11, 10, + 9, 8, 14, 16, 15, 13, 11, 9, 8, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, + 14, 12, 10, 9, 8, 7, 13, 
14, 14, 12, 10, 9, 8, 7, 12, 14, 14, 12, 10, 8, + 8, 7, 12, 13, 13, 11, 10, 8, 7, 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, + 13, 11, 10, 9, 7, 7, 11, 12, 13, 11, 10, 9, 8, 7 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 22, 21, 18, 22, 19, 19, 17, 21, 19, 15, 13, 18, 17, 13, 11, + /* Size 8x8 */ + 33, 30, 24, 22, 21, 19, 17, 16, 30, 26, 23, 22, 22, 20, 18, 17, 24, 23, + 21, 21, 20, 19, 18, 17, 22, 22, 21, 19, 18, 17, 16, 16, 21, 22, 20, 18, + 16, 15, 14, 14, 19, 20, 19, 17, 15, 13, 12, 12, 17, 18, 18, 16, 14, 12, + 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, + /* Size 16x16 */ + 32, 33, 33, 29, 26, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 33, 33, + 32, 28, 25, 22, 22, 22, 21, 21, 20, 19, 18, 17, 17, 16, 33, 32, 30, 26, + 24, 22, 22, 23, 22, 22, 21, 20, 19, 18, 17, 17, 29, 28, 26, 23, 22, 22, + 22, 23, 22, 22, 21, 20, 19, 18, 18, 17, 26, 25, 24, 22, 21, 20, 21, 21, + 21, 21, 20, 19, 19, 18, 17, 17, 21, 22, 22, 22, 20, 19, 19, 19, 19, 19, + 19, 18, 17, 17, 17, 17, 21, 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, + 17, 16, 16, 16, 21, 22, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, + 15, 15, 20, 21, 22, 22, 21, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 20, 21, 22, 22, 21, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 19, 20, + 21, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 18, 19, 20, 20, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 17, 18, 19, 19, 19, 17, + 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 16, 17, 18, 18, 18, 17, 16, 15, + 14, 14, 13, 12, 12, 11, 11, 11, 16, 17, 17, 18, 17, 17, 16, 15, 14, 13, + 13, 12, 12, 11, 11, 11, 15, 16, 17, 17, 17, 17, 16, 15, 14, 13, 13, 12, + 12, 11, 11, 10, + /* Size 32x32 */ + 32, 33, 33, 34, 33, 31, 29, 28, 26, 25, 21, 21, 21, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 33, + 32, 30, 28, 27, 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 33, 33, 33, 33, 32, 29, 28, 26, + 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 
19, 19, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 34, 33, 33, 32, 31, 29, 27, 26, 24, 24, 22, 22, + 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 33, 32, 32, 31, 30, 28, 26, 25, 24, 24, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 31, 30, + 29, 29, 28, 26, 25, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 29, 28, 28, 27, 26, 25, + 23, 22, 22, 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 19, 19, 18, 18, 18, 18, 17, 17, 28, 27, 26, 26, 25, 24, 22, 22, 22, 22, + 21, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, + 18, 18, 18, 18, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 21, 21, 21, + 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 25, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, + 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, + 17, 17, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, + 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 16, 16, 21, 22, 22, 23, 23, 22, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, + 22, 23, 23, 23, 23, 23, 21, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 20, 22, 22, 23, 22, 22, + 22, 22, 21, 21, 19, 19, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, + 14, 14, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 
14, 14, 14, 14, 14, + 20, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 14, 19, 20, 20, 21, + 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, + 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, + 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, + 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, + 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, + 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 17, 18, + 18, 19, 19, 19, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, + 19, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, + 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, + 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 16, 16, 17, 17, + 17, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 10, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, + /* Size 4x8 */ + 33, 28, 24, 23, 22, 20, 18, 17, 22, 22, 20, 19, 19, 18, 17, 16, 20, 22, + 20, 18, 16, 15, 14, 13, 17, 18, 18, 16, 14, 
12, 11, 11, + /* Size 8x4 */ + 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 23, 19, 18, 16, 22, 19, + 16, 14, 20, 18, 15, 12, 18, 17, 14, 11, 17, 16, 13, 11, + /* Size 8x16 */ + 32, 33, 33, 29, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 32, 31, + 29, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 17, 26, 25, 24, 22, + 21, 20, 21, 21, 21, 20, 20, 19, 18, 18, 17, 17, 21, 22, 22, 22, 21, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 20, 21, 22, 22, 21, 19, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 14, 18, 19, 20, 20, 20, 18, 17, 16, 15, 14, + 14, 13, 13, 12, 12, 13, 16, 17, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, + 12, 12, 11, 12, 15, 16, 17, 18, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, + 11, 11, + /* Size 16x8 */ + 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 29, + 24, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, + 21, 20, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, 21, 19, 18, 17, + 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, + 20, 21, 20, 18, 16, 14, 14, 13, 19, 20, 20, 17, 15, 14, 13, 13, 18, 20, + 19, 17, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 16, 18, 18, 16, + 14, 12, 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, + 12, 11, + /* Size 16x32 */ + 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 32, + 31, 28, 27, 26, 24, 24, 22, 22, 22, 23, 23, 22, 22, 22, 21, 20, 20, 20, + 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 32, 31, 31, 31, 29, 28, 26, 25, + 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 28, 27, 27, 26, 25, 24, 23, 22, 22, 22, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 
19, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 21, 22, 22, 23, 22, 22, + 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, + 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 22, 23, 23, 21, 21, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 20, 20, 21, 21, 21, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 18, 19, 19, 20, + 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, + 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, + 16, 17, 17, 17, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, + 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 11, 10, + /* Size 32x16 */ + 32, 33, 32, 28, 26, 21, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 33, 33, + 31, 27, 25, 22, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 33, 33, 31, 27, + 25, 22, 22, 22, 21, 21, 19, 19, 17, 17, 16, 16, 34, 32, 31, 26, 24, 22, + 23, 23, 22, 21, 20, 20, 18, 18, 17, 17, 33, 31, 29, 25, 24, 22, 22, 23, + 22, 21, 20, 20, 18, 18, 17, 17, 31, 28, 28, 24, 23, 22, 22, 22, 22, 22, + 20, 20, 18, 18, 17, 17, 29, 27, 26, 23, 22, 22, 22, 23, 22, 22, 20, 20, + 19, 18, 18, 17, 28, 26, 25, 22, 22, 22, 22, 23, 22, 22, 20, 20, 19, 19, + 18, 18, 25, 24, 24, 22, 21, 21, 21, 21, 21, 20, 20, 19, 18, 18, 17, 18, + 24, 24, 
24, 22, 21, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 21, 22, + 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 22, 21, + 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 21, 22, 22, 22, 21, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 21, 23, 22, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 16, 16, 16, 15, 21, 23, 23, 22, 21, 19, 18, 18, 17, 17, + 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 21, 19, 18, 17, 17, 17, 16, 16, + 15, 15, 15, 15, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, + 14, 14, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, + 20, 21, 21, 22, 20, 19, 18, 17, 16, 16, 14, 14, 14, 14, 13, 14, 19, 20, + 21, 21, 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 19, 20, 20, 21, + 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 18, 20, 20, 20, 20, 18, + 17, 16, 15, 15, 13, 13, 12, 12, 12, 12, 18, 20, 20, 20, 19, 18, 17, 16, + 15, 14, 13, 13, 12, 12, 12, 12, 17, 19, 19, 20, 19, 18, 17, 16, 14, 14, + 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 16, 14, 14, 13, 13, + 12, 12, 12, 12, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, + 11, 11, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, 11, 11, + 16, 17, 18, 18, 18, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, 11, 16, 17, + 17, 18, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, + 17, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, + 16, 14, 14, 13, 13, 12, 12, 11, 11, 11, 15, 17, 17, 17, 17, 16, 16, 14, + 14, 13, 13, 12, 12, 11, 11, 10, + /* Size 4x16 */ + 33, 33, 31, 27, 24, 22, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 21, 22, + 22, 22, 21, 19, 19, 19, 19, 19, 19, 18, 17, 17, 17, 16, 20, 21, 21, 22, + 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 16, 17, 18, 18, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 11, 11, 11, + /* Size 16x4 */ + 33, 21, 20, 16, 33, 22, 21, 17, 31, 22, 21, 18, 27, 22, 22, 18, 24, 21, + 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, 22, 19, 16, 14, + 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 14, 
12, 18, 17, 14, 12, 18, 17, + 14, 11, 17, 17, 13, 11, 17, 16, 13, 11, + /* Size 8x32 */ + 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 32, 31, 31, 31, + 29, 28, 26, 25, 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 26, 25, 25, 24, 24, 23, 22, 22, + 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, + 18, 18, 17, 17, 17, 17, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, + 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, + 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 18, 19, + 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 16, 17, 17, 18, 18, 18, + 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 12, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, + /* Size 32x8 */ + 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 31, + 25, 22, 21, 19, 17, 16, 34, 31, 24, 23, 22, 20, 18, 17, 33, 29, 24, 22, + 22, 20, 18, 17, 31, 28, 23, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, + 19, 18, 28, 25, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, 21, 20, 18, 17, + 24, 24, 21, 21, 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, + 20, 19, 19, 18, 17, 16, 21, 22, 21, 19, 18, 17, 16, 16, 21, 22, 21, 19, + 18, 17, 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 21, 22, 21, 18, 17, 16, + 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, 20, 22, 21, 18, 16, 15, 14, 14, + 20, 21, 20, 18, 16, 14, 14, 13, 19, 21, 20, 17, 15, 14, 13, 13, 19, 20, + 20, 17, 15, 14, 13, 13, 18, 20, 20, 17, 15, 13, 12, 12, 18, 20, 19, 17, + 15, 13, 12, 12, 17, 19, 19, 17, 14, 13, 12, 12, 17, 19, 18, 16, 14, 13, + 12, 12, 16, 18, 18, 16, 
14, 12, 12, 11, 16, 18, 18, 16, 14, 12, 12, 11, + 16, 18, 18, 16, 14, 12, 11, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, + 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, 12, 11, 15, 17, 17, 16, + 14, 13, 12, 11 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 30, 21, 14, 30, 21, 17, 13, 21, 17, 12, 10, 14, 13, 10, 8, + /* Size 8x8 */ + 32, 32, 30, 27, 22, 18, 15, 13, 32, 31, 29, 26, 23, 19, 16, 14, 30, 29, + 26, 23, 20, 18, 15, 13, 27, 26, 23, 19, 17, 15, 13, 12, 22, 23, 20, 17, + 14, 13, 11, 10, 18, 19, 18, 15, 13, 11, 10, 9, 15, 16, 15, 13, 11, 10, + 9, 8, 13, 14, 13, 12, 10, 9, 8, 7, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 30, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 33, 32, + 32, 32, 32, 30, 29, 27, 24, 22, 20, 18, 17, 15, 13, 13, 33, 32, 32, 32, + 32, 31, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 33, 32, 32, 31, 30, 29, + 28, 26, 24, 23, 20, 19, 17, 16, 14, 14, 32, 32, 32, 30, 29, 28, 27, 26, + 24, 22, 21, 19, 18, 16, 15, 14, 30, 30, 31, 29, 28, 26, 24, 23, 22, 20, + 19, 18, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 19, 18, 17, 16, 15, 14, 13, + 12, 12, 23, 24, 25, 24, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, 11, + 21, 22, 23, 23, 22, 20, 18, 17, 15, 14, 13, 13, 12, 11, 11, 10, 19, 20, + 21, 20, 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 17, 18, 19, 19, + 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 16, 17, 17, 17, 18, 16, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 14, 15, 16, 16, 16, 15, 14, 13, 12, + 11, 11, 10, 9, 9, 8, 8, 13, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, + 9, 8, 8, 7, 12, 13, 14, 14, 14, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, + 18, 17, 17, 15, 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 30, 30, 29, 29, 27, 27, 24, 
24, 22, 22, 20, 20, 18, 18, 17, 17, 15, + 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, + 14, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, + 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, + 20, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 31, + 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, 20, 19, 19, 17, + 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, + 28, 27, 27, 26, 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, + 15, 14, 14, 14, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, + 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, + 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, + 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 30, 30, 30, 31, + 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, 20, 19, 19, 18, + 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 28, 29, 29, 30, 30, 28, 28, 27, + 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 13, 12, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 26, 27, + 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 23, 24, 24, 25, 25, 24, + 24, 24, 24, 22, 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 12, 12, 11, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, + 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, + 17, 15, 15, 14, 14, 13, 13, 13, 
13, 12, 12, 11, 11, 11, 11, 10, 10, 10, + 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, 17, 15, 15, 14, + 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 19, 20, 20, 21, + 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 20, 21, + 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, + 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 16, 17, 17, 17, + 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, + 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 14, 15, 15, 16, + 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, + 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, + 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, + 7, 7, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 14, + 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 12, + 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + /* Size 4x8 */ + 32, 32, 30, 28, 23, 19, 16, 14, 29, 28, 24, 20, 18, 16, 14, 13, 20, 20, + 19, 16, 13, 12, 11, 10, 14, 14, 14, 12, 11, 9, 8, 8, + /* Size 8x4 */ + 32, 29, 20, 14, 32, 28, 20, 14, 30, 24, 19, 14, 28, 20, 
16, 12, 23, 18, + 13, 11, 19, 16, 12, 9, 16, 14, 11, 8, 14, 13, 10, 8, + /* Size 8x16 */ + 32, 33, 33, 32, 32, 30, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 33, 32, + 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 32, 32, 31, 30, + 29, 28, 27, 26, 24, 22, 20, 19, 18, 16, 15, 14, 28, 29, 30, 28, 27, 24, + 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 23, 24, 25, 24, 24, 21, 19, 18, + 16, 15, 14, 14, 13, 12, 11, 11, 19, 20, 21, 20, 21, 19, 17, 16, 14, 13, + 12, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 16, 15, 14, 13, 12, 11, 10, + 10, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 8, + /* Size 16x8 */ + 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32, + 31, 30, 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 31, 29, 27, + 24, 21, 18, 15, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, 27, 21, 19, 17, + 15, 13, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, 13, 11, + 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 18, 19, + 19, 16, 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 14, 16, 16, 14, + 12, 11, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, 11, 10, 8, + 8, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, + 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, + 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, + 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, + 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, + 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 28, 29, + 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, + 28, 27, 27, 24, 24, 
21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, + 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, + 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, + 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, + 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, 17, 18, + 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, + 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 13, + 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 14, 14, 14, 15, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 8, 8, 8, 7, 12, 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 12, 12, + 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + /* Size 32x16 */ + 32, 33, 33, 32, 32, 28, 28, 23, 23, 19, 19, 16, 16, 13, 13, 12, 33, 32, + 32, 32, 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 32, + 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 31, 31, 30, + 30, 25, 25, 21, 21, 17, 17, 14, 14, 13, 33, 32, 32, 31, 31, 30, 30, 25, + 25, 21, 21, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, + 20, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, 20, 17, + 17, 14, 14, 13, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, + 15, 14, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, 15, 14, + 30, 30, 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 30, 30, + 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 28, 30, 30, 27, + 27, 21, 21, 19, 19, 
17, 17, 15, 15, 13, 13, 12, 28, 30, 30, 27, 27, 21, + 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 26, 28, 28, 26, 26, 20, 20, 18, + 18, 16, 16, 14, 14, 12, 12, 12, 26, 28, 28, 26, 26, 20, 20, 18, 18, 16, + 16, 14, 14, 12, 12, 12, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, + 13, 11, 11, 11, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, 13, 11, + 11, 11, 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, + 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, 19, 21, + 21, 20, 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 19, 21, 21, 20, + 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 18, 19, 19, 19, 19, 16, + 16, 14, 14, 12, 12, 10, 10, 9, 9, 9, 18, 19, 19, 19, 19, 16, 16, 14, 14, + 12, 12, 10, 10, 9, 9, 9, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, + 10, 9, 9, 8, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, 10, 9, 9, + 8, 14, 16, 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 14, 16, + 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 13, 14, 14, 15, 15, + 13, 13, 11, 11, 10, 10, 9, 9, 8, 8, 7, 13, 14, 14, 15, 15, 13, 13, 11, + 11, 10, 10, 9, 9, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, + 8, 8, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, 8, 8, 8, 8, + 7, 12, 13, 13, 13, 13, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 7, + /* Size 4x16 */ + 33, 32, 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 28, 29, + 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 19, 20, 21, 20, + 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 13, 14, 14, 14, 15, 14, + 13, 12, 11, 11, 10, 9, 9, 8, 8, 8, + /* Size 16x4 */ + 33, 28, 19, 13, 32, 29, 20, 14, 32, 30, 21, 14, 32, 28, 20, 14, 31, 27, + 21, 15, 30, 24, 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, + 23, 18, 13, 11, 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 9, 16, 14, + 11, 8, 14, 13, 10, 8, 14, 13, 10, 8, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 
32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, + 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 19, 20, + 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, + 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 9, 8, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, + 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, + /* Size 32x8 */ + 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32, + 32, 29, 24, 20, 17, 14, 33, 32, 31, 30, 25, 21, 17, 14, 33, 32, 31, 30, + 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 32, 30, 28, 24, 20, + 17, 14, 32, 31, 29, 27, 24, 21, 18, 15, 32, 31, 29, 27, 24, 21, 18, 15, + 30, 30, 28, 24, 21, 19, 16, 14, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, + 27, 21, 19, 17, 15, 13, 28, 30, 27, 21, 19, 17, 15, 13, 26, 28, 26, 20, + 18, 16, 14, 12, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, + 13, 11, 23, 25, 24, 19, 16, 14, 13, 11, 21, 23, 22, 18, 15, 13, 12, 11, + 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 19, 21, + 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 10, 9, 18, 19, 19, 16, + 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 16, 17, 18, 15, 13, 11, + 10, 9, 14, 16, 16, 14, 12, 11, 9, 8, 14, 16, 16, 14, 12, 11, 9, 8, 13, + 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, + 11, 10, 8, 8, 12, 14, 14, 13, 11, 10, 8, 8, 12, 13, 13, 12, 11, 9, 8, + 7 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 22, 22, 
18, 22, 19, 19, 17, 22, 19, 16, 14, 18, 17, 14, 12, + /* Size 8x8 */ + 33, 30, 24, 22, 21, 20, 18, 17, 30, 26, 23, 22, 22, 21, 19, 18, 24, 23, + 21, 21, 20, 20, 19, 18, 22, 22, 21, 19, 18, 18, 17, 16, 21, 22, 20, 18, + 17, 16, 15, 14, 20, 21, 20, 18, 16, 14, 14, 13, 18, 19, 19, 17, 15, 14, + 12, 12, 17, 18, 18, 16, 14, 13, 12, 11, + /* Size 16x16 */ + 32, 33, 34, 31, 28, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33, + 33, 30, 27, 24, 22, 22, 22, 21, 20, 20, 19, 18, 17, 17, 34, 33, 32, 29, + 26, 24, 22, 23, 23, 22, 22, 21, 20, 19, 18, 18, 31, 30, 29, 26, 24, 23, + 22, 22, 23, 22, 22, 21, 20, 19, 18, 18, 28, 27, 26, 24, 22, 22, 21, 22, + 23, 22, 22, 21, 20, 20, 19, 19, 25, 24, 24, 23, 22, 21, 20, 21, 21, 20, + 20, 20, 19, 19, 18, 18, 21, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, + 18, 18, 17, 17, 21, 22, 23, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, 17, + 16, 16, 21, 22, 23, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, + 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 20, + 22, 22, 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 19, 20, 21, 21, + 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 18, 19, 20, 20, 20, 19, + 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 17, 18, 19, 19, 20, 19, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, + 14, 13, 12, 12, 12, 11, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, + 12, 12, 11, 11, + /* Size 32x32 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 33, 33, 33, 33, + 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 33, 33, 33, 33, 33, 30, 30, 27, + 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, + 22, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, + 18, 17, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 
22, 22, 23, 23, 23, + 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, + 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, 30, 29, 29, 26, + 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 18, 18, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, + 22, 21, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 19, 19, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, + 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 18, + 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, + 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 25, 24, 24, 24, + 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, + 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, + 17, 17, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, + 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, + 23, 23, 23, 21, 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, + 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, + 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 22, + 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 
16, 16, 16, 16, 15, + 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 22, 22, 22, 22, 22, + 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, + 14, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, + 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 18, 19, + 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, + 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, + 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + /* Size 4x8 */ + 33, 28, 24, 22, 22, 20, 19, 17, 22, 22, 20, 19, 19, 19, 18, 17, 20, 22, + 20, 18, 16, 15, 14, 14, 17, 18, 18, 16, 14, 13, 12, 11, + /* Size 8x4 */ + 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 22, 19, 18, 16, 22, 19, + 16, 14, 20, 19, 15, 13, 19, 18, 14, 12, 17, 17, 14, 11, + /* Size 8x16 */ + 32, 33, 34, 31, 28, 24, 21, 21, 21, 20, 
20, 19, 18, 17, 16, 16, 33, 33, + 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 28, 27, 26, 24, + 22, 22, 21, 22, 22, 22, 22, 21, 20, 20, 19, 18, 21, 22, 22, 22, 22, 20, + 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 23, 22, 23, 21, 19, 18, + 18, 17, 17, 17, 16, 16, 15, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, 20, 19, 18, 17, 16, 15, 14, 14, + 13, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12, + 12, 11, + /* Size 16x8 */ + 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 34, 32, + 26, 22, 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 28, 26, 22, 22, + 23, 22, 20, 19, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, 21, 19, 19, 19, + 18, 17, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, 16, 15, + 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 19, 20, + 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 17, 19, 20, 18, + 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, 18, 17, 15, 14, + 12, 11, + /* Size 16x32 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 33, 33, 33, 32, 32, 28, 28, 26, + 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, + 18, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 23, 23, 22, 
22, 23, 23, 21, + 21, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, + 20, 20, 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 21, + 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, + 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 16, 17, 17, 17, + 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 11, + /* Size 32x16 */ + 32, 33, 33, 28, 28, 21, 21, 21, 21, 20, 20, 18, 18, 16, 16, 16, 33, 33, + 33, 27, 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 33, 33, 33, 27, + 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 34, 32, 32, 26, 26, 22, + 22, 23, 23, 21, 21, 20, 20, 18, 18, 17, 34, 32, 32, 26, 26, 22, 22, 23, + 23, 21, 21, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, + 22, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, 22, 20, + 20, 18, 18, 17, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, + 19, 18, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, 19, 18, + 24, 24, 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 24, 24, + 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 21, 22, 22, 21, + 21, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 21, 19, + 19, 
19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 19, 19, 18, + 18, 18, 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 22, 19, 19, 18, 18, 18, + 18, 17, 17, 16, 16, 16, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, + 16, 15, 15, 15, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 15, 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, 20, 21, + 21, 22, 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 20, 21, 21, 22, + 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 19, + 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 19, 20, 20, 21, 21, 19, 19, 17, + 17, 15, 15, 14, 14, 13, 13, 13, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, + 15, 13, 13, 12, 12, 12, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, 15, 13, + 13, 12, 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, + 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 16, 18, 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 18, + 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 17, 17, 18, + 18, 17, 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 17, + 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 16, 16, 15, + 15, 13, 13, 12, 12, 11, 11, 11, + /* Size 4x16 */ + 33, 33, 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 21, 22, + 22, 22, 22, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 20, 20, 21, 22, + 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 16, 17, 18, 18, 19, 18, + 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, + /* Size 16x4 */ + 33, 21, 20, 16, 33, 22, 20, 17, 32, 22, 21, 18, 28, 22, 22, 18, 26, 22, + 22, 19, 24, 20, 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, + 22, 19, 16, 14, 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 15, 12, 19, 18, + 14, 12, 18, 17, 14, 12, 17, 17, 14, 11, + /* Size 8x32 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 
33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, + 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 20, 20, + 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, + 12, 11, 11, 11, + /* Size 32x8 */ + 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 33, 33, + 27, 22, 22, 20, 19, 17, 34, 32, 26, 22, 23, 21, 20, 18, 34, 32, 26, 22, + 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 31, 28, 24, 22, 22, 22, + 20, 18, 28, 26, 22, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 20, 19, + 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, + 21, 19, 19, 19, 18, 17, 21, 22, 21, 19, 19, 19, 18, 17, 21, 22, 22, 19, + 18, 18, 17, 16, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, + 16, 15, 21, 23, 22, 19, 18, 17, 16, 15, 20, 22, 22, 19, 17, 16, 15, 14, + 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 20, 21, + 22, 19, 17, 16, 14, 14, 19, 20, 21, 19, 17, 15, 14, 13, 19, 20, 21, 19, + 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 18, 20, 20, 18, 16, 15, + 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, + 16, 18, 19, 17, 15, 14, 12, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, + 18, 17, 15, 14, 12, 11, 16, 17, 18, 17, 15, 14, 12, 11, 16, 17, 18, 16, + 15, 13, 12, 11 }, + 
}, + { + { /* Luma */ + /* Size 4x4 */ + 32, 31, 23, 17, 31, 26, 20, 16, 23, 20, 14, 12, 17, 16, 12, 9, + /* Size 8x8 */ + 33, 32, 32, 29, 24, 20, 17, 15, 32, 32, 31, 29, 25, 21, 18, 16, 32, 31, + 29, 27, 24, 21, 18, 16, 29, 29, 27, 21, 19, 17, 16, 14, 24, 25, 24, 19, + 16, 14, 13, 12, 20, 21, 21, 17, 14, 13, 12, 11, 17, 18, 18, 16, 13, 12, + 10, 9, 15, 16, 16, 14, 12, 11, 9, 9, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 30, 29, 27, 25, 23, 21, 19, 17, 16, 14, 13, 33, 32, + 32, 32, 32, 30, 29, 28, 26, 24, 22, 20, 18, 17, 15, 13, 33, 32, 32, 32, + 32, 31, 30, 28, 27, 25, 23, 21, 19, 17, 16, 14, 33, 32, 32, 31, 30, 29, + 28, 27, 26, 24, 23, 20, 19, 17, 16, 14, 32, 32, 32, 30, 29, 28, 27, 26, + 25, 24, 22, 21, 19, 18, 16, 15, 30, 30, 31, 29, 28, 26, 24, 23, 22, 21, + 20, 19, 18, 16, 15, 14, 29, 29, 30, 28, 27, 24, 22, 21, 20, 19, 19, 17, + 17, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 19, 18, 17, 16, 15, 14, + 13, 12, 25, 26, 27, 26, 25, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 12, + 23, 24, 25, 24, 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 21, 22, + 23, 23, 22, 20, 19, 17, 16, 15, 14, 13, 13, 12, 11, 11, 19, 20, 21, 20, + 21, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 17, 18, 19, 19, 19, 18, + 17, 15, 14, 13, 13, 12, 11, 10, 10, 9, 16, 17, 17, 17, 18, 16, 15, 14, + 14, 13, 12, 11, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, + 11, 11, 10, 9, 9, 8, 13, 13, 14, 14, 15, 14, 13, 12, 12, 11, 11, 10, 9, + 9, 8, 8, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 17, 17, 16, 15, 14, 14, 13, 13, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, + 20, 19, 18, 17, 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, 20, 19, 18, 17, + 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 28, 27, 26, 25, 24, 23, 23, 21, 20, 19, 19, 18, 17, 17, 16, 16, + 14, 14, 33, 32, 
32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 28, 28, + 27, 25, 25, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 28, 27, 26, 25, 24, 23, + 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 31, + 31, 31, 30, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 20, 20, + 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 21, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 27, 27, + 26, 26, 25, 24, 24, 22, 22, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 15, + 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 25, 25, 24, + 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, 30, 31, + 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, + 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 30, 30, 30, 31, 31, 30, 29, 29, + 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 29, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, + 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, 15, 15, 14, 14, + 13, 13, 28, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, 22, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, + 28, 28, 28, 28, 27, 27, 26, 26, 23, 23, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 26, 27, 27, 27, 28, 27, + 26, 26, 26, 25, 23, 23, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 25, 26, 26, 26, 27, 26, 26, 26, 25, 25, + 22, 22, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, + 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 22, 22, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, + 23, 24, 24, 24, 25, 24, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 21, 22, 22, 23, + 23, 23, 23, 23, 
22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 23, + 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 11, 11, 11, 11, 19, 20, 20, 21, 21, 21, 21, 21, 21, 21, 19, 19, + 18, 17, 17, 16, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 18, 19, + 19, 19, 20, 20, 20, 20, 20, 20, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 17, 18, 18, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 9, 9, 17, 17, 17, 18, 18, 18, 18, 18, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 15, 16, + 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16, + 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, + 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14, + 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, + 9, 9, 9, 9, 8, 8, 8, 8, + /* Size 4x8 */ + 32, 32, 31, 29, 25, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 24, 24, + 23, 19, 16, 14, 13, 12, 17, 17, 18, 15, 13, 11, 10, 9, + /* Size 8x4 */ + 32, 30, 24, 17, 32, 30, 24, 17, 31, 28, 23, 18, 29, 24, 19, 15, 25, 21, + 16, 13, 21, 19, 14, 11, 18, 17, 13, 10, 16, 15, 12, 9, + /* Size 8x16 */ + 32, 33, 33, 32, 32, 30, 29, 27, 25, 23, 21, 19, 18, 
16, 14, 13, 33, 32, + 32, 32, 31, 30, 30, 28, 26, 24, 23, 21, 19, 17, 16, 14, 32, 32, 31, 30, + 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 16, 15, 28, 29, 30, 28, 27, 24, + 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 23, 24, 25, 24, 24, 21, 20, 18, + 17, 16, 15, 14, 14, 13, 12, 11, 19, 20, 21, 20, 21, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 17, 17, 18, 18, 18, 17, 16, 15, 14, 13, 12, 11, + 11, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, 11, 10, 10, 9, 9, + 8, + /* Size 16x8 */ + 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32, + 31, 30, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, 18, 16, 32, 31, 29, 27, + 24, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, + 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 25, 26, 25, 20, 17, 15, 14, 13, + 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, + 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 11, 10, 16, 17, 18, 15, + 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 13, 14, 15, 13, 11, 10, 9, + 8, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 27, 26, 25, 24, 23, 23, 21, + 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, 21, 20, 19, 18, + 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, + 28, 28, 27, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 17, 17, 16, 16, + 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, + 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, + 30, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 28, 29, 29, 29, 30, 29, + 28, 28, 27, 26, 24, 24, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, 28, 28, 28, 28, 27, 27, 26, 26, + 23, 
23, 21, 21, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, + 23, 24, 24, 24, 25, 24, 24, 24, 24, 23, 21, 21, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 19, 20, 20, 20, + 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, + 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 17, 17, + 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, + 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, + 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, + 8, 8, + /* Size 32x16 */ + 32, 33, 33, 32, 32, 30, 28, 27, 23, 23, 19, 19, 17, 16, 14, 13, 33, 32, + 32, 32, 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, + 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, 32, 31, + 29, 28, 25, 24, 20, 20, 18, 17, 15, 14, 33, 32, 32, 32, 31, 31, 30, 28, + 25, 25, 21, 21, 18, 17, 16, 14, 33, 32, 32, 31, 31, 30, 29, 28, 25, 24, + 21, 21, 18, 17, 16, 14, 32, 32, 32, 31, 30, 29, 28, 27, 24, 24, 20, 20, + 18, 17, 16, 14, 32, 32, 32, 30, 30, 29, 28, 27, 24, 24, 21, 21, 18, 17, + 16, 15, 32, 32, 31, 30, 29, 28, 27, 26, 24, 24, 21, 21, 18, 18, 16, 15, + 32, 31, 31, 30, 29, 28, 26, 26, 24, 23, 20, 20, 18, 18, 16, 15, 30, 30, + 30, 28, 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 30, 30, 30, 28, + 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 29, 30, 30, 28, 27, 24, + 22, 21, 20, 19, 17, 17, 16, 15, 14, 13, 28, 29, 
30, 28, 27, 24, 21, 21, + 19, 19, 17, 17, 16, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 18, 18, + 16, 16, 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 20, 18, 18, 16, 16, + 14, 14, 13, 12, 25, 26, 26, 25, 25, 22, 20, 19, 17, 17, 15, 15, 14, 13, + 13, 12, 23, 25, 25, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, + 23, 24, 24, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, 21, 23, + 23, 22, 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 21, 23, 23, 22, + 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 19, 21, 21, 21, 21, 19, + 17, 17, 14, 14, 13, 13, 12, 11, 10, 10, 19, 20, 21, 20, 20, 19, 17, 16, + 14, 14, 12, 12, 11, 11, 10, 10, 18, 19, 20, 20, 20, 18, 17, 16, 14, 14, + 12, 12, 11, 11, 10, 9, 18, 19, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, + 11, 10, 10, 9, 17, 18, 18, 18, 18, 17, 16, 15, 13, 13, 12, 12, 10, 10, + 9, 9, 16, 17, 17, 17, 18, 16, 15, 14, 13, 13, 11, 11, 10, 10, 9, 9, 15, + 17, 17, 17, 17, 16, 15, 14, 13, 12, 11, 11, 10, 10, 9, 9, 14, 16, 16, + 16, 16, 15, 14, 13, 12, 12, 11, 11, 9, 9, 9, 8, 14, 16, 16, 16, 16, 15, + 14, 13, 12, 12, 10, 10, 9, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, + 11, 10, 10, 9, 9, 8, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 10, + 9, 9, 8, 8, + /* Size 4x16 */ + 33, 32, 32, 32, 32, 30, 30, 28, 26, 24, 23, 20, 19, 17, 16, 14, 30, 30, + 31, 29, 28, 26, 24, 23, 22, 21, 20, 19, 18, 16, 15, 14, 23, 24, 25, 24, + 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 16, 17, 17, 17, 18, 16, + 15, 14, 13, 13, 12, 11, 10, 10, 9, 9, + /* Size 16x4 */ + 33, 30, 23, 16, 32, 30, 24, 17, 32, 31, 25, 17, 32, 29, 24, 17, 32, 28, + 24, 18, 30, 26, 21, 16, 30, 24, 19, 15, 28, 23, 18, 14, 26, 22, 17, 13, + 24, 21, 16, 13, 23, 20, 15, 12, 20, 19, 14, 11, 19, 18, 13, 10, 17, 16, + 13, 10, 16, 15, 12, 9, 14, 14, 11, 9, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 26, 
25, 24, 23, 23, 21, + 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, + 29, 29, 28, 28, 27, 27, 26, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, + 18, 17, 16, 16, 15, 15, 28, 29, 29, 29, 30, 29, 28, 28, 27, 26, 24, 24, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, 18, 18, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 19, 20, + 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, + 18, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, 16, 16, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 8, 8, + /* Size 32x8 */ + 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32, + 32, 29, 24, 20, 17, 15, 33, 32, 32, 29, 25, 20, 18, 15, 33, 32, 31, 30, + 25, 21, 18, 16, 33, 32, 31, 29, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, + 18, 16, 32, 32, 30, 28, 24, 21, 18, 16, 32, 31, 29, 27, 24, 21, 18, 16, + 32, 31, 29, 26, 24, 20, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 30, 30, + 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, 16, 14, 28, 30, 27, 21, + 19, 17, 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 26, 28, 26, 20, 18, 16, + 14, 13, 25, 26, 25, 20, 17, 15, 14, 13, 23, 25, 24, 19, 16, 14, 13, 12, + 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 21, 23, + 22, 18, 15, 13, 12, 11, 19, 21, 21, 17, 14, 13, 12, 10, 19, 21, 20, 17, + 14, 12, 11, 10, 18, 20, 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, + 11, 10, 17, 18, 18, 16, 13, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, + 15, 17, 17, 15, 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 14, 16, 16, + 14, 12, 10, 9, 9, 13, 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, + 9, 8 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 24, 22, 19, 24, 21, 20, 19, 22, 20, 17, 15, 19, 19, 15, 
13, + /* Size 8x8 */ + 33, 32, 27, 21, 22, 20, 19, 18, 32, 29, 24, 22, 23, 22, 20, 19, 27, 24, + 22, 21, 23, 22, 21, 20, 21, 22, 21, 19, 19, 19, 18, 18, 22, 23, 23, 19, + 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 15, 14, 19, 20, 21, 18, 16, 15, + 14, 13, 18, 19, 20, 18, 16, 14, 13, 12, + /* Size 16x16 */ + 32, 33, 34, 31, 28, 25, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33, + 33, 30, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 18, 17, 34, 33, 32, 29, + 26, 24, 23, 22, 23, 23, 22, 22, 21, 20, 19, 18, 31, 30, 29, 26, 24, 23, + 22, 22, 22, 23, 22, 22, 21, 20, 19, 18, 28, 27, 26, 24, 22, 22, 22, 22, + 22, 23, 22, 22, 21, 20, 20, 19, 25, 24, 24, 23, 22, 21, 20, 20, 21, 21, + 20, 20, 20, 19, 19, 18, 22, 22, 23, 22, 22, 20, 20, 20, 20, 20, 19, 19, + 19, 18, 18, 17, 21, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, + 21, 22, 23, 23, 23, 21, 20, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 21, + 22, 22, 22, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 20, 20, 22, 22, + 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 19, 20, 21, 21, 21, 20, + 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 18, 19, 20, 20, 20, 19, 18, 18, + 17, 16, 15, 15, 14, 13, 13, 12, 17, 18, 19, 19, 20, 19, 18, 17, 16, 16, + 15, 14, 14, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 17, 16, 15, 14, 14, + 13, 12, 12, 12, + /* Size 32x32 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 25, 25, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 33, 33, 33, 33, + 33, 32, 30, 29, 27, 27, 24, 24, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 33, 33, 33, 33, 33, 31, 30, 29, + 27, 26, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 34, 33, 33, 33, 33, 31, 29, 28, 26, 26, 24, 24, + 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, + 18, 18, 34, 33, 33, 33, 32, 31, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, + 23, 23, 23, 22, 22, 22, 22, 
21, 21, 20, 20, 20, 19, 19, 18, 18, 32, 32, + 31, 31, 31, 29, 28, 27, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 31, 30, 30, 29, 29, 28, + 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 19, 19, 18, 18, 30, 29, 29, 28, 28, 27, 26, 25, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 19, 19, 19, 19, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 21, + 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, + 28, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, + 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 25, 24, 24, 24, + 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 25, 24, 24, 24, 24, 24, 23, 23, + 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 18, 18, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, + 17, 17, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 23, 22, + 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, + 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, + 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 21, 22, 22, 22, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 21, 21, 22, + 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 15, 15, 15, 
15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 15, 15, 15, 15, 14, 14, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, + 14, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 19, 20, + 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 19, 20, 20, 20, 21, 21, + 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, + 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, + 20, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, + 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, + 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, + 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, 18, 19, + 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, + 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, + 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, + /* Size 4x8 */ + 33, 31, 26, 22, 23, 21, 20, 19, 24, 23, 22, 20, 21, 20, 20, 19, 22, 23, + 22, 19, 17, 17, 16, 16, 19, 20, 20, 18, 16, 15, 14, 13, + /* Size 8x4 */ + 33, 24, 22, 19, 31, 23, 23, 20, 26, 22, 22, 20, 22, 20, 19, 18, 23, 21, + 17, 16, 21, 20, 17, 15, 20, 20, 16, 14, 19, 19, 16, 13, + /* Size 8x16 */ + 32, 33, 34, 31, 28, 24, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33, + 32, 28, 26, 24, 
22, 22, 23, 23, 22, 21, 20, 20, 19, 18, 28, 27, 26, 24, + 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 21, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 23, 22, 23, 21, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 18, 19, 20, 20, 21, 19, 19, 18, 17, 16, 16, 15, + 14, 14, 13, 13, 17, 18, 19, 19, 20, 18, 18, 17, 16, 16, 15, 14, 13, 13, + 12, 12, + /* Size 16x8 */ + 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 34, 32, + 26, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, 20, 19, 28, 26, 22, 22, + 23, 22, 21, 20, 24, 24, 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, + 19, 18, 21, 22, 22, 19, 19, 18, 18, 17, 21, 23, 22, 19, 18, 17, 17, 16, + 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 21, + 22, 19, 17, 16, 15, 14, 19, 20, 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, + 16, 15, 14, 13, 17, 19, 20, 18, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, + 13, 12, + /* Size 16x32 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32, + 32, 31, 29, 28, 26, 26, 24, 24, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 33, 33, 33, 32, 32, 30, 28, 28, + 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, + 20, 19, 19, 19, 18, 18, 29, 28, 28, 28, 28, 26, 26, 24, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, + 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 24, 24, + 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 21, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 20, 20, 20, 19, 19, 19, 19, 19, 
19, 19, 19, 18, 18, 18, 18, 18, 18, 17, + 17, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, + 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 20, 20, 21, + 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 21, 21, 21, 22, 22, + 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, 19, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, + 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, + 18, 18, 19, 19, 19, 19, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, + 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 13, 12, 12, 12, 12, 12, 12, + /* Size 32x16 */ + 32, 33, 33, 29, 28, 24, 21, 21, 21, 21, 20, 20, 18, 18, 17, 16, 33, 33, + 33, 28, 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 33, 33, 33, 28, + 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 34, 32, 32, 28, 26, 24, + 22, 22, 22, 22, 21, 21, 20, 20, 18, 18, 34, 32, 32, 28, 26, 24, 22, 22, + 23, 23, 21, 21, 20, 20, 19, 18, 32, 31, 30, 26, 25, 23, 22, 22, 23, 23, + 21, 21, 20, 20, 19, 18, 31, 29, 28, 26, 24, 23, 22, 22, 22, 22, 22, 22, + 20, 20, 19, 18, 30, 28, 28, 24, 23, 23, 22, 22, 23, 22, 22, 22, 20, 20, + 19, 19, 28, 26, 26, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 19, + 28, 26, 26, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 24, 24, + 24, 22, 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 24, 24, 24, 22, + 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 22, 
22, 21, 20, 19, 19, + 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 19, 19, + 18, 18, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, + 17, 17, 17, 16, 21, 22, 23, 22, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, + 16, 16, 21, 23, 23, 23, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, + 21, 22, 23, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 22, + 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 22, 22, 22, + 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 21, 21, 22, 22, 20, + 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 21, 21, 22, 22, 20, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 14, 19, 20, 21, 21, 21, 20, 19, 18, 17, 17, + 15, 15, 14, 14, 14, 13, 19, 20, 20, 21, 21, 20, 19, 18, 17, 16, 15, 15, + 14, 14, 13, 13, 19, 20, 20, 20, 21, 20, 18, 18, 16, 16, 15, 15, 14, 14, + 13, 13, 18, 20, 20, 20, 20, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12, + 18, 19, 19, 20, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 13, 12, 17, 19, + 19, 19, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 17, 19, 19, 19, + 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 16, 18, 18, 18, 19, 18, + 17, 17, 15, 15, 14, 14, 13, 12, 12, 12, 16, 18, 18, 18, 19, 18, 17, 17, + 15, 15, 14, 14, 13, 12, 12, 12, + /* Size 4x16 */ + 33, 33, 32, 29, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 18, 24, 24, + 24, 23, 22, 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 18, 21, 22, 23, 22, + 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 18, 19, 20, 20, 20, 19, + 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, + /* Size 16x4 */ + 33, 24, 21, 18, 33, 24, 22, 19, 32, 24, 23, 20, 29, 23, 22, 20, 26, 22, + 22, 20, 24, 21, 21, 19, 22, 20, 19, 18, 22, 20, 19, 18, 22, 21, 18, 17, + 22, 21, 17, 16, 22, 20, 17, 15, 21, 20, 17, 14, 20, 20, 16, 14, 20, 19, + 16, 13, 19, 19, 16, 13, 18, 18, 15, 12, + /* Size 8x32 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32, + 32, 30, 28, 28, 26, 26, 24, 
24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, + 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, + 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, + 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 20, 20, + 20, 21, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, + 20, 20, 21, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 13, 13, 13, 17, 18, 18, 18, 19, 19, 19, 19, 20, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, + 12, 12, 12, 12, + /* Size 32x8 */ + 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 33, 33, + 27, 22, 22, 20, 19, 18, 34, 32, 26, 22, 22, 21, 20, 18, 34, 32, 26, 22, + 23, 21, 20, 19, 32, 30, 25, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, + 20, 19, 30, 28, 23, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 21, 20, + 28, 26, 22, 21, 22, 22, 21, 19, 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, + 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 19, + 19, 19, 18, 17, 21, 22, 22, 19, 19, 18, 18, 17, 21, 22, 22, 19, 18, 18, + 17, 17, 21, 23, 22, 19, 18, 17, 17, 16, 21, 23, 22, 19, 18, 17, 16, 16, + 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 22, + 22, 19, 17, 16, 16, 15, 20, 21, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, + 17, 16, 15, 14, 19, 21, 21, 19, 17, 15, 14, 14, 19, 20, 21, 19, 17, 15, + 14, 13, 19, 20, 21, 18, 16, 15, 14, 13, 18, 20, 20, 18, 16, 15, 14, 13, + 18, 19, 20, 18, 16, 14, 13, 13, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, + 19, 17, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 13, 12, 16, 18, 19, 17, + 15, 14, 13, 12 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 
31, 24, 19, 31, 27, 22, 18, 24, 22, 16, 14, 19, 18, 14, 11, + /* Size 8x8 */ + 33, 32, 32, 30, 27, 22, 20, 16, 32, 32, 32, 30, 28, 23, 21, 17, 32, 32, + 29, 28, 26, 23, 21, 18, 30, 30, 28, 24, 22, 20, 18, 16, 27, 28, 26, 22, + 19, 17, 16, 14, 22, 23, 23, 20, 17, 15, 14, 12, 20, 21, 21, 18, 16, 14, + 12, 11, 16, 17, 18, 16, 14, 12, 11, 10, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32, + 32, 32, 32, 32, 30, 29, 27, 26, 24, 22, 20, 19, 18, 17, 33, 32, 32, 32, + 32, 32, 31, 30, 28, 27, 25, 23, 21, 19, 18, 17, 33, 32, 32, 31, 31, 31, + 29, 28, 27, 26, 24, 23, 21, 19, 18, 17, 32, 32, 32, 31, 30, 30, 28, 28, + 26, 26, 24, 23, 21, 19, 19, 17, 32, 32, 32, 31, 30, 29, 28, 27, 26, 25, + 24, 22, 21, 20, 19, 18, 30, 30, 31, 29, 28, 28, 26, 24, 23, 22, 22, 20, + 19, 18, 17, 16, 28, 29, 30, 28, 28, 27, 24, 21, 20, 20, 19, 18, 17, 16, + 16, 15, 27, 27, 28, 27, 26, 26, 23, 20, 20, 19, 18, 17, 16, 15, 15, 14, + 25, 26, 27, 26, 26, 25, 22, 20, 19, 18, 17, 16, 15, 15, 14, 14, 23, 24, + 25, 24, 24, 24, 22, 19, 18, 17, 16, 15, 14, 14, 13, 13, 21, 22, 23, 23, + 23, 22, 20, 18, 17, 16, 15, 14, 13, 13, 12, 12, 19, 20, 21, 21, 21, 21, + 19, 17, 16, 15, 14, 13, 12, 12, 12, 11, 18, 19, 19, 19, 19, 20, 18, 16, + 15, 15, 14, 13, 12, 11, 11, 11, 17, 18, 18, 18, 19, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 16, 17, 17, 17, 17, 18, 16, 15, 14, 14, 13, 12, + 11, 11, 10, 10, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 22, + 22, 21, 20, 20, 18, 18, 17, 16, 16, 15, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 23, 22, 21, 20, 20, + 19, 18, 18, 17, 17, 15, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17, + 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 31, 31, 30, 30, 30, + 28, 28, 27, 25, 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, 27, 25, + 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, + 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, + 30, 29, 29, 28, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, + 18, 17, 17, 16, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, + 28, 28, 26, 26, 26, 24, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 16, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, + 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 32, 32, 32, 32, + 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, + 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 31, 31, 31, 31, 31, 31, 30, 29, + 29, 28, 28, 27, 26, 26, 24, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, + 18, 18, 17, 17, 17, 16, 30, 30, 30, 30, 31, 31, 29, 29, 28, 28, 28, 26, + 26, 25, 24, 24, 23, 23, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 16, + 16, 15, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 23, 23, + 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 28, 29, + 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 28, 29, 29, 29, 30, 30, + 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 14, 27, 27, 27, 28, 28, 28, 27, 27, 26, 26, + 26, 24, 23, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, + 15, 14, 14, 13, 26, 27, 27, 27, 28, 28, 26, 26, 26, 26, 26, 23, 23, 22, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 25, 26, 26, 26, 27, 27, 26, 26, 26, 25, 25, 23, 22, 21, 20, 20, 19, 19, + 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 23, 24, 24, 24, + 25, 25, 24, 24, 24, 24, 24, 22, 22, 20, 19, 
19, 18, 18, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 24, 25, 25, 24, 24, + 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, + 14, 14, 13, 13, 13, 12, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 21, + 20, 20, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 21, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 18, + 17, 17, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 20, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, + 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 11, 11, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, + 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, + 12, 11, 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, + 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, + 17, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 18, 18, + 18, 18, 18, 18, 19, 19, 19, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 16, 16, 17, 17, 17, 17, 17, 17, + 17, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 9, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 16, 15, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, + /* Size 4x8 */ + 32, 32, 32, 30, 27, 23, 20, 17, 32, 31, 29, 28, 26, 23, 20, 18, 24, 25, + 24, 20, 18, 16, 14, 13, 18, 19, 20, 17, 15, 13, 12, 11, + /* Size 8x4 */ + 32, 32, 24, 18, 32, 31, 25, 19, 32, 29, 24, 20, 30, 28, 20, 17, 27, 26, + 18, 15, 23, 23, 16, 13, 20, 20, 14, 12, 17, 18, 13, 11, + /* Size 8x16 */ + 32, 33, 33, 33, 32, 32, 30, 28, 27, 
25, 23, 21, 19, 18, 17, 16, 33, 32, + 32, 32, 32, 31, 30, 30, 28, 26, 25, 23, 21, 19, 18, 17, 32, 32, 31, 30, + 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 29, 29, 30, 29, 28, 28, + 25, 22, 22, 21, 20, 19, 18, 17, 16, 16, 26, 27, 28, 27, 26, 26, 23, 20, + 20, 19, 18, 17, 16, 15, 15, 14, 23, 24, 25, 24, 24, 24, 21, 19, 18, 17, + 16, 15, 14, 14, 13, 13, 19, 20, 21, 21, 21, 21, 19, 17, 16, 15, 14, 13, + 12, 12, 12, 11, 16, 17, 17, 17, 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, + 10, 10, + /* Size 16x8 */ + 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 17, 33, 32, + 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, 21, 17, 32, 32, 30, 28, + 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 30, 30, 28, 25, 23, 21, + 19, 16, 28, 30, 27, 22, 20, 19, 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, + 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 21, 23, + 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, + 15, 14, 12, 11, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, 18, 16, 14, 13, + 11, 10, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 27, 26, 25, 25, 23, + 23, 22, 20, 20, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23, 23, 22, 21, 21, + 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, + 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17, + 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, + 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 32, 32, + 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, + 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, + 29, 28, 28, 28, 28, 26, 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, + 18, 18, 17, 17, 16, 16, 16, 15, 28, 29, 29, 29, 30, 
30, 28, 28, 28, 27, + 27, 25, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 14, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, + 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 25, + 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 20, 21, 21, 21, 22, 22, 21, 21, + 21, 21, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, 21, 19, + 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, + 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, 16, 16, + 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 16, 16, + 17, 17, 17, 17, 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, + 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 9, + /* Size 32x16 */ + 32, 33, 33, 33, 32, 32, 29, 28, 26, 23, 23, 20, 19, 18, 16, 16, 33, 32, + 32, 32, 32, 32, 29, 29, 27, 24, 24, 21, 20, 18, 16, 16, 33, 32, 32, 32, + 32, 32, 29, 29, 27, 24, 24, 21, 20, 19, 17, 17, 33, 32, 32, 32, 32, 32, + 30, 29, 28, 25, 25, 21, 20, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, + 28, 25, 25, 22, 21, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, 28, 25, + 25, 22, 21, 19, 17, 17, 33, 32, 32, 31, 30, 30, 29, 28, 27, 24, 24, 21, + 21, 19, 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 21, 20, 19, + 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 26, 24, 24, 21, 21, 19, 18, 18, + 32, 32, 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 32, 32, + 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 31, 31, 31, 29, + 28, 28, 26, 25, 24, 22, 22, 20, 19, 18, 17, 17, 30, 30, 30, 29, 28, 28, + 
25, 24, 23, 21, 21, 19, 19, 18, 16, 16, 30, 30, 30, 29, 28, 28, 24, 23, + 22, 20, 20, 19, 18, 17, 16, 16, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, + 19, 18, 17, 16, 15, 15, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, 19, 18, + 17, 16, 15, 15, 27, 28, 28, 27, 26, 26, 22, 20, 20, 18, 18, 17, 16, 15, + 14, 14, 26, 27, 28, 26, 26, 26, 21, 20, 19, 18, 18, 16, 16, 15, 14, 14, + 25, 26, 26, 26, 25, 25, 21, 20, 19, 17, 17, 16, 15, 15, 13, 13, 23, 25, + 25, 24, 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 23, 25, 25, 24, + 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 22, 23, 23, 23, 23, 23, + 19, 18, 17, 16, 16, 14, 14, 13, 12, 12, 21, 23, 23, 23, 22, 22, 19, 18, + 17, 15, 15, 14, 13, 13, 12, 12, 20, 22, 22, 22, 22, 22, 19, 18, 17, 15, + 15, 13, 13, 12, 12, 12, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, + 12, 12, 11, 11, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, + 11, 11, 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, + 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 10, 10, 17, 18, + 18, 18, 18, 18, 16, 16, 15, 13, 13, 12, 12, 11, 10, 10, 16, 17, 17, 17, + 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 18, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 15, 16, 16, 16, 17, 17, 15, 14, + 13, 12, 12, 11, 11, 10, 9, 9, + /* Size 4x16 */ + 33, 32, 32, 32, 32, 32, 30, 29, 28, 26, 25, 23, 20, 19, 18, 17, 32, 32, + 31, 30, 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 23, 24, 25, 24, + 24, 24, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 18, 19, 19, 19, 19, 20, + 18, 16, 15, 15, 14, 13, 12, 11, 11, 11, + /* Size 16x4 */ + 33, 32, 23, 18, 32, 32, 24, 19, 32, 31, 25, 19, 32, 30, 24, 19, 32, 30, + 24, 19, 32, 29, 24, 20, 30, 28, 21, 18, 29, 27, 19, 16, 28, 26, 18, 15, + 26, 25, 17, 15, 25, 24, 16, 14, 23, 22, 15, 13, 20, 20, 14, 12, 19, 19, + 14, 11, 18, 18, 13, 11, 17, 18, 13, 11, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 
15, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23, + 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, + 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, + 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 23, 24, + 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, + 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 19, 20, 20, 20, 21, 21, + 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, + 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 9, + /* Size 32x8 */ + 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 16, 33, 32, + 32, 29, 27, 24, 20, 17, 33, 32, 32, 30, 28, 25, 20, 17, 33, 32, 31, 30, + 28, 25, 21, 17, 33, 32, 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, + 21, 17, 32, 32, 30, 28, 27, 24, 20, 17, 32, 32, 30, 28, 26, 24, 21, 18, + 32, 31, 29, 28, 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 31, 31, + 28, 26, 24, 22, 19, 17, 30, 30, 28, 25, 23, 21, 19, 16, 30, 30, 28, 24, + 22, 20, 18, 16, 28, 30, 27, 22, 20, 19, 17, 15, 28, 30, 27, 22, 20, 19, + 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, 26, 28, 26, 21, 19, 18, 16, 14, + 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 23, 25, + 24, 20, 18, 16, 14, 13, 22, 23, 23, 19, 17, 16, 14, 12, 21, 23, 22, 19, + 17, 15, 13, 12, 20, 22, 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, + 12, 11, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, 15, 14, 12, 11, + 18, 19, 19, 17, 15, 14, 12, 10, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, + 18, 16, 14, 13, 11, 10, 16, 17, 18, 16, 14, 13, 11, 10, 15, 16, 17, 15, + 13, 12, 11, 9 }, + 
{ /* Chroma */ + /* Size 4x4 */ + 33, 25, 22, 20, 25, 21, 21, 20, 22, 21, 18, 17, 20, 20, 17, 14, + /* Size 8x8 */ + 33, 33, 27, 23, 22, 21, 20, 19, 33, 32, 26, 23, 23, 22, 22, 20, 27, 26, + 22, 22, 22, 22, 22, 20, 23, 23, 22, 20, 20, 20, 20, 19, 22, 23, 22, 20, + 19, 18, 18, 17, 21, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, 18, 16, + 16, 15, 19, 20, 20, 19, 17, 16, 15, 13, + /* Size 16x16 */ + 32, 33, 34, 31, 30, 28, 25, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33, + 33, 30, 28, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 19, 34, 33, 32, 30, + 28, 26, 24, 22, 23, 23, 23, 22, 22, 21, 20, 20, 31, 30, 30, 28, 26, 24, + 23, 22, 22, 22, 23, 22, 22, 21, 20, 20, 30, 28, 28, 26, 24, 23, 22, 22, + 22, 22, 23, 22, 22, 21, 21, 20, 28, 27, 26, 24, 23, 22, 22, 21, 22, 22, + 23, 22, 22, 21, 21, 20, 25, 24, 24, 23, 22, 22, 21, 20, 20, 21, 21, 20, + 20, 20, 20, 19, 21, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 21, 22, 23, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, + 21, 22, 23, 22, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, + 23, 23, 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, + 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 15, 20, 20, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 19, 20, 21, 21, 21, 21, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 14, 19, 19, 20, 20, 21, 21, 20, 18, 18, 17, + 16, 16, 15, 14, 14, 14, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, + 15, 14, 14, 13, + /* Size 32x32 */ + 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33, + 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 22, 22, 22, 22, 22, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 30, + 28, 27, 27, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, + 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 29, 28, 26, 26, 25, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 19, 19, 
34, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, + 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 34, 33, + 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 31, 31, 30, 30, 30, 30, + 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, + 22, 22, 21, 21, 20, 20, 20, 19, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, + 20, 20, 20, 19, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, + 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, 27, 26, + 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 26, 25, 25, 25, 24, 24, 23, 23, + 23, 22, 22, 21, 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 20, 19, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 21, + 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 19, + 19, 19, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, + 18, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, + 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 21, 22, 22, 22, + 23, 23, 
23, 23, 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 17, 16, 16, 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, + 16, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 20, 20, 20, 21, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 15, 15, 15, 15, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 15, 15, 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, + 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 19, 19, 19, 20, + 20, 20, 20, 20, 21, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 18, 19, 19, 19, 20, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, + 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, + 13, 13, 17, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, + /* Size 4x8 */ + 33, 32, 26, 23, 22, 22, 21, 19, 27, 26, 22, 22, 22, 22, 22, 20, 22, 23, + 23, 20, 18, 17, 17, 16, 20, 21, 21, 19, 18, 16, 15, 14, + /* Size 8x4 */ + 33, 27, 22, 20, 32, 26, 23, 21, 26, 22, 23, 21, 23, 22, 20, 19, 22, 22, + 18, 18, 22, 22, 17, 16, 21, 22, 17, 15, 19, 20, 16, 14, + /* Size 8x16 */ 
+ 32, 33, 34, 31, 29, 28, 24, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33, + 32, 29, 28, 26, 24, 22, 22, 23, 23, 22, 21, 21, 20, 20, 28, 27, 26, 24, + 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 23, 23, 23, 22, 22, 22, + 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, 23, 22, 22, 22, 20, 19, + 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 23, 23, 23, 23, 21, 19, 19, 18, + 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, + 16, 15, 15, 15, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 14, + 14, 13, + /* Size 16x8 */ + 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 34, 32, + 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, 22, 20, 29, 28, 23, 22, + 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 24, 24, 22, 21, 20, 21, + 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, + 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 20, 22, + 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, + 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, 20, 19, 17, 16, + 15, 13, + /* Size 16x32 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33, + 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 33, 33, 33, 32, 32, 32, 29, 28, + 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, + 21, 20, 20, 20, 20, 19, 31, 30, 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, + 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, + 20, 19, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, + 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 
19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, + 21, 22, 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, + 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 15, 15, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 20, + 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, + 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 18, 19, + 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, + 14, 14, 14, 14, 14, 13, 13, 13, + /* Size 32x16 */ + 32, 33, 33, 31, 28, 28, 23, 21, 21, 21, 21, 20, 20, 19, 18, 18, 33, 33, + 33, 30, 27, 27, 23, 22, 22, 22, 22, 20, 20, 20, 19, 19, 33, 33, 33, 30, + 27, 27, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 33, 33, 32, 30, 26, 26, + 23, 22, 22, 22, 22, 21, 21, 20, 19, 19, 34, 32, 32, 29, 26, 26, 23, 22, + 23, 23, 23, 22, 21, 21, 20, 20, 34, 32, 32, 29, 26, 26, 23, 22, 23, 23, + 23, 22, 21, 21, 20, 20, 31, 30, 29, 28, 24, 24, 22, 22, 22, 23, 23, 22, + 22, 21, 20, 20, 31, 29, 28, 27, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, + 20, 20, 29, 28, 28, 26, 23, 23, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, + 28, 26, 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 28, 26, + 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 25, 24, 24, 23, + 22, 22, 21, 21, 21, 21, 21, 21, 20, 
20, 20, 20, 24, 24, 24, 23, 22, 22, + 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 23, 23, 23, 23, 22, 22, 20, 20, + 20, 20, 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, 18, 17, 17, + 21, 22, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 21, 22, + 23, 23, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 21, 22, 23, 23, + 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 22, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 22, 22, 20, 19, + 18, 17, 17, 16, 16, 16, 15, 15, 20, 21, 22, 22, 22, 22, 20, 19, 18, 17, + 17, 16, 16, 16, 15, 15, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, + 16, 15, 14, 14, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, 16, 15, + 14, 14, 19, 20, 21, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, + 19, 20, 20, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, 19, 20, + 20, 20, 21, 21, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, + 20, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 18, 19, 20, 20, 20, 20, + 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 17, 19, 19, 19, 20, 20, 18, 18, + 17, 16, 16, 15, 14, 14, 13, 13, + /* Size 4x16 */ + 33, 33, 32, 30, 28, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 28, 27, + 26, 24, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 21, 22, 23, 23, + 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 19, 20, 21, 21, 21, 21, + 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, + /* Size 16x4 */ + 33, 28, 21, 19, 33, 27, 22, 20, 32, 26, 23, 21, 30, 24, 23, 21, 28, 23, + 23, 21, 26, 22, 23, 21, 24, 22, 21, 20, 22, 21, 19, 19, 22, 22, 19, 18, + 22, 22, 18, 17, 22, 22, 18, 17, 22, 22, 17, 16, 21, 22, 17, 15, 20, 21, + 17, 15, 20, 21, 16, 14, 19, 20, 16, 14, + /* Size 8x32 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21, + 21, 21, 21, 
20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 32, + 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, + 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 28, 27, 27, 26, 26, 26, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 21, 22, + 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 20, 21, 21, 21, + 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 15, 15, 15, 15, 14, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, + 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, + 14, 13, 13, 13, + /* Size 32x8 */ + 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 33, 33, + 27, 23, 22, 22, 20, 19, 33, 32, 26, 23, 22, 22, 21, 19, 34, 32, 26, 23, + 23, 23, 21, 20, 34, 32, 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, + 22, 20, 31, 28, 24, 22, 22, 22, 22, 20, 29, 28, 23, 22, 22, 23, 22, 20, + 28, 26, 22, 22, 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 25, 24, + 22, 21, 21, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 19, 23, 23, 22, 20, + 20, 20, 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 20, 19, 19, + 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, 21, 22, 22, 20, 19, 18, 18, 17, + 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 21, 23, + 22, 20, 19, 18, 17, 16, 20, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, + 18, 17, 16, 15, 20, 22, 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, + 16, 14, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, 18, 17, 15, 14, + 19, 20, 21, 19, 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, + 20, 19, 17, 16, 15, 13, 18, 20, 20, 19, 17, 16, 15, 
13, 17, 19, 20, 18, + 17, 16, 14, 13 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 27, 20, 32, 29, 26, 21, 27, 26, 19, 16, 20, 21, 16, 13, + /* Size 8x8 */ + 33, 32, 32, 30, 29, 25, 22, 19, 32, 32, 32, 31, 30, 26, 23, 20, 32, 32, + 30, 29, 28, 25, 23, 20, 30, 31, 29, 26, 24, 22, 20, 19, 29, 30, 28, 24, + 21, 19, 18, 17, 25, 26, 25, 22, 19, 17, 16, 15, 22, 23, 23, 20, 18, 16, + 14, 13, 19, 20, 20, 19, 17, 15, 13, 12, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 33, 32, + 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 33, 32, 32, 32, + 32, 32, 31, 31, 30, 28, 28, 25, 23, 22, 20, 19, 33, 32, 32, 32, 32, 31, + 31, 30, 29, 28, 27, 25, 23, 23, 21, 19, 33, 32, 32, 32, 31, 30, 30, 29, + 28, 27, 26, 24, 23, 22, 20, 19, 32, 32, 32, 31, 30, 29, 28, 28, 27, 26, + 26, 24, 23, 22, 21, 19, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, + 22, 21, 20, 19, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, 22, 20, 20, + 19, 18, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, 16, + 27, 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 26, 27, + 28, 27, 26, 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 23, 24, 25, 25, + 24, 24, 23, 22, 19, 18, 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, + 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 21, 22, 22, 23, 22, 22, 21, 20, + 18, 17, 17, 15, 14, 14, 13, 13, 19, 20, 20, 21, 20, 21, 20, 19, 17, 16, + 16, 14, 14, 13, 12, 12, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, 14, + 13, 13, 12, 11, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 26, 26, 25, + 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, + 22, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 30, 30, 29, 29, 28, 27, 
27, 25, 24, 24, 23, 22, 22, 20, 20, 20, + 18, 18, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, + 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, 22, 21, 20, 20, 19, 19, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 28, 28, + 28, 26, 25, 25, 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, + 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, + 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 28, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 20, 20, 19, 19, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, + 32, 32, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, + 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, 24, 24, 23, 22, + 22, 21, 21, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, + 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, + 19, 19, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, + 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 30, 30, + 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, + 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 29, 30, 30, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 26, 25, 25, 24, 23, 23, 22, 22, 22, 21, 20, 20, + 19, 19, 19, 18, 18, 18, 17, 17, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, + 27, 27, 26, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 16, 16, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, 27, 27, 26, 24, + 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 20, 20, + 20, 20, 20, 19, 18, 18, 18, 17, 
17, 17, 16, 16, 15, 15, 26, 26, 27, 27, + 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 26, 26, 27, 27, 28, 28, 27, 26, + 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 24, 25, 25, 25, 26, 26, 26, 25, 25, 25, 24, 24, + 23, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, + 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 22, 22, 20, + 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, + 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, + 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 14, 14, 14, + 13, 13, 13, 13, 21, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, + 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, + 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, + 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 19, 20, 20, 20, + 20, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 19, 19, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 11, 11, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, + /* Size 4x8 */ + 32, 32, 32, 30, 29, 26, 22, 20, 32, 31, 30, 28, 27, 24, 22, 20, 28, 28, + 27, 23, 21, 19, 17, 16, 20, 21, 21, 19, 17, 15, 13, 12, + /* Size 8x4 */ + 32, 32, 28, 20, 32, 31, 28, 21, 32, 30, 27, 21, 30, 28, 23, 19, 29, 27, + 21, 17, 26, 24, 19, 15, 
22, 22, 17, 13, 20, 20, 16, 12, + /* Size 8x16 */ + 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 33, 32, + 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 32, 32, 32, 31, + 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 32, 31, 31, 31, 30, 28, + 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 28, 29, 29, 29, 28, 27, 26, 24, + 21, 21, 20, 19, 18, 18, 17, 16, 23, 24, 25, 25, 24, 24, 23, 21, 19, 18, + 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, + 15, 14, 14, 13, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13, + 12, 12, + /* Size 16x8 */ + 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, + 32, 31, 29, 25, 23, 21, 33, 32, 31, 31, 29, 25, 23, 21, 32, 32, 30, 30, + 28, 24, 23, 20, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, + 22, 20, 30, 30, 28, 27, 24, 21, 20, 19, 28, 30, 28, 26, 21, 19, 18, 17, + 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 23, 25, + 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, 15, 14, 21, 22, 22, 21, + 18, 15, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 18, 19, 19, 19, 16, 14, + 13, 12, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, + 24, 24, 23, 22, 22, 20, 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, + 22, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 23, 23, 22, 21, 21, 20, + 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28, + 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, + 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, + 31, 30, 30, 29, 28, 28, 28, 27, 27, 26, 
26, 26, 25, 24, 24, 23, 23, 23, + 22, 21, 21, 20, 20, 19, 19, 19, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, + 27, 27, 26, 24, 24, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 17, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, + 24, 23, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 21, 21, + 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 23, 24, 24, 24, + 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, + 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, + 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, + 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, + 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 20, + 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, + 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 11, 11, + /* Size 32x16 */ + 32, 33, 33, 33, 32, 32, 32, 29, 28, 27, 23, 23, 22, 19, 19, 17, 33, 32, + 32, 32, 32, 32, 31, 29, 29, 28, 24, 24, 22, 20, 20, 18, 33, 32, 32, 32, + 32, 32, 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, + 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, 31, 30, + 29, 28, 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 32, 31, 31, 30, 30, 28, + 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 31, 31, 31, 29, 29, 28, 25, 25, + 23, 21, 21, 19, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 23, 21, + 21, 19, 32, 32, 32, 31, 30, 30, 30, 28, 28, 27, 24, 24, 23, 20, 20, 19, + 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 24, 24, 23, 21, 21, 19, 32, 32, + 31, 31, 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 
32, 31, 31, + 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 31, 31, 31, 29, 28, + 28, 26, 26, 25, 23, 23, 22, 20, 20, 19, 30, 30, 30, 30, 28, 28, 27, 24, + 24, 23, 21, 21, 20, 19, 19, 18, 30, 30, 30, 30, 28, 28, 27, 24, 24, 23, + 21, 21, 20, 19, 19, 18, 29, 30, 30, 30, 28, 28, 26, 23, 23, 22, 20, 20, + 19, 18, 18, 17, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, + 17, 16, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, 17, 16, + 27, 28, 28, 28, 26, 26, 25, 21, 21, 20, 18, 18, 18, 16, 16, 15, 26, 27, + 28, 27, 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 26, 27, 28, 27, + 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 24, 26, 26, 26, 24, 24, + 23, 20, 20, 19, 17, 17, 16, 15, 15, 14, 23, 24, 25, 25, 24, 24, 23, 20, + 19, 18, 16, 16, 16, 14, 14, 14, 23, 24, 25, 25, 24, 24, 23, 20, 19, 18, + 16, 16, 16, 14, 14, 13, 22, 23, 23, 23, 23, 23, 22, 19, 18, 18, 16, 16, + 15, 14, 14, 13, 21, 22, 23, 23, 22, 22, 21, 19, 18, 17, 15, 15, 15, 13, + 13, 13, 21, 22, 22, 22, 22, 22, 21, 18, 18, 17, 15, 15, 14, 13, 13, 13, + 19, 20, 21, 21, 21, 21, 20, 18, 17, 17, 14, 14, 14, 13, 13, 12, 19, 20, + 21, 21, 20, 20, 20, 17, 17, 16, 14, 14, 14, 12, 12, 12, 19, 20, 20, 20, + 20, 20, 19, 17, 17, 16, 14, 14, 13, 12, 12, 12, 18, 19, 19, 19, 19, 19, + 19, 17, 16, 15, 14, 14, 13, 12, 12, 11, 18, 19, 19, 19, 19, 19, 19, 17, + 16, 15, 14, 14, 13, 12, 12, 11, + /* Size 4x16 */ + 33, 32, 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 19, 32, 32, + 32, 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 20, 19, 27, 28, 28, 28, + 27, 26, 25, 23, 21, 20, 20, 18, 18, 17, 16, 15, 19, 20, 21, 21, 20, 21, + 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, + /* Size 16x4 */ + 33, 32, 27, 19, 32, 32, 28, 20, 32, 32, 28, 21, 32, 31, 28, 21, 32, 30, + 27, 20, 32, 29, 26, 21, 31, 28, 25, 20, 30, 28, 23, 19, 29, 27, 21, 17, + 28, 26, 20, 16, 27, 26, 20, 16, 24, 24, 18, 14, 23, 23, 18, 14, 22, 22, + 17, 13, 20, 20, 16, 12, 19, 19, 15, 12, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 
32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, + 25, 25, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, + 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, + 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 21, 21, 20, 20, 19, + 19, 19, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23, + 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 23, 24, + 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, + 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, + 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, + 12, 12, 12, 12, + /* Size 32x8 */ + 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 22, 20, 33, 32, + 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, + 29, 25, 23, 21, 33, 32, 32, 31, 30, 25, 23, 21, 33, 32, 31, 31, 29, 25, + 23, 21, 32, 32, 31, 30, 28, 24, 23, 21, 32, 32, 30, 30, 28, 24, 23, 20, + 32, 32, 30, 29, 28, 24, 23, 21, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, + 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, 22, 20, 30, 30, 28, 27, + 24, 21, 20, 19, 30, 30, 28, 27, 24, 21, 20, 19, 29, 30, 28, 26, 23, 20, + 19, 18, 28, 30, 28, 26, 21, 19, 18, 17, 28, 30, 28, 26, 21, 19, 18, 17, + 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 26, 28, + 26, 24, 20, 18, 17, 16, 24, 26, 24, 23, 20, 17, 16, 15, 23, 25, 24, 23, + 19, 16, 16, 14, 23, 25, 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, + 15, 14, 21, 23, 22, 21, 18, 15, 15, 13, 21, 22, 22, 21, 18, 15, 14, 13, + 19, 21, 21, 20, 17, 14, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 19, 20, + 
20, 19, 17, 14, 13, 12, 18, 19, 19, 19, 16, 14, 13, 12, 18, 19, 19, 19, + 16, 14, 13, 12 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 27, 22, 21, 27, 22, 22, 22, 22, 22, 19, 18, 21, 22, 18, 16, + /* Size 8x8 */ + 33, 33, 29, 24, 21, 22, 21, 20, 33, 32, 28, 24, 22, 23, 22, 21, 29, 28, + 25, 23, 22, 23, 22, 21, 24, 24, 23, 21, 20, 21, 20, 20, 21, 22, 22, 20, + 19, 19, 19, 19, 22, 23, 23, 21, 19, 18, 17, 17, 21, 22, 22, 20, 19, 17, + 17, 16, 20, 21, 21, 20, 19, 17, 16, 15, + /* Size 16x16 */ + 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 33, 32, 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 34, 33, 33, 32, + 29, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 33, 32, 32, 31, 28, 26, + 25, 24, 22, 22, 23, 23, 22, 22, 22, 21, 31, 30, 29, 28, 26, 24, 23, 23, + 22, 22, 22, 23, 22, 22, 22, 21, 28, 27, 26, 26, 24, 22, 22, 22, 21, 22, + 22, 23, 22, 22, 22, 21, 27, 26, 25, 25, 23, 22, 22, 21, 21, 21, 21, 22, + 22, 22, 21, 21, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, 21, 21, 20, 20, + 20, 20, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 21, 22, + 22, 23, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, + 23, 23, 22, 21, 19, 19, 19, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, + 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, + 19, 19, 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 21, 20, 19, 18, + 18, 17, 16, 16, 16, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 18, 18, 17, + 16, 16, 15, 14, + /* Size 32x32 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 27, 25, 25, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 33, 33, 33, 30, 30, 29, 27, 27, 26, 24, 24, 23, 21, 21, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 33, 33, 33, 33, 33, 33, 32, 30, + 30, 29, 27, 27, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, 33, 33, 32, 30, 30, 
28, 27, 27, + 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, + 20, 20, 34, 33, 33, 33, 33, 33, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, + 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 34, 33, + 33, 33, 33, 32, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, + 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 33, 33, 32, 32, 32, 32, + 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 29, 27, 27, 26, + 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, + 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 28, 27, 26, 26, 24, 24, 23, 23, + 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, + 30, 29, 29, 28, 28, 28, 28, 26, 26, 25, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, 27, 27, + 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 27, 26, 26, 26, 24, + 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 21, 21, 27, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, + 20, 20, 20, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 25, 24, + 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, 20, 20, 20, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 
20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, + 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 17, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, + 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 20, 21, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, 20, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, 21, 21, + 21, 21, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 15, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, + 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 14, 14, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, + /* Size 4x8 */ + 33, 33, 28, 24, 22, 22, 21, 20, 27, 26, 23, 22, 21, 22, 22, 21, 22, 22, + 22, 20, 19, 19, 19, 18, 20, 21, 22, 20, 19, 17, 16, 15, + /* Size 8x4 */ + 33, 27, 22, 20, 33, 26, 22, 21, 28, 23, 22, 22, 24, 
22, 20, 20, 22, 21, + 19, 19, 22, 22, 19, 17, 21, 22, 19, 16, 20, 21, 18, 15, + /* Size 8x16 */ + 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 32, 31, 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 29, 28, 27, 27, + 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 27, 26, 26, 25, 23, 22, + 22, 21, 21, 21, 21, 22, 21, 21, 21, 20, 21, 22, 22, 22, 22, 22, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 23, 23, 22, 23, 22, 21, 19, 19, + 18, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, + 16, 15, + /* Size 16x8 */ + 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, + 27, 26, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, 22, 21, 31, 28, 25, 23, + 22, 22, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, + 22, 21, 24, 24, 22, 21, 20, 21, 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, + 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 23, + 23, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, + 19, 17, 17, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 20, 21, 20, 19, 17, + 16, 15, + /* Size 16x32 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 33, 33, 32, 29, 29, 28, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 33, 33, 33, 33, 32, 32, 31, 29, + 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, + 22, 21, 21, 21, 20, 20, 33, 32, 32, 32, 32, 31, 31, 28, 28, 28, 25, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, + 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, 25, 24, 23, 23, 22, 22, 22, 22, + 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, + 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 
25, + 25, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, + 21, 21, 21, 21, 21, 21, 20, 20, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 21, 22, 22, + 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 21, 22, 22, 23, 23, 23, 23, + 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, + 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, + 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, + 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 19, 20, 20, 20, 20, + 20, 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 14, 14, + /* Size 32x16 */ + 32, 33, 33, 33, 29, 28, 27, 22, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 33, 32, 28, 27, 26, 22, 22, 22, 21, 21, 21, 20, 20, 19, 33, 33, 33, 32, + 28, 27, 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 33, 33, 33, 32, 28, 27, + 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 34, 33, 32, 32, 27, 26, 26, 23, + 22, 22, 23, 23, 22, 21, 21, 20, 34, 33, 32, 31, 27, 26, 25, 23, 22, 22, + 23, 23, 22, 21, 21, 20, 33, 32, 31, 31, 27, 26, 25, 23, 22, 22, 23, 23, + 22, 21, 21, 20, 31, 29, 29, 28, 25, 24, 24, 22, 22, 22, 23, 23, 22, 22, + 22, 21, 31, 29, 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 30, 28, 28, 28, 24, 23, 23, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, + 26, 25, 23, 22, 
22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, 26, 25, + 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 26, 26, 25, 24, 22, 22, + 22, 21, 21, 21, 22, 22, 22, 21, 21, 20, 24, 24, 24, 24, 22, 22, 21, 20, + 20, 20, 21, 21, 20, 20, 20, 20, 24, 24, 24, 24, 22, 22, 21, 20, 20, 20, + 21, 21, 20, 20, 20, 20, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, 20, 20, + 20, 20, 20, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 21, 22, + 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 22, 22, + 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 23, 23, 22, 22, + 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 21, 22, 23, 23, 23, 22, 22, 20, + 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 23, 23, 22, 22, 22, 20, 19, 19, + 18, 18, 17, 17, 17, 16, 20, 22, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, + 17, 16, 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, + 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, 16, 16, + 20, 21, 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 20, 21, + 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 19, 20, 21, 21, + 21, 21, 21, 19, 19, 18, 17, 17, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, + 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 19, 20, 20, 20, 21, 21, 20, 19, + 19, 18, 17, 17, 16, 15, 15, 14, + /* Size 4x16 */ + 33, 33, 33, 32, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 28, 27, + 26, 26, 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, + 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 20, 20, 21, 21, 22, 22, + 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, + /* Size 16x4 */ + 33, 28, 21, 20, 33, 27, 22, 20, 33, 26, 22, 21, 32, 26, 22, 21, 29, 24, + 22, 22, 26, 22, 22, 22, 26, 22, 21, 21, 24, 22, 20, 20, 22, 21, 19, 19, + 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, 19, 17, 22, 22, 19, 16, 21, 22, + 19, 16, 21, 22, 18, 16, 20, 21, 18, 15, + /* Size 
8x32 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 32, 32, 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, + 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, + 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, + 22, 22, 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 16, 15, 15, 15, + /* Size 32x8 */ + 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 21, 21, 20, 33, 33, + 28, 26, 22, 22, 21, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, 27, 26, + 22, 23, 22, 21, 34, 32, 27, 25, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, + 22, 21, 31, 29, 25, 24, 22, 23, 22, 22, 31, 28, 25, 23, 22, 22, 22, 22, + 30, 28, 24, 23, 22, 23, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 28, 26, + 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, 22, 21, 24, 24, 22, 21, + 20, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 20, 23, 23, 22, 21, 20, 20, + 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, 21, 22, 21, 21, 19, 19, 19, 19, + 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 22, + 22, 21, 19, 18, 18, 18, 21, 23, 22, 22, 19, 18, 18, 17, 21, 23, 23, 22, + 19, 18, 17, 17, 21, 23, 22, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, + 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, + 20, 21, 22, 21, 19, 17, 16, 
16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 21, + 21, 21, 19, 17, 16, 15, 19, 20, 21, 20, 19, 17, 16, 15, 19, 20, 21, 20, + 19, 17, 16, 15 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 29, 24, 32, 30, 28, 24, 29, 28, 21, 19, 24, 24, 19, 16, + /* Size 8x8 */ + 33, 33, 32, 32, 30, 28, 24, 22, 33, 32, 32, 32, 30, 28, 25, 23, 32, 32, + 31, 30, 29, 27, 24, 23, 32, 32, 30, 29, 28, 26, 24, 22, 30, 30, 29, 28, + 25, 23, 21, 20, 28, 28, 27, 26, 23, 20, 18, 17, 24, 25, 24, 24, 21, 18, + 16, 15, 22, 23, 23, 22, 20, 17, 15, 14, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32, + 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 29, 27, 26, 24, 23, 23, 33, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 27, 25, 23, 23, 33, 32, 32, 32, 31, 31, 31, 30, + 29, 28, 28, 26, 26, 24, 23, 23, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28, + 28, 26, 26, 24, 23, 23, 32, 32, 32, 32, 31, 30, 29, 28, 28, 27, 27, 26, + 25, 24, 23, 22, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, 26, 24, 24, 23, + 22, 22, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, 20, 20, + 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 28, 29, + 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 26, 27, 27, 28, + 26, 26, 26, 24, 23, 21, 20, 19, 19, 18, 17, 17, 25, 26, 26, 27, 26, 26, + 25, 24, 22, 20, 20, 19, 18, 17, 17, 16, 23, 24, 24, 25, 24, 24, 24, 23, + 22, 20, 19, 18, 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, + 18, 17, 17, 16, 15, 15, 21, 22, 23, 23, 23, 23, 22, 22, 20, 19, 18, 17, + 16, 15, 15, 14, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, + 26, 26, 26, 24, 24, 23, 22, 22, 22, 20, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, + 24, 24, 
23, 22, 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 22, + 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 30, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 23, 23, 21, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, + 30, 28, 28, 28, 27, 25, 25, 25, 23, 23, 23, 22, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, + 27, 25, 25, 25, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 24, + 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, + 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 26, 26, 26, 25, 24, + 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, + 22, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, + 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, + 26, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 20, 30, 30, 30, 30, 30, 31, + 31, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, + 22, 22, 22, 21, 20, 20, 20, 19, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, + 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, + 20, 20, 20, 19, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 27, 26, 26, 25, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 29, 29, 
29, 29, 30, 30, 30, 30, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, + 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 18, 28, 29, 29, 29, + 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, 29, 30, 30, 29, + 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 19, 18, 18, 18, 18, 27, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 26, + 26, 26, 25, 23, 23, 23, 21, 20, 20, 20, 20, 20, 19, 18, 18, 18, 18, 17, + 17, 17, 26, 26, 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, + 23, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 26, 26, + 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, 23, 22, 21, 20, + 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 25, 26, 26, 26, 26, 27, + 27, 26, 26, 26, 26, 25, 25, 25, 24, 22, 22, 22, 20, 20, 20, 19, 19, 19, + 18, 17, 17, 17, 17, 16, 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, + 24, 24, 24, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, + 16, 15, 15, 15, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 24, + 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, + 23, 23, 24, 24, 24, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, + 19, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 15, 22, 22, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, + 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 21, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, + 15, 15, 15, 14, 14, 14, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 15, 14, + 14, 14, 20, 20, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 19, + 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13, + /* Size 4x8 */ + 33, 32, 32, 32, 30, 28, 24, 22, 32, 31, 30, 29, 28, 26, 24, 22, 29, 30, + 28, 27, 24, 21, 19, 18, 24, 25, 24, 24, 21, 18, 16, 15, + /* Size 8x4 */ + 
33, 32, 29, 24, 32, 31, 30, 25, 32, 30, 28, 24, 32, 29, 27, 24, 30, 28, + 24, 21, 28, 26, 21, 18, 24, 24, 19, 16, 22, 22, 18, 15, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 33, 32, 32, 32, + 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 32, 32, 32, 31, 30, 30, + 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 29, 29, 30, 30, 29, 28, 28, 26, + 25, 23, 22, 21, 21, 20, 19, 19, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22, + 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, + 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, + 15, 15, + /* Size 16x8 */ + 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, + 32, 32, 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 31, 30, + 29, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, + 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, 21, 20, + 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 26, 28, + 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, 17, 17, 23, 25, 24, 24, + 20, 19, 16, 16, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, 23, 22, 19, 18, + 15, 15, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, + 27, 27, 26, 24, 24, 24, 23, 22, 22, 21, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, + 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 24, 23, 23, + 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 
25, 24, 24, 24, 23, 22, 22, 21, 32, 32, 32, 32, 32, 31, + 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 22, 21, 21, 20, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, + 26, 25, 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, + 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, + 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, + 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, 21, 21, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 26, 26, 26, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 20, 20, 20, 19, 19, 19, 18, 17, + 17, 17, 17, 17, 17, 16, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, + 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, + 15, 15, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, + 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 19, 20, 20, 20, 20, 21, + 21, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 19, 17, 17, 17, 16, 16, 16, + 15, 14, 14, 14, 14, 13, 13, 13, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 23, 23, 22, 19, 33, 33, + 32, 32, 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 22, 20, 33, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, + 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 29, 29, 26, 25, 25, 23, 20, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 30, 27, 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 27, + 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 29, 29, 27, 25, 25, + 23, 21, 33, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 26, 24, 24, 23, 21, + 32, 32, 32, 32, 31, 30, 30, 30, 28, 
28, 28, 26, 24, 24, 23, 20, 32, 32, + 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, 32, 32, + 31, 29, 29, 29, 28, 28, 28, 26, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, + 29, 28, 28, 27, 27, 25, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, 29, 28, + 28, 27, 27, 25, 24, 24, 23, 21, 32, 31, 31, 31, 30, 28, 28, 28, 26, 26, + 26, 24, 23, 23, 22, 20, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, + 21, 21, 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, 21, 21, + 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 24, 24, 24, 22, 21, 21, 20, 19, + 29, 29, 30, 30, 28, 27, 27, 26, 23, 22, 22, 20, 20, 20, 19, 17, 28, 29, + 30, 30, 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 28, 29, 30, 30, + 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 27, 28, 28, 28, 28, 26, + 26, 25, 22, 21, 21, 19, 18, 18, 18, 16, 26, 27, 28, 28, 26, 26, 26, 24, + 21, 20, 20, 19, 18, 18, 17, 16, 26, 27, 28, 28, 26, 26, 26, 24, 21, 20, + 20, 19, 18, 18, 17, 16, 25, 26, 26, 26, 26, 25, 25, 24, 21, 20, 20, 18, + 17, 17, 17, 15, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, + 16, 14, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 16, 14, + 23, 24, 24, 24, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 15, 14, 22, 23, + 23, 23, 23, 23, 23, 22, 19, 18, 18, 17, 16, 16, 15, 14, 21, 22, 23, 23, + 23, 22, 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 21, 22, 23, 23, 23, 22, + 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 20, 21, 22, 22, 21, 21, 21, 20, + 18, 18, 18, 16, 15, 15, 14, 13, + /* Size 4x16 */ + 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 32, 32, + 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 28, 29, 29, 30, + 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, + 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, + /* Size 16x4 */ + 33, 32, 28, 23, 32, 32, 29, 24, 32, 32, 29, 25, 32, 31, 30, 25, 32, 30, + 28, 24, 32, 30, 28, 24, 32, 29, 27, 24, 31, 28, 26, 23, 30, 28, 24, 21, + 29, 27, 22, 20, 29, 27, 21, 19, 27, 26, 20, 18, 26, 25, 20, 17, 24, 24, + 
19, 16, 23, 23, 18, 16, 22, 22, 18, 15, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 26, 25, 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, + 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, + 22, 21, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 26, 25, + 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 28, 29, + 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, + 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 23, 24, 24, 24, 25, 25, + 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, + 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, + 15, 15, 15, 14, + /* Size 32x8 */ + 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 22, 33, 32, + 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, + 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 32, 31, 30, 30, + 25, 23, 33, 32, 32, 31, 30, 29, 25, 23, 33, 32, 31, 30, 29, 28, 24, 23, + 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, + 31, 29, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, 24, 23, 32, 31, 30, 29, + 28, 27, 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, + 21, 20, 30, 30, 29, 28, 25, 24, 21, 20, 30, 30, 29, 28, 24, 24, 21, 20, + 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 28, 30, + 28, 27, 22, 21, 19, 18, 27, 28, 28, 26, 22, 21, 18, 18, 26, 28, 26, 26, + 21, 20, 18, 17, 26, 28, 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, + 17, 17, 23, 25, 24, 24, 20, 19, 16, 16, 23, 25, 24, 
24, 20, 19, 16, 16, + 23, 24, 24, 24, 20, 19, 16, 15, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, + 23, 22, 19, 18, 15, 15, 21, 23, 23, 22, 19, 18, 15, 15, 20, 22, 21, 21, + 18, 18, 15, 14 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 28, 22, 22, 28, 23, 22, 23, 22, 22, 19, 19, 22, 23, 19, 17, + /* Size 8x8 */ + 33, 33, 30, 28, 24, 21, 22, 21, 33, 32, 29, 26, 24, 22, 23, 22, 30, 29, + 26, 24, 23, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 24, 24, 23, 22, + 21, 20, 20, 20, 21, 22, 22, 22, 20, 19, 19, 19, 22, 23, 23, 23, 20, 19, + 18, 17, 21, 22, 22, 22, 20, 19, 17, 17, + /* Size 16x16 */ + 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 33, 33, + 33, 33, 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 33, 33, 33, 33, + 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 32, 30, 29, + 26, 25, 24, 23, 22, 23, 23, 23, 22, 22, 31, 30, 30, 30, 28, 27, 24, 24, + 23, 22, 22, 22, 22, 23, 22, 22, 31, 30, 29, 29, 27, 26, 24, 23, 23, 22, + 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 22, + 22, 23, 22, 22, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, 21, 21, 22, 22, + 22, 22, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, 20, 20, + 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 21, 22, + 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, + 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 23, 22, 22, + 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 21, 22, 22, 23, 23, 23, 23, 22, + 21, 20, 19, 19, 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, + 19, 18, 18, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, + 18, 17, 17, 17, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 25, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 24, 24, 22, 21, 21, 21, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 30, 30, 28, 27, 27, 26, 24, 24, 24, 
22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 28, + 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, + 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 34, 33, + 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 34, 33, 33, 33, 33, 32, + 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 23, 23, + 23, 23, 23, 23, 22, 22, 22, 22, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, + 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, + 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, + 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 31, 30, 30, 30, + 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 29, 28, 28, 28, 28, 27, 27, 27, + 25, 25, 25, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, + 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 27, 26, + 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, + 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 
20, 20, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 21, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, + 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 16, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + /* Size 4x8 */ + 33, 33, 29, 26, 24, 22, 22, 21, 27, 26, 24, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 20, 19, 19, 19, 21, 23, 
22, 23, 20, 19, 18, 17, + /* Size 8x4 */ + 33, 27, 22, 21, 33, 26, 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 24, 22, + 20, 20, 22, 22, 19, 19, 22, 22, 19, 18, 21, 22, 19, 17, + /* Size 8x16 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 33, 33, + 32, 32, 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 31, 30, 30, 29, + 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, + 17, 17, + /* Size 16x8 */ + 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, + 30, 26, 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, + 22, 22, 23, 22, 31, 28, 27, 24, 22, 22, 22, 22, 28, 26, 24, 22, 22, 22, + 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, 21, 20, + 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22, + 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, 18, 18, 21, 23, 23, 22, + 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, + 17, 17, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 32, 32, 32, 31, + 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, + 23, 23, 22, 22, 22, 22, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, + 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, + 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 25, 24, 24, 24, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 28, 27, + 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 
22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, + 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, + 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, + 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, + 17, 17, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 21, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 16, 16, + /* Size 32x16 */ + 32, 33, 33, 33, 31, 28, 28, 27, 23, 21, 21, 21, 21, 21, 20, 20, 33, 33, + 33, 33, 31, 27, 27, 26, 23, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, + 30, 27, 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 33, 33, 30, 27, + 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 32, 32, 30, 26, 26, 26, + 23, 22, 22, 22, 22, 22, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, + 22, 23, 23, 23, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, 22, 23, + 23, 23, 22, 21, 33, 32, 31, 31, 29, 26, 26, 25, 23, 22, 22, 23, 23, 23, + 22, 21, 31, 30, 29, 29, 28, 24, 24, 24, 22, 22, 22, 22, 23, 23, 22, 
22, + 31, 29, 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 31, 29, + 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, + 25, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, + 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 21, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, + 21, 21, 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, + 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, + 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, + 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 21, + 20, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 21, 20, 19, + 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, 22, 22, 22, 22, 20, 19, 19, 19, + 18, 18, 18, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, + 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 20, 21, + 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, + 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, + 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, 22, 21, + 20, 19, 19, 18, 17, 17, 17, 16, + /* Size 4x16 */ + 33, 33, 33, 33, 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 28, 27, + 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 21, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, + 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, + /* Size 16x4 */ + 33, 28, 21, 21, 33, 27, 22, 22, 33, 26, 22, 22, 33, 26, 22, 23, 30, 24, + 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 26, 22, 21, 22, 24, 22, 20, 21, + 22, 21, 20, 19, 22, 21, 19, 19, 
22, 22, 19, 18, 22, 22, 19, 18, 22, 22, + 19, 18, 21, 22, 19, 17, 21, 22, 19, 17, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, + 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 27, 27, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, + 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, + 17, 17, 17, 17, + /* Size 32x8 */ + 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 31, 27, 23, 22, 21, 21, 33, 33, + 30, 27, 23, 22, 22, 21, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, 30, 26, + 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 34, 32, 29, 26, 23, 22, + 23, 22, 33, 31, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, 22, 22, 23, 22, + 31, 28, 27, 24, 22, 22, 22, 22, 31, 28, 27, 24, 22, 22, 22, 22, 29, 27, + 25, 23, 22, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 28, 26, 24, 22, + 22, 22, 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, + 21, 20, 24, 24, 23, 22, 21, 20, 21, 20, 24, 24, 23, 22, 20, 20, 20, 20, + 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22, + 22, 21, 20, 19, 19, 19, 21, 22, 22, 22, 20, 19, 19, 19, 21, 22, 22, 22, + 20, 19, 18, 18, 21, 22, 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, + 18, 18, 21, 
23, 23, 22, 20, 19, 18, 17, 21, 23, 23, 22, 20, 19, 18, 17, + 21, 23, 23, 22, 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, + 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, + 20, 19, 17, 17 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 30, 27, 32, 31, 29, 26, 30, 29, 26, 23, 27, 26, 23, 19, + /* Size 8x8 */ + 33, 33, 32, 32, 31, 30, 28, 25, 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, + 32, 31, 30, 29, 28, 26, 32, 32, 31, 30, 29, 28, 27, 25, 31, 31, 30, 29, + 28, 26, 25, 23, 30, 30, 29, 28, 26, 24, 22, 21, 28, 28, 28, 27, 25, 22, + 20, 19, 25, 26, 26, 25, 23, 21, 19, 18, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 28, 28, 26, 26, 24, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, + 28, 26, 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, + 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, + 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 30, 30, + 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 28, 29, 29, 30, + 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, + 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 26, 27, 27, 28, 28, 26, 26, 26, + 26, 23, 23, 20, 20, 19, 19, 18, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, + 23, 20, 20, 19, 19, 18, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, 22, 19, + 19, 18, 18, 16, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 33, 33, 32, 32, 32, 32, 32, 
32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, + 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, + 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, + 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, + 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, + 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 26, 25, 24, 24, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 
29, + 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, + 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, + 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, 30, 30, 31, 31, + 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, + 23, 23, 23, 22, 22, 22, 29, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, + 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, 23, 22, 22, 22, 22, 21, + 20, 20, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, + 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, + 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, + 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, + 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 27, 27, 27, 27, 26, 26, 26, 25, 23, 23, 23, 22, 21, 21, 21, 20, 20, 20, + 20, 19, 18, 18, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, + 26, 26, 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, + 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 26, 26, 27, 27, + 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, 23, 23, 23, 22, + 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 25, 25, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 25, 25, 25, 25, 23, 22, 22, 22, 21, 20, 20, 20, 19, + 18, 18, 18, 18, 17, 17, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, + 24, 24, 24, 24, 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, + 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, + 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, + /* Size 4x8 */ + 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, 31, 31, 30, 28, 27, 
26, 30, 30, + 30, 28, 27, 25, 23, 22, 26, 27, 27, 26, 24, 22, 20, 18, + /* Size 8x4 */ + 33, 32, 30, 26, 32, 32, 30, 27, 32, 31, 30, 27, 32, 31, 28, 26, 31, 30, + 27, 24, 30, 28, 25, 22, 28, 27, 23, 20, 26, 26, 22, 18, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 32, 32, 32, 31, 31, 30, + 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 28, 28, 27, 27, 26, 26, 24, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, + 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, + 18, 16, + /* Size 16x8 */ + 32, 33, 33, 32, 32, 28, 28, 23, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, + 32, 32, 32, 29, 29, 24, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, + 31, 30, 30, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, + 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, + 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 28, 30, + 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 26, 28, 28, 26, + 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 23, 25, 25, 24, 24, 19, + 19, 16, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, + 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 32, 32, + 
32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, + 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 21, 21, 28, 29, 29, 29, + 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, + 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, + 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, + 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, + 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, + 19, 19, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, + 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 23, 24, + 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, + 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, 23, 24, 24, 24, 24, 25, + 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, + 19, 18, 18, 18, 18, 17, 16, 16, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 26, 23, 23, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 30, 29, 29, 29, 26, 24, 24, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, + 29, 29, 29, 27, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 28, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, + 25, 25, 33, 32, 32, 
32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, 25, 25, + 33, 32, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 27, 25, 25, 32, 32, + 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, + 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, 30, 30, + 30, 28, 28, 28, 28, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, + 27, 27, 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, + 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, 27, 26, + 24, 24, 31, 31, 31, 31, 31, 30, 28, 28, 28, 27, 26, 26, 26, 24, 23, 23, + 30, 30, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, + 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, 30, 30, + 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 29, 30, 30, 30, 30, 28, + 28, 28, 28, 25, 23, 23, 23, 22, 20, 20, 28, 29, 30, 30, 30, 28, 27, 27, + 27, 24, 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, + 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, 21, 21, + 21, 20, 19, 19, 28, 28, 28, 28, 28, 27, 26, 26, 26, 23, 21, 21, 21, 20, + 18, 18, 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, + 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 26, 27, + 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 25, 26, 26, 26, + 26, 26, 24, 24, 24, 22, 20, 20, 20, 18, 17, 17, 23, 24, 25, 25, 25, 24, + 24, 24, 24, 21, 19, 19, 19, 18, 16, 16, 23, 24, 25, 25, 25, 24, 24, 24, + 24, 21, 19, 19, 19, 18, 16, 16, + /* Size 4x16 */ + 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 30, 30, 30, 31, + 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 21, 26, 27, 27, 28, 28, 26, + 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, + /* Size 16x4 */ + 33, 32, 30, 26, 32, 32, 30, 27, 32, 32, 30, 27, 32, 32, 31, 28, 32, 32, + 31, 28, 32, 31, 29, 26, 32, 31, 29, 26, 32, 30, 28, 26, 
32, 30, 28, 26, + 30, 29, 26, 23, 30, 29, 26, 23, 29, 28, 24, 20, 29, 28, 24, 20, 27, 26, + 23, 19, 27, 26, 23, 19, 24, 24, 21, 18, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 28, 26, 25, 25, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, + 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 28, 29, + 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, + 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, + 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, + 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 16, 16, + /* Size 32x8 */ + 32, 33, 33, 32, 32, 28, 28, 23, 33, 33, 33, 32, 32, 29, 29, 24, 33, 32, + 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, + 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 25, 33, 32, 32, 31, 31, 30, + 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, + 33, 32, 32, 31, 31, 29, 29, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, + 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, + 30, 28, 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, + 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, 31, 31, 31, 28, 28, 26, 26, 23, + 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, + 30, 28, 28, 24, 24, 21, 29, 30, 30, 28, 28, 23, 23, 20, 28, 30, 30, 27, + 27, 21, 21, 19, 28, 30, 30, 27, 27, 
21, 21, 19, 28, 30, 30, 27, 27, 21, + 21, 19, 28, 28, 28, 26, 26, 21, 21, 18, 26, 28, 28, 26, 26, 20, 20, 18, + 26, 28, 28, 26, 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 25, 26, + 26, 24, 24, 20, 20, 17, 23, 25, 25, 24, 24, 19, 19, 16, 23, 25, 25, 24, + 24, 19, 19, 16 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 30, 24, 22, 30, 26, 23, 22, 24, 23, 21, 21, 22, 22, 21, 19, + /* Size 8x8 */ + 33, 33, 32, 29, 26, 23, 21, 21, 33, 33, 31, 28, 25, 23, 22, 22, 32, 31, + 29, 26, 24, 23, 22, 23, 29, 28, 26, 24, 23, 22, 22, 22, 26, 25, 24, 23, + 22, 21, 21, 22, 23, 23, 23, 22, 21, 20, 20, 20, 21, 22, 22, 22, 21, 20, + 19, 19, 21, 22, 23, 22, 22, 20, 19, 18, + /* Size 16x16 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 33, 33, + 33, 33, 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 33, 33, 33, 33, + 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 34, 33, 33, 32, 32, 29, + 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, 34, 33, 33, 32, 32, 29, 29, 26, + 26, 24, 24, 22, 22, 23, 23, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, + 23, 22, 22, 22, 22, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, + 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, + 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 23, + 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 25, 24, + 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, + 22, 21, 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, + 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, 21, 19, + 19, 19, 19, 18, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 25, 25, 25, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, + 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, + 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, + 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, + 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, + 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 29, 29, 29, 28, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 29, 28, 28, 28, 26, 25, 25, 25, 24, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, + 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 29, 29, 28, 28, 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, + 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, + 22, 22, 23, 23, 26, 26, 
26, 26, 26, 25, 25, 25, 25, 24, 23, 23, 23, 23, + 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, + 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, + 21, 21, 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, + 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, + 19, 19, 19, 18, 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + /* Size 4x8 */ + 33, 33, 31, 28, 
26, 23, 22, 22, 30, 29, 28, 25, 23, 22, 22, 22, 24, 24, + 23, 22, 21, 21, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, + /* Size 8x4 */ + 33, 30, 24, 21, 33, 29, 24, 22, 31, 28, 23, 22, 28, 25, 22, 22, 26, 23, + 21, 21, 23, 22, 21, 20, 22, 22, 20, 19, 22, 22, 21, 19, + /* Size 8x16 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 33, 33, + 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, + 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 28, 27, 27, 26, 26, 24, 24, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, + 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, + 18, 18, + /* Size 16x8 */ + 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, + 33, 27, 27, 22, 22, 22, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, + 26, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, + 22, 22, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, + 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 21, 22, + 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 22, + 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, + 19, 18, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, + 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, + 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, + 26, 25, 24, 24, 24, 23, 22, 22, 
22, 22, 22, 22, 22, 23, 23, 23, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 21, 21, + 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22, + 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, + 19, 19, 18, 18, 18, 18, 18, 18, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 31, 28, 28, 28, 24, 21, 21, 21, 21, 21, 21, 33, 33, + 33, 33, 33, 30, 28, 28, 28, 24, 22, 22, 22, 21, 21, 21, 33, 33, 33, 33, + 33, 30, 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, + 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, 27, 27, + 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 32, 32, 32, 29, 26, 26, 26, 24, + 22, 22, 22, 22, 22, 22, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, + 22, 23, 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 
24, 22, 22, 22, 23, + 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, 23, 23, + 32, 31, 30, 30, 30, 28, 25, 25, 25, 23, 22, 22, 22, 22, 23, 23, 31, 30, + 28, 28, 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, + 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, 28, 26, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, 27, 25, 23, 23, + 23, 22, 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, + 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 26, 26, 25, 25, 25, 23, 22, 22, 22, 21, 21, 21, 21, 21, 22, 22, + 24, 24, 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, + 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, 24, 24, + 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 21, 21, + 21, 20, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, + 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, + 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, + 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, + 23, 22, 22, 22, 22, 21, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, 23, 23, + 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 21, 22, 23, 23, 23, 23, 22, 22, + 22, 21, 19, 19, 19, 18, 18, 18, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 21, 21, 22, 22, 22, 31, 30, + 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 22, 22, 23, 23, 22, + 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, + /* Size 16x4 */ + 33, 31, 24, 21, 33, 30, 24, 22, 33, 30, 24, 22, 33, 29, 24, 23, 33, 29, + 24, 23, 30, 26, 
23, 22, 30, 26, 23, 22, 27, 24, 22, 22, 27, 24, 22, 22, + 24, 23, 21, 20, 24, 23, 21, 20, 21, 22, 20, 19, 21, 22, 20, 19, 22, 22, + 20, 19, 22, 22, 20, 19, 22, 23, 21, 18, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, + 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 18, + /* Size 32x8 */ + 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 28, 28, 22, 22, 21, 33, 33, + 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, + 27, 22, 22, 22, 33, 32, 32, 26, 26, 22, 22, 22, 34, 32, 32, 26, 26, 22, + 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, + 32, 30, 30, 25, 25, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, + 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, 22, 22, 29, 27, 27, 23, + 23, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, + 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 26, 25, 25, 22, 22, 21, 21, 22, + 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, + 24, 22, 22, 20, 20, 21, 23, 23, 23, 22, 22, 20, 20, 20, 21, 22, 22, 
21, + 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, + 19, 19, 21, 22, 22, 22, 22, 19, 19, 19, 21, 22, 22, 22, 22, 19, 19, 18, + 21, 22, 22, 22, 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, + 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, + 22, 19, 19, 18 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 32, 29, 32, 32, 31, 29, 32, 31, 29, 27, 29, 29, 27, 22, + /* Size 8x8 */ + 33, 33, 33, 32, 32, 32, 30, 29, 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, + 32, 32, 32, 31, 31, 30, 32, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 30, + 29, 29, 28, 27, 32, 31, 31, 30, 29, 28, 27, 26, 30, 30, 31, 29, 28, 27, + 26, 24, 29, 29, 30, 28, 27, 26, 24, 21, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, + 29, 29, 28, 28, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, 28, 28, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 31, 31, 31, 31, + 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 30, 30, 30, 30, 31, 31, + 29, 29, 28, 28, 28, 26, 26, 25, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, + 28, 28, 28, 26, 25, 24, 23, 23, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, + 27, 24, 24, 23, 21, 21, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, + 24, 23, 21, 21, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, + 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 
29, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 28, 27, 27, 27, 27, 26, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 26, + 26, 25, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, + 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 30, 30, + 30, 30, 30, 30, 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, + 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, + 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, + 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 23, 23, 23, 23, 23, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28, + 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 21, 28, 29, 29, 29, + 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, + 24, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, + 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, + 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, 23, 22, 21, 21, + 21, 21, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 21, 
21, 21, 21, 20, + /* Size 4x8 */ + 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, 32, 32, + 31, 30, 29, 28, 28, 27, 29, 29, 30, 28, 27, 26, 24, 21, + /* Size 8x4 */ + 33, 33, 32, 29, 32, 32, 32, 29, 32, 32, 31, 30, 32, 32, 30, 28, 32, 31, + 29, 27, 31, 31, 28, 26, 30, 30, 28, 24, 29, 30, 27, 21, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 30, 30, + 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, + 29, 28, 28, 28, 27, 27, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 22, 22, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 25, 24, 23, + 21, 21, + /* Size 16x8 */ + 32, 33, 33, 33, 32, 32, 29, 28, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, + 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, + 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 31, 30, 30, + 29, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, + 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 31, 31, + 31, 29, 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, + 28, 28, 24, 23, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, + 22, 21, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, + 30, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, + 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 24, 23, 22, 22, + 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, + 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, + 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, + 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, + 24, 24, 23, 22, 21, 21, 21, 21, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 
30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, + 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 30, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 30, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, + 28, 28, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 28, 27, 27, 27, + 32, 32, 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, + 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, 32, 31, + 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 31, 31, 31, 31, 31, + 30, 29, 28, 28, 28, 28, 26, 26, 26, 26, 31, 31, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 27, 26, 25, 25, 25, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, + 28, 26, 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, + 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 24, 23, 23, 23, + 29, 29, 30, 30, 30, 30, 28, 28, 27, 27, 27, 25, 23, 22, 22, 22, 28, 29, + 29, 30, 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, + 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, 30, 30, + 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 28, 28, 28, 28, 28, 28, 27, + 26, 26, 26, 24, 22, 21, 21, 21, + /* Size 4x16 */ + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 28, 29, 29, 29, 30, 30, + 28, 28, 28, 27, 27, 25, 24, 23, 21, 21, + /* Size 16x4 */ + 33, 33, 32, 28, 33, 32, 32, 29, 32, 
32, 32, 29, 32, 32, 32, 29, 32, 32, + 31, 30, 32, 32, 31, 30, 32, 32, 30, 28, 32, 32, 30, 28, 32, 32, 30, 28, + 32, 31, 29, 27, 32, 31, 29, 27, 31, 31, 28, 25, 30, 30, 28, 24, 30, 30, + 28, 23, 29, 30, 27, 21, 29, 30, 27, 21, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 29, 29, 29, 29, 29, 29, + 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, + 25, 25, 24, 23, 22, 22, 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, + 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, + 21, 21, 21, 21, + /* Size 32x8 */ + 32, 33, 33, 33, 32, 32, 29, 28, 33, 33, 33, 32, 32, 32, 29, 29, 33, 32, + 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, + 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, + 30, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, + 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, + 32, 31, 31, 31, 29, 29, 33, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 31, + 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, + 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 31, 31, 29, 29, 28, 27, + 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, + 31, 30, 29, 29, 
28, 27, 32, 31, 31, 30, 28, 28, 26, 26, 31, 31, 31, 29, + 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, + 25, 24, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, 24, 23, + 29, 30, 30, 28, 27, 27, 23, 22, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, + 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, 22, 21, 28, 28, 28, 28, + 26, 26, 22, 21 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 32, 27, 22, 32, 30, 25, 22, 27, 25, 22, 22, 22, 22, 22, 20, + /* Size 8x8 */ + 33, 33, 34, 30, 28, 26, 24, 21, 33, 33, 33, 30, 28, 26, 24, 22, 34, 33, + 32, 29, 26, 25, 24, 22, 30, 30, 29, 26, 24, 23, 23, 22, 28, 28, 26, 24, + 22, 22, 22, 22, 26, 26, 25, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 21, + 21, 20, 21, 22, 22, 22, 22, 21, 20, 19, + /* Size 16x16 */ + 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 33, 33, + 33, 33, 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33, 33, 33, + 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 22, 22, 33, 33, 33, 33, 33, 33, + 30, 29, 28, 26, 26, 25, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, + 28, 26, 26, 24, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, + 26, 24, 24, 23, 22, 22, 31, 31, 30, 30, 30, 30, 28, 27, 26, 24, 24, 23, + 23, 23, 22, 22, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, 24, 23, 23, 22, + 22, 22, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, 22, 22, + 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, + 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 26, 25, 25, 25, + 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 25, 24, 24, 24, 24, 24, + 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 19, 19, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 30, 28, + 28, 28, 28, 27, 26, 25, 25, 25, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 
33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 31, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, + 23, 22, 21, 21, 21, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, + 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, + 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, + 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, + 29, 29, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, + 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 34, 34, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, 26, 26, 26, 25, + 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 33, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 30, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 27, 27, 27, 26, 25, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, + 26, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, + 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 
27, 26, 26, 26, 26, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 22, 27, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, + 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 26, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 21, 25, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, + 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, + 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19, + 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 
22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, + /* Size 4x8 */ + 33, 33, 33, 30, 28, 26, 24, 21, 33, 33, 32, 28, 26, 25, 24, 22, 28, 27, + 26, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 21, 20, 19, + /* Size 8x4 */ + 33, 33, 28, 21, 33, 33, 27, 22, 33, 32, 26, 22, 30, 28, 24, 22, 28, 26, + 22, 22, 26, 25, 22, 21, 24, 24, 22, 20, 21, 22, 21, 19, + /* Size 8x16 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 33, 33, + 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 33, 33, 33, 32, + 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 31, 30, 30, 30, 29, 29, + 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 28, 27, 27, 26, 26, 26, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, + 22, 22, 22, 22, 21, 21, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 19, 19, + /* Size 16x8 */ + 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, + 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, + 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 31, 30, 29, 28, 24, 24, + 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, + 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 25, 24, + 24, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, + 22, 22, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, + 20, 19, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, + 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, + 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, + 22, 22, 
33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, + 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, + 30, 30, 29, 29, 29, 28, 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, + 23, 23, 23, 22, 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, + 27, 26, 26, 25, 25, 25, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, + 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 26, 25, 25, 25, 25, 25, 25, 24, + 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 21, 21, 20, 20, 20, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, + 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 20, 20, 19, 19, 19, 19, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 26, 23, 21, 21, 21, 33, 33, + 33, 33, 33, 33, 31, 28, 28, 28, 28, 25, 23, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, + 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, + 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, 27, 27, + 27, 25, 23, 22, 22, 22, 33, 
33, 33, 32, 32, 32, 30, 28, 26, 26, 26, 25, + 23, 22, 22, 22, 34, 33, 33, 32, 32, 32, 30, 27, 26, 26, 26, 24, 23, 22, + 22, 22, 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, + 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 34, 33, + 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 33, 32, 31, 31, + 31, 31, 28, 26, 25, 25, 25, 24, 23, 22, 22, 22, 31, 30, 30, 29, 29, 29, + 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 26, 24, 23, 23, 23, 23, 22, 22, + 22, 22, 28, 28, 27, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 27, 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, + 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, + 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 26, 26, 26, 25, 25, 25, + 24, 22, 22, 22, 22, 21, 21, 21, 21, 21, 25, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, + 22, 21, 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, + 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 19, 19, 19, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33, + 33, 32, 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 28, 27, 27, 26, + 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + /* Size 
16x4 */ + 33, 33, 28, 21, 33, 33, 27, 22, 33, 33, 27, 22, 33, 32, 26, 22, 33, 32, + 26, 22, 33, 32, 26, 22, 30, 29, 24, 22, 30, 28, 24, 22, 28, 28, 23, 22, + 27, 26, 22, 22, 27, 26, 22, 22, 25, 24, 22, 21, 24, 24, 22, 20, 23, 23, + 22, 20, 21, 22, 21, 19, 21, 22, 21, 19, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, + 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, + 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, + 19, 19, 19, 19, + /* Size 32x8 */ + 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 31, 28, 28, 23, 21, 33, 33, + 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, + 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, + 23, 22, 34, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, + 34, 32, 32, 29, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 33, 31, + 31, 28, 25, 25, 23, 22, 31, 30, 29, 28, 24, 24, 22, 22, 31, 29, 28, 27, + 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, + 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, 28, 27, 26, 24, 22, 22, 22, 22, + 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 
24, 22, 22, 22, 22, 28, 26, + 26, 24, 22, 22, 22, 22, 26, 26, 25, 24, 22, 22, 21, 21, 25, 24, 24, 23, + 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 24, 24, 24, 23, 22, 22, + 21, 20, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, 22, 22, 20, 20, + 22, 22, 22, 22, 21, 21, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, + 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, + 22, 22, 20, 19 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31, 30, 32, 31, 30, 29, + /* Size 8x8 */ + 33, 33, 33, 33, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, + 31, 31, 30, 29, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, 30, 30, + 29, 28, 31, 31, 31, 31, 29, 29, 28, 27, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 29, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 29, 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 29, 28, 28, 28, 27, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, + 28, 28, 27, 26, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 
32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 
30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, + 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, + 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 28, 27, 27, 27, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, + 26, 26, 30, 30, 30, 30, 
30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, + /* Size 4x8 */ + 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 31, 30, 30, 29, 32, 32, 32, 31, 30, 30, 29, 28, + /* Size 8x4 */ + 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, + 31, 30, 32, 32, 30, 30, 32, 31, 30, 29, 31, 31, 29, 28, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, + 28, 27, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, + 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 30, 30, 30, + 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, + 32, 32, 31, 29, 29, 29, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, + 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, + 28, 27, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, + 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 27, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 28, 28, 28, 27, 27, 26, 26, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 
32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, + 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 29, 29, 29, 29, 29, 28, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, + 29, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, + 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, + 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, + 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 27, 30, 30, 30, 30, 30, 30, + 30, 30, 29, 28, 28, 28, 28, 28, 27, 26, 30, 30, 30, 30, 30, 30, 30, 30, + 29, 28, 28, 28, 28, 28, 27, 26, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 28, 32, 32, 32, 32, 32, 31, 
+ 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, + /* Size 16x4 */ + 33, 33, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 31, 31, 32, 32, 31, 30, + 32, 32, 31, 30, 32, 32, 31, 30, 32, 32, 30, 29, 32, 31, 30, 29, 32, 31, + 30, 29, 31, 31, 29, 28, 30, 30, 28, 28, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, + 28, 28, 27, 27, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 31, 33, 33, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, + 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, + 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, + 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, + 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 31, + 31, 30, 33, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 
30, 30, + 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, + 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, + 31, 29, 29, 29, 32, 32, 31, 31, 31, 29, 29, 28, 32, 32, 31, 31, 30, 29, + 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, + 32, 32, 31, 31, 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 31, 31, + 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, 28, 27, 30, 30, 30, 30, + 29, 28, 28, 27 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 30, 27, 33, 32, 29, 26, 30, 29, 26, 24, 27, 26, 24, 22, + /* Size 8x8 */ + 33, 33, 33, 34, 30, 29, 28, 26, 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, + 33, 33, 29, 28, 26, 25, 34, 33, 33, 32, 29, 28, 26, 24, 30, 30, 29, 29, + 26, 26, 24, 23, 29, 29, 28, 28, 26, 25, 23, 23, 28, 27, 26, 26, 24, 23, + 22, 22, 26, 25, 25, 24, 23, 23, 22, 21, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 29, 29, 28, 26, 26, 26, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, + 29, 27, 26, 26, 25, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, + 26, 26, 25, 24, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, + 25, 24, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, 24, 23, + 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 31, 30, + 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 29, 28, 28, 28, + 28, 27, 27, 27, 25, 25, 25, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, + 24, 24, 24, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, + 23, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, + 22, 22, 21, 21, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 
34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, 25, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, + 28, 27, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 27, 26, 25, + 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, + 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, + 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 29, 28, 27, + 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 27, 26, + 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, + 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, + 29, 29, 29, 28, 28, 26, 26, 26, 26, 26, 26, 25, 24, 24, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, + 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, + 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, + 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, + 28, 28, 27, 26, 26, 26, 26, 25, 25, 24, 24, 24, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 
28, 28, 27, 26, 25, + 25, 25, 25, 24, 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, + 24, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, + 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, + 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, + 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, + 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, + 24, 24, 23, 23, 23, 23, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 26, + 25, 25, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, + 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 28, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 26, 26, 26, 25, 25, 25, 25, 25, + 25, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 
22, 22, 22, 21, 21, + 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, + /* Size 4x8 */ + 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, 32, 32, 28, 28, 26, 24, 29, 28, + 28, 28, 26, 24, 23, 23, 28, 27, 26, 26, 24, 23, 22, 22, + /* Size 8x4 */ + 33, 33, 29, 28, 33, 33, 28, 27, 33, 32, 28, 26, 33, 32, 28, 26, 30, 28, + 26, 24, 29, 28, 24, 23, 27, 26, 23, 22, 25, 24, 23, 22, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 33, 33, 33, 33, 32, 32, + 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 27, 27, 25, 24, 24, 24, 23, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, + 24, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, + 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, + 22, 21, + /* Size 16x8 */ + 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 27, 27, 26, 33, 33, + 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, + 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, + 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 31, 30, 29, 29, 28, 24, 24, 24, + 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 29, 28, + 27, 27, 25, 23, 23, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, + 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 24, 24, 24, 24, 23, 22, + 22, 21, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, + 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 
33, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, + 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, + 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, + 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 25, + 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 26, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, 28, 28, 27, 27, + 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, + 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 21, 21, 21, 21, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 27, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 26, 24, 33, 33, 33, 33, + 33, 33, 33, 32, 31, 29, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 
28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, + 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, + 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27, + 26, 24, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, + 34, 33, 33, 32, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, 34, 33, + 33, 32, 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, + 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, + 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, 32, 31, + 29, 28, 26, 26, 26, 26, 25, 24, 33, 33, 32, 32, 31, 31, 31, 31, 29, 27, + 26, 26, 26, 26, 25, 24, 32, 32, 31, 31, 30, 30, 30, 30, 28, 26, 25, 25, + 25, 25, 24, 23, 31, 31, 30, 29, 29, 29, 29, 29, 28, 26, 24, 24, 24, 24, + 24, 23, 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, + 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, + 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, 29, 29, + 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 30, 29, 28, 28, 28, 28, + 28, 28, 26, 24, 23, 23, 23, 23, 23, 23, 29, 28, 28, 27, 27, 27, 27, 26, + 25, 24, 23, 23, 23, 23, 22, 22, 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, + 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, + 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, + 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, + 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 26, 26, + 26, 25, 25, 25, 25, 24, 24, 23, 22, 22, 22, 22, 22, 21, 26, 25, 25, 24, + 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 21, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 21, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 27, 27, 26, 24, 33, 33, + 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 29, 29, 28, 28, + 28, 28, 28, 27, 26, 26, 26, 
24, 23, 23, 23, 22, 28, 27, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, + /* Size 16x4 */ + 33, 33, 29, 28, 33, 33, 29, 27, 33, 33, 28, 27, 33, 33, 28, 27, 33, 32, + 28, 26, 33, 32, 28, 26, 33, 32, 28, 26, 33, 31, 27, 26, 31, 29, 26, 24, + 30, 28, 26, 24, 30, 28, 26, 24, 28, 27, 24, 23, 27, 26, 23, 22, 27, 26, + 23, 22, 26, 25, 23, 22, 24, 24, 22, 22, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, + 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, + 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 28, 28, + 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, + /* Size 32x8 */ + 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 28, 28, 26, 33, 33, + 33, 33, 31, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, + 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, + 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, 30, 26, 26, 26, + 34, 33, 32, 32, 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, + 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, + 29, 26, 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 32, 31, 30, 30, 28, 25, + 25, 24, 31, 30, 29, 
29, 28, 24, 24, 24, 31, 29, 28, 28, 27, 24, 24, 23, + 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, + 28, 28, 27, 24, 24, 23, 30, 28, 28, 28, 26, 23, 23, 23, 29, 28, 27, 27, + 25, 23, 23, 22, 28, 27, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, + 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, + 28, 26, 26, 26, 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 26, 25, + 24, 24, 23, 22, 22, 22, 24, 24, 24, 24, 23, 22, 22, 21, 24, 24, 24, 24, + 23, 22, 22, 21 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, + /* Size 32x32 */ + 
32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + /* Size 8x4 */ + 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, + 31, 30, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 
32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 30, 30, 30, 30, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, + /* Size 16x4 */ + 33, 33, 33, 32, 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, + 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 30, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 
32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, + 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, + 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, + 32, 32, 31, 30 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 32, 29, 30, 29, 29, 26, + /* Size 8x8 */ + 33, 33, 33, 33, 34, 33, 31, 31, 33, 33, 33, 33, 33, 32, 30, 30, 33, 33, + 33, 33, 33, 32, 30, 30, 33, 33, 33, 33, 33, 32, 29, 29, 34, 33, 33, 33, + 32, 32, 29, 29, 33, 32, 32, 32, 32, 31, 28, 28, 31, 30, 30, 29, 29, 28, + 26, 26, 31, 30, 30, 29, 29, 28, 26, 26, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, + 30, 29, 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, + 29, 29, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, + 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 34, 34, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 30, 28, 28, 28, 28, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 
29, 28, + 27, 26, 26, 26, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 33, + 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 32, 32, 31, 30, + 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, + 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, 30, 29, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, + 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, + 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 29, + 29, 29, 29, 29, 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, + 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 31, 30, 30, 29, 29, 
29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, + 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, + 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, + 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 28, 28, 28, 28, 28, + 28, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 31, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, + 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, + 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, + 26, 26, 26, 26, 26, 26, 31, 
31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, + 26, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 30, 30, 33, 33, 33, 32, 32, 31, 28, 28, 33, 33, + 33, 32, 32, 31, 28, 28, 30, 29, 29, 28, 28, 28, 26, 26, + /* Size 8x4 */ + 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, + 32, 28, 33, 31, 31, 28, 30, 28, 28, 26, 30, 28, 28, 26, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 31, 29, 28, 28, 28, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 27, 27, 27, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 25, + 25, 25, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, + 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, + 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, + 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, 29, 27, + 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 33, 32, + 31, 31, 31, 31, 28, 26, 31, 30, 30, 29, 29, 29, 28, 26, 31, 30, 29, 28, + 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, + 27, 25, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 32, 31, 30, 30, + 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, + 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, + 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, + 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, 29, 29, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 24, 28, 28, 28, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, + 24, 24, 24, 24, 24, 24, 24, 23, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 28, 28, 33, 
33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 31, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, + 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, + 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 31, 30, 28, 28, 26, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 28, 28, 26, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 31, 30, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, + 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, + 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 33, 33, 33, 32, 32, 31, + 31, 31, 31, 31, 31, 30, 29, 28, 27, 26, 33, 32, 32, 31, 31, 31, 31, 31, + 31, 31, 31, 29, 28, 28, 26, 25, 32, 32, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 29, 28, 27, 26, 25, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, + 28, 26, 26, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, + 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, + 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, + 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, + 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28, + 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 30, 30, 29, 29, 28, 28, 28, 28, + 28, 28, 28, 27, 26, 26, 24, 23, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, + 
33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 30, 30, 29, 29, 29, 29, + 28, 28, 28, 28, 28, 28, 26, 26, 26, 26, + /* Size 16x4 */ + 33, 33, 33, 30, 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 33, + 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, 32, 28, 33, 32, 32, 28, + 33, 32, 32, 28, 33, 32, 32, 28, 32, 31, 31, 28, 31, 29, 29, 26, 30, 28, + 28, 26, 30, 28, 28, 26, 30, 28, 28, 26, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, + 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, + 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 26, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 25, 25, 25, + 25, 25, 25, 24, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 29, 33, 33, + 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, + 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, + 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, + 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, + 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, 30, 28, 33, 33, 33, 
32, + 32, 32, 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, + 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, + 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, + 32, 32, 32, 32, 29, 27, 33, 33, 32, 31, 31, 31, 29, 27, 33, 32, 31, 31, + 31, 31, 28, 26, 32, 31, 30, 30, 30, 30, 28, 26, 31, 30, 30, 29, 29, 29, + 28, 26, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, + 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, + 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 30, 29, 28, 28, + 28, 28, 26, 24 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 
32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 
32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 
33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 34, 33, 32, 32, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 
33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 
32, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, + 32, 32, 33, 33, 32, 32, 34, 33, 32, 32, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, + 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, + 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, + 32, 32, 32, 32 }, + }, +}; \ No newline at end of file diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h new file mode 100644 index 0000000000..8f36eb105b --- /dev/null +++ b/third_party/aom/av1/common/quant_common.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_ +#define AOM_AV1_COMMON_QUANT_COMMON_H_ + +#include <stdbool.h> +#include "aom/aom_codec.h" +#include "av1/common/seg_common.h" +#include "av1/common/enums.h" +#include "av1/common/entropy.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MINQ 0 +#define MAXQ 255 +#define QINDEX_RANGE (MAXQ - MINQ + 1) +#define QINDEX_BITS 8 +// Total number of QM sets stored +#define QM_LEVEL_BITS 4 +#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS) +/* Range of QMS is between first and last value, with offset applied to inter + * blocks */ +#define DEFAULT_QM_Y 10 +#define DEFAULT_QM_U 11 +#define DEFAULT_QM_V 12 +#define DEFAULT_QM_FIRST 5 +#define DEFAULT_QM_LAST 9 +#define LOSSLESS_Q_STEP 4 // this should be equal to dc/ac_qlookup_QTX[0] + +struct AV1Common; +struct CommonQuantParams; +struct macroblockd; + +int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); +int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); + +int av1_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex); + +// Returns true if we are using quantization matrix. +bool av1_use_qmatrix(const struct CommonQuantParams *quant_params, + const struct macroblockd *xd, int segment_id); + +// Reduce the large number of quantizers to a smaller number of levels for which +// different matrices may be defined +static INLINE int aom_get_qmlevel(int qindex, int first, int last) { + return first + (qindex * (last + 1 - first)) / QINDEX_RANGE; +} + +// Initialize all global quant/dequant matrices. +void av1_qm_init(struct CommonQuantParams *quant_params, int num_planes); + +// Get global dequant matrix. +const qm_val_t *av1_iqmatrix(const struct CommonQuantParams *quant_params, + int qmlevel, int plane, TX_SIZE tx_size); +// Get global quant matrix. 
+const qm_val_t *av1_qmatrix(const struct CommonQuantParams *quant_params, + int qmlevel, int plane, TX_SIZE tx_size); + +// Get either local / global dequant matrix as appropriate. +const qm_val_t *av1_get_iqmatrix(const struct CommonQuantParams *quant_params, + const struct macroblockd *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type); +// Get either local / global quant matrix as appropriate. +const qm_val_t *av1_get_qmatrix(const struct CommonQuantParams *quant_params, + const struct macroblockd *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_QUANT_COMMON_H_ diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c new file mode 100644 index 0000000000..602fab7237 --- /dev/null +++ b/third_party/aom/av1/common/reconinter.c @@ -0,0 +1,1169 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stdio.h> +#include <limits.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_ports/aom_once.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/mvref_common.h" +#include "av1/common/obmc.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" + +// This function will determine whether or not to create a warped +// prediction. 
+static int allow_warp(const MB_MODE_INFO *const mbmi, + const WarpTypesAllowed *const warp_types, + const WarpedMotionParams *const gm_params, + int build_for_obmc, const struct scale_factors *const sf, + WarpedMotionParams *final_warp_params) { + // Note: As per the spec, we must test the fixed point scales here, which are + // at a higher precision (1 << 14) than the xs and ys in subpel_params (that + // have 1 << 10 precision). + if (av1_is_scaled(sf)) return 0; + + if (final_warp_params != NULL) *final_warp_params = default_warp_params; + + if (build_for_obmc) return 0; + + if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) { + if (final_warp_params != NULL) + memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params)); + return 1; + } else if (warp_types->global_warp_allowed && !gm_params->invalid) { + if (final_warp_params != NULL) + memcpy(final_warp_params, gm_params, sizeof(*final_warp_params)); + return 1; + } + + return 0; +} + +void av1_init_warp_params(InterPredParams *inter_pred_params, + const WarpTypesAllowed *warp_types, int ref, + const MACROBLOCKD *xd, const MB_MODE_INFO *mi) { + if (inter_pred_params->block_height < 8 || inter_pred_params->block_width < 8) + return; + + if (xd->cur_frame_force_integer_mv) return; + + if (allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0, + inter_pred_params->scale_factors, + &inter_pred_params->warp_params)) { + inter_pred_params->mode = WARP_PRED; + } +} + +void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params) { + assert(IMPLIES(inter_pred_params->conv_params.is_compound, + inter_pred_params->conv_params.dst != NULL)); + + if (inter_pred_params->mode == TRANSLATION_PRED) { +#if CONFIG_AV1_HIGHBITDEPTH + if (inter_pred_params->use_hbd_buf) { + highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, + inter_pred_params->block_width, + 
inter_pred_params->block_height, + &inter_pred_params->conv_params, + inter_pred_params->interp_filter_params, + inter_pred_params->bit_depth); + } else { + inter_predictor(src, src_stride, dst, dst_stride, subpel_params, + inter_pred_params->block_width, + inter_pred_params->block_height, + &inter_pred_params->conv_params, + inter_pred_params->interp_filter_params); + } +#else + inter_predictor(src, src_stride, dst, dst_stride, subpel_params, + inter_pred_params->block_width, + inter_pred_params->block_height, + &inter_pred_params->conv_params, + inter_pred_params->interp_filter_params); +#endif + } + // TODO(jingning): av1_warp_plane() can be further cleaned up. + else if (inter_pred_params->mode == WARP_PRED) { + av1_warp_plane( + &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf, + inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0, + inter_pred_params->ref_frame_buf.width, + inter_pred_params->ref_frame_buf.height, + inter_pred_params->ref_frame_buf.stride, dst, + inter_pred_params->pix_col, inter_pred_params->pix_row, + inter_pred_params->block_width, inter_pred_params->block_height, + dst_stride, inter_pred_params->subsampling_x, + inter_pred_params->subsampling_y, &inter_pred_params->conv_params); + } else { + assert(0 && "Unsupported inter_pred_params->mode"); + } +} + +static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, + 37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; +static const uint8_t wedge_master_oblique_even[MASK_MASTER_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27, + 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; +static const uint8_t 
wedge_master_vertical[MASK_MASTER_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21, + 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; + +static AOM_INLINE void shift_copy(const uint8_t *src, uint8_t *dst, int shift, + int width) { + if (shift >= 0) { + memcpy(dst + shift, src, width - shift); + memset(dst, src[0], shift); + } else { + shift = -shift; + memcpy(dst, src + shift, width - shift); + memset(dst + width - shift, src[width - 1], shift); + } +} + +/* clang-format off */ +DECLARE_ALIGNED(16, static uint8_t, + wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used +}; +/* clang-format on */ + +// [negative][direction] +DECLARE_ALIGNED( + 16, static uint8_t, + wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_MASTER_SIZE * MASK_MASTER_SIZE]); + +// 4 * MAX_WEDGE_SQUARE is an easy-to-compute and fairly tight upper bound +// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE. +DECLARE_ALIGNED(16, static uint8_t, + wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]); + +DECLARE_ALIGNED(16, static uint8_t, + smooth_interintra_mask_buf[INTERINTRA_MODES][BLOCK_SIZES_ALL] + [MAX_WEDGE_SQUARE]); + +static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2]; + +static const wedge_code_type wedge_codebook_16_hgtw[16] = { + { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, + { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 }, + { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, +}; + +static const wedge_code_type wedge_codebook_16_hltw[16] = { + { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, + { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 }, + { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, +}; + +static const wedge_code_type wedge_codebook_16_heqw[16] = { + { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, + { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 }, + { WEDGE_VERTICAL, 
2, 4 }, { WEDGE_VERTICAL, 6, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, +}; + +const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL] = { + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], + wedge_masks[BLOCK_8X8] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], + wedge_masks[BLOCK_8X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], + wedge_masks[BLOCK_16X8] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], + wedge_masks[BLOCK_16X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], + wedge_masks[BLOCK_16X32] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], + wedge_masks[BLOCK_32X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], + wedge_masks[BLOCK_32X32] }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], + wedge_masks[BLOCK_8X32] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], + wedge_masks[BLOCK_32X8] }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, +}; + +static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, + BLOCK_SIZE sb_type) { + const uint8_t *master; + const int bh = block_size_high[sb_type]; + const int bw = block_size_wide[sb_type]; + const wedge_code_type *a = + av1_wedge_params_lookup[sb_type].codebook + wedge_index; + int woff, hoff; + 
const uint8_t wsignflip = + av1_wedge_params_lookup[sb_type].signflip[wedge_index]; + + assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type)); + woff = (a->x_offset * bw) >> 3; + hoff = (a->y_offset * bh) >> 3; + master = wedge_mask_obl[neg ^ wsignflip][a->direction] + + MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) + + MASK_MASTER_SIZE / 2 - woff; + return master; +} + +const uint8_t *av1_get_compound_type_mask( + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) { + (void)sb_type; + switch (comp_data->type) { + case COMPOUND_WEDGE: + return av1_get_contiguous_soft_mask(comp_data->wedge_index, + comp_data->wedge_sign, sb_type); + default: return comp_data->seg_mask; + } +} + +static AOM_INLINE void diffwtd_mask_d16( + uint8_t *mask, int which_inverse, int mask_base, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + int round = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + int i, j, m, diff; + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + diff = abs(src0[i * src0_stride + j] - src1[i * src1_stride + j]); + diff = ROUND_POWER_OF_TWO(diff, round); + m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); + mask[i * w + j] = which_inverse ? 
AOM_BLEND_A64_MAX_ALPHA - m : m; + } + } +} + +void av1_build_compound_diffwtd_mask_d16_c( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask_d16(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w, + conv_params, bd); + break; + case DIFFWTD_38_INV: + diffwtd_mask_d16(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w, + conv_params, bd); + break; + default: assert(0); + } +} + +static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse, + int mask_base, const uint8_t *src0, + int src0_stride, const uint8_t *src1, + int src1_stride, int h, int w) { + int i, j, m, diff; + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + diff = + abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]); + m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); + mask[i * w + j] = which_inverse ? 
AOM_BLEND_A64_MAX_ALPHA - m : m; + } + } +} + +void av1_build_compound_diffwtd_mask_c(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w); + break; + case DIFFWTD_38_INV: + diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w); + break; + default: assert(0); + } +} + +static AOM_FORCE_INLINE void diffwtd_mask_highbd( + uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0, + int src0_stride, const uint16_t *src1, int src1_stride, int h, int w, + const unsigned int bd) { + assert(bd >= 8); + if (bd == 8) { + if (which_inverse) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } + } else { + const unsigned int bd_shift = bd - 8; + if (which_inverse) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = + (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = + (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; + 
unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } + } +} + +void av1_build_compound_diffwtd_mask_highbd_c( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, + CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); + break; + case DIFFWTD_38_INV: + diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, + CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); + break; + default: assert(0); + } +} + +static AOM_INLINE void init_wedge_master_masks(void) { + int i, j; + const int w = MASK_MASTER_SIZE; + const int h = MASK_MASTER_SIZE; + const int stride = MASK_MASTER_STRIDE; + // Note: index [0] stores the masters, and [1] its complement. + // Generate prototype by shifting the masters + int shift = h / 4; + for (i = 0; i < h; i += 2) { + shift_copy(wedge_master_oblique_even, + &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift, + MASK_MASTER_SIZE); + shift--; + shift_copy(wedge_master_oblique_odd, + &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift, + MASK_MASTER_SIZE); + memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride], + wedge_master_vertical, + MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); + memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride], + wedge_master_vertical, + MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); + } + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j]; + wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk; + wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = + wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - msk; + 
wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] = + wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - msk; + wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = + wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk; + const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j]; + wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx; + wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] = + wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - mskx; + } + } +} + +static AOM_INLINE void init_wedge_masks(void) { + uint8_t *dst = wedge_mask_buf; + BLOCK_SIZE bsize; + memset(wedge_masks, 0, sizeof(wedge_masks)); + for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) { + const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize]; + const int wtypes = wedge_params->wedge_types; + if (wtypes == 0) continue; + const uint8_t *mask; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + int w; + for (w = 0; w < wtypes; ++w) { + mask = get_wedge_mask_inplace(w, 0, bsize); + aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw, + bh); + wedge_params->masks[0][w] = dst; + dst += bw * bh; + + mask = get_wedge_mask_inplace(w, 1, bsize); + aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw, + bh); + wedge_params->masks[1][w] = dst; + dst += bw * bh; + } + assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf)); + } +} + +/* clang-format off */ +static const uint8_t ii_weights1d[MAX_SB_SIZE] = { + 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4, + 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 
1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; +static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = { + 32, 16, 16, 16, 8, 8, 8, 4, + 4, 4, 2, 2, 2, 1, 1, 1, + 8, 8, 4, 4, 2, 2 +}; +/* clang-format on */ + +static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride, + BLOCK_SIZE plane_bsize, + INTERINTRA_MODE mode) { + int i, j; + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const int size_scale = ii_size_scales[plane_bsize]; + + switch (mode) { + case II_V_PRED: + for (i = 0; i < bh; ++i) { + memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0])); + mask += stride; + } + break; + + case II_H_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale]; + mask += stride; + } + break; + + case II_SMOOTH_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) + mask[j] = ii_weights1d[(i < j ? i : j) * size_scale]; + mask += stride; + } + break; + + case II_DC_PRED: + default: + for (i = 0; i < bh; ++i) { + memset(mask, 32, bw * sizeof(mask[0])); + mask += stride; + } + break; + } +} + +static AOM_INLINE void init_smooth_interintra_masks(void) { + for (int m = 0; m < INTERINTRA_MODES; ++m) { + for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) { + const int bw = block_size_wide[bs]; + const int bh = block_size_high[bs]; + if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue; + build_smooth_interintra_mask(smooth_interintra_mask_buf[m][bs], bw, bs, + m); + } + } +} + +// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0 +static void init_all_wedge_masks(void) { + init_wedge_master_masks(); + init_wedge_masks(); + init_smooth_interintra_masks(); +} + +void av1_init_wedge_masks(void) { aom_once(init_all_wedge_masks); } + +static AOM_INLINE void build_masked_compound_no_round( + uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, + const 
INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w, InterPredParams *inter_pred_params) { + const int ssy = inter_pred_params->subsampling_y; + const int ssx = inter_pred_params->subsampling_x; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + const int mask_stride = block_size_wide[sb_type]; +#if CONFIG_AV1_HIGHBITDEPTH + if (inter_pred_params->use_hbd_buf) { + aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, + ssy, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } else { + aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); + } +#else + aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); +#endif +} + +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params) { + const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp; + BLOCK_SIZE sb_type = inter_pred_params->sb_type; + + // We're going to call av1_make_inter_predictor to generate a prediction into + // a temporary buffer, then will blend that temporary buffer with that from + // the other reference. + DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]); + uint8_t *tmp_dst = + inter_pred_params->use_hbd_buf ? 
CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf; + + const int tmp_buf_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst; + int org_dst_stride = inter_pred_params->conv_params.dst_stride; + CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf; + inter_pred_params->conv_params.dst = tmp_buf16; + inter_pred_params->conv_params.dst_stride = tmp_buf_stride; + assert(inter_pred_params->conv_params.do_average == 0); + + // This will generate a prediction in tmp_buf for the second reference + av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, + inter_pred_params, subpel_params); + + if (!inter_pred_params->conv_params.plane && + comp_data->type == COMPOUND_DIFFWTD) { + av1_build_compound_diffwtd_mask_d16( + comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride, + tmp_buf16, tmp_buf_stride, inter_pred_params->block_height, + inter_pred_params->block_width, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } + build_masked_compound_no_round( + dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride, + comp_data, sb_type, inter_pred_params->block_height, + inter_pred_params->block_width, inter_pred_params); +} + +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int *fwd_offset, + int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound) { + assert(fwd_offset != NULL && bck_offset != NULL); + if (!is_compound || mbmi->compound_idx) { + *fwd_offset = 8; + *bck_offset = 8; + *use_dist_wtd_comp_avg = 0; + return; + } + + *use_dist_wtd_comp_avg = 1; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); + const int cur_frame_index = cm->cur_frame->order_hint; + int bck_frame_index = 0, fwd_frame_index = 0; + + if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; + if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; + + int d0 = 
clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info, + fwd_frame_index, cur_frame_index)), + 0, MAX_FRAME_DISTANCE); + int d1 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info, + cur_frame_index, bck_frame_index)), + 0, MAX_FRAME_DISTANCE); + + const int order = d0 <= d1; + + if (d0 == 0 || d1 == 0) { + *fwd_offset = quant_dist_lookup_table[3][order]; + *bck_offset = quant_dist_lookup_table[3][1 - order]; + return; + } + + int i; + for (i = 0; i < 3; ++i) { + int c0 = quant_dist_weight[i][order]; + int c1 = quant_dist_weight[i][!order]; + int d0_c0 = d0 * c0; + int d1_c1 = d1 * c1; + if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break; + } + + *fwd_offset = quant_dist_lookup_table[i][order]; + *bck_offset = quant_dist_lookup_table[i][1 - order]; +} + +void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const int plane_start, const int plane_end) { + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. + for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) { + struct macroblockd_plane *const pd = &planes[i]; + const int is_uv = i > 0; + setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv], + src->crop_heights[is_uv], src->strides[is_uv], mi_row, + mi_col, NULL, pd->subsampling_x, pd->subsampling_y); + } +} + +void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf, + const int num_planes) { + if (src != NULL) { + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. 
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + const int is_uv = i > 0; + setup_pred_plane(&pd->pre[idx], xd->mi[0]->bsize, src->buffers[i], + src->crop_widths[is_uv], src->crop_heights[is_uv], + src->strides[is_uv], mi_row, mi_col, sf, + pd->subsampling_x, pd->subsampling_y); + } + } +} + +// obmc_mask_N[overlap_position] +static const uint8_t obmc_mask_1[1] = { 64 }; +DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = { 45, 64 }; + +DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = { 39, 50, 59, 64 }; + +static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 }; + +static const uint8_t obmc_mask_16[16] = { 34, 37, 40, 43, 46, 49, 52, 54, + 56, 58, 60, 61, 64, 64, 64, 64 }; + +static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44, + 45, 47, 48, 50, 51, 52, 53, 55, + 56, 57, 58, 59, 60, 60, 61, 62, + 64, 64, 64, 64, 64, 64, 64, 64 }; + +static const uint8_t obmc_mask_64[64] = { + 33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44, + 45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56, + 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62, + 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; + +const uint8_t *av1_get_obmc_mask(int length) { + switch (length) { + case 1: return obmc_mask_1; + case 2: return obmc_mask_2; + case 4: return obmc_mask_4; + case 8: return obmc_mask_8; + case 16: return obmc_mask_16; + case 32: return obmc_mask_32; + case 64: return obmc_mask_64; + default: assert(0); return NULL; + } +} + +static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *mi, void *fun_ctxt, + const int num_planes) { + (void)xd; + (void)rel_mi_row; + (void)rel_mi_col; + (void)op_mi_size; + (void)dir; + (void)mi; + ++*(uint8_t *)fun_ctxt; + (void)num_planes; +} + +void av1_count_overlappable_neighbors(const AV1_COMMON 
*cm, MACROBLOCKD *xd) { + MB_MODE_INFO *mbmi = xd->mi[0]; + + mbmi->overlappable_neighbors = 0; + + if (!is_motion_variation_allowed_bsize(mbmi->bsize)) return; + + foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr, + &mbmi->overlappable_neighbors); + if (mbmi->overlappable_neighbors) return; + foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr, + &mbmi->overlappable_neighbors); +} + +// HW does not support < 4x4 prediction. To limit the bandwidth requirement, if +// block-size of current plane is smaller than 8x8, always only blend with the +// left neighbor(s) (skip blending with the above side). +#define DISABLE_CHROMA_U8X8_OBMC 0 // 0: one-sided obmc; 1: disable + +int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd, int dir) { + assert(is_motion_variation_allowed_bsize(bsize)); + + const BLOCK_SIZE bsize_plane = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + switch (bsize_plane) { +#if DISABLE_CHROMA_U8X8_OBMC + case BLOCK_4X4: + case BLOCK_8X4: + case BLOCK_4X8: return 1; +#else + case BLOCK_4X4: + case BLOCK_8X4: + case BLOCK_4X8: return dir == 0; +#endif + default: return 0; + } +} + +void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) { + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; +} + +struct obmc_inter_pred_ctxt { + uint8_t **adjacent; + int *adjacent_stride; +}; + +static INLINE void build_obmc_inter_pred_above( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) { + (void)above_mi; + (void)rel_mi_row; + (void)dir; + struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + const int overlap = + AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; + + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = 
&xd->plane[plane]; + const int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; + const int bh = overlap >> pd->subsampling_y; + const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; + + const int dst_stride = pd->dst.stride; + uint8_t *const dst = &pd->dst.buf[plane_col]; + const int tmp_stride = ctxt->adjacent_stride[plane]; + const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col]; + const uint8_t *const mask = av1_get_obmc_mask(bh); +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); + if (is_hbd) + aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, + tmp_stride, mask, bw, bh, xd->bd); + else + aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, + mask, bw, bh); +#else + aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif + } +} + +static INLINE void build_obmc_inter_pred_left( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) { + (void)left_mi; + (void)rel_mi_col; + (void)dir; + struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + const int overlap = + AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; + + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const int bw = overlap >> pd->subsampling_x; + const int bh = (op_mi_size * MI_SIZE) >> pd->subsampling_y; + const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + + const int dst_stride = pd->dst.stride; + uint8_t *const dst = &pd->dst.buf[plane_row * dst_stride]; + const int tmp_stride = ctxt->adjacent_stride[plane]; + const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride]; + const uint8_t *const mask = 
av1_get_obmc_mask(bw); + +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); + if (is_hbd) + aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, + tmp_stride, mask, bw, bh, xd->bd); + else + aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, + mask, bw, bh); +#else + aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif + } +} + +// This function combines motion compensated predictions that are generated by +// top/left neighboring blocks' inter predictors with the regular inter +// prediction. We assume the original prediction (bmc) is stored in +// xd->plane[].dst.buf +void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *above[MAX_MB_PLANE], + int above_stride[MAX_MB_PLANE], + uint8_t *left[MAX_MB_PLANE], + int left_stride[MAX_MB_PLANE]) { + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + + // handle above row + struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride }; + foreach_overlappable_nb_above(cm, xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + build_obmc_inter_pred_above, &ctxt_above); + + // handle left column + struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride }; + foreach_overlappable_nb_left(cm, xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + build_obmc_inter_pred_left, &ctxt_left); +} + +void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1, + uint8_t **dst_buf2) { + if (is_cur_buf_hbd(xd)) { + int len = sizeof(uint16_t); + dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); + dst_buf1[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len); + dst_buf1[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len); + dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]); + dst_buf2[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len); + dst_buf2[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); + } else { + dst_buf1[0] 
= xd->tmp_obmc_bufs[0]; + dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; + dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; + dst_buf2[0] = xd->tmp_obmc_bufs[1]; + dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; + dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; + } +} + +void av1_setup_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->bsize); + const int above_mi_col = xd->mi_col + rel_mi_col; + + av1_modify_neighbor_predictor_for_obmc(above_mbmi); + + for (int j = 0; j < num_planes; ++j) { + struct macroblockd_plane *const pd = &xd->plane[j]; + setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], + ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const int num_refs = 1 + has_second_ref(above_mbmi); + + for (int ref = 0; ref < num_refs; ++ref) { + const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + xd->block_ref_scale_factors[ref] = sf; + if ((!av1_is_valid_scale(sf))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf, + num_planes); + } + + xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col); + xd->mb_to_right_edge = + ctxt->mb_to_far_edge + + (xd->width - rel_mi_col - above_mi_width) * MI_SIZE * 8; +} + +void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, + uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->bsize); + 
const int left_mi_row = xd->mi_row + rel_mi_row; + + av1_modify_neighbor_predictor_for_obmc(left_mbmi); + + for (int j = 0; j < num_planes; ++j) { + struct macroblockd_plane *const pd = &xd->plane[j]; + setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], + ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const int num_refs = 1 + has_second_ref(left_mbmi); + + for (int ref = 0; ref < num_refs; ++ref) { + const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const ref_scale_factors = + get_ref_scale_factors_const(ctxt->cm, frame); + + xd->block_ref_scale_factors[ref] = ref_scale_factors; + if ((!av1_is_valid_scale(ref_scale_factors))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col, + ref_scale_factors, num_planes); + } + + xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row)); + xd->mb_to_bottom_edge = + ctxt->mb_to_far_edge + + GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE); +} + +static AOM_INLINE void combine_interintra( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, + uint8_t *comppred, int compstride, const uint8_t *interpred, + int interstride, const uint8_t *intrapred, int intrastride) { + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + if (use_wedge_interintra) { + if (av1_is_wedge_used(bsize)) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + const int subw = 2 * mi_size_wide[bsize] == bw; + const int subh = 2 * mi_size_high[bsize] == bh; + aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, + interpred, interstride, 
mask, block_size_wide[bsize], + bw, bh, subw, subh); + } + return; + } + + const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize]; + aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred, + interstride, mask, bw, bw, bh, 0, 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void combine_interintra_highbd( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, + uint8_t *comppred8, int compstride, const uint8_t *interpred8, + int interstride, const uint8_t *intrapred8, int intrastride, int bd) { + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + if (use_wedge_interintra) { + if (av1_is_wedge_used(bsize)) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + const int subh = 2 * mi_size_high[bsize] == bh; + const int subw = 2 * mi_size_wide[bsize] == bw; + aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride, + interpred8, interstride, mask, + block_size_wide[bsize], bw, bh, subw, subh, bd); + } + return; + } + + uint8_t mask[MAX_SB_SQUARE]; + build_smooth_interintra_mask(mask, bw, plane_bsize, mode); + aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride, + interpred8, interstride, mask, bw, bw, bh, 0, 0, + bd); +} +#endif + +void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, + MACROBLOCKD *xd, + BLOCK_SIZE bsize, int plane, + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); + PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode]; + assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0); + assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0); + 
assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0); + assert(xd->mi[0]->use_intrabc == 0); + const SequenceHeader *seq_params = cm->seq_params; + + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, pd->width, + pd->height, max_txsize_rect_lookup[plane_bsize], mode, + 0, 0, FILTER_INTRA_MODES, ctx->plane[plane], + ctx->stride[plane], dst, dst_stride, 0, 0, plane); +} + +void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, + const uint8_t *inter_pred, int inter_stride, + const uint8_t *intra_pred, int intra_stride) { + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + combine_interintra_highbd( + xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + inter_pred, inter_stride, intra_pred, intra_stride, xd->bd); + return; + } +#endif + combine_interintra( + xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + inter_pred, inter_stride, intra_pred, intra_stride); +} + +// build interintra_predictors for one plane +void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, + const BUFFER_SET *ctx, int plane, + BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + if (is_cur_buf_hbd(xd)) { + DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); + av1_build_intra_predictors_for_interintra( + cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor), + MAX_SB_SIZE); + av1_combine_interintra(xd, bsize, plane, pred, stride, + CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE); + } else 
{ + DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]); + av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx, + intrapredictor, MAX_SB_SIZE); + av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor, + MAX_SB_SIZE); + } +} diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h new file mode 100644 index 0000000000..c31f4531e2 --- /dev/null +++ b/third_party/aom/av1/common/reconinter.h @@ -0,0 +1,489 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_RECONINTER_H_ +#define AOM_AV1_COMMON_RECONINTER_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/warped_motion.h" +#include "aom/aom_integer.h" + +// Work out how many pixels off the edge of a reference frame we're allowed +// to go when forming an inter prediction. +// The outermost row/col of each reference frame is extended by +// (AOM_BORDER_IN_PIXELS >> subsampling) pixels, but we need to keep +// at least AOM_INTERP_EXTEND pixels within that to account for filtering. +// +// We have to break this up into two macros to keep both clang-format and +// tools/lint-hunks.py happy. 
// Margin, in whole pixels, by which a prediction may reach beyond the left/top
// edge of a reference plane: the border shifted down by the plane's
// subsampling, minus the AOM_INTERP_EXTEND pixels kept back for filtering.
// The argument is parenthesized so that any expression (e.g. `ss & 1`) is
// used as the shift count as a whole.
#define AOM_LEFT_TOP_MARGIN_PX(subsampling) \
  ((AOM_BORDER_IN_PIXELS >> (subsampling)) - AOM_INTERP_EXTEND)
// Same margin expressed in 1/(1 << SCALE_SUBPEL_BITS) pixel units.
#define AOM_LEFT_TOP_MARGIN_SCALED(subsampling) \
  (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
// Parameters describing one inter prediction of one block in one plane.
// Filled in by av1_init_inter_params() and then refined by the caller.
typedef struct InterPredParams {
  // init_inter_block_params() sets this to TRANSLATION_PRED.
  InterPredMode mode;
  // Defaults to UNIFORM_SINGLE; av1_init_comp_mode() switches it to
  // UNIFORM_COMP.
  InterCompMode comp_mode;
  WarpedMotionParams warp_params;
  ConvolveParams conv_params;
  // [0] is chosen from the x filter and block width, [1] from the y filter and
  // block height; both are the intrabc filter when is_intrabc is set.
  const InterpFilterParams *interp_filter_params[2];
  int block_width;
  int block_height;
  // Block position; init_subpel_params() scales these by SUBPEL_BITS and adds
  // the motion vector to obtain the reference position.
  int pix_row;
  int pix_col;
  // Copy of the reference buffer descriptor passed to av1_init_inter_params().
  struct buf_2d ref_frame_buf;
  int subsampling_x;
  int subsampling_y;
  const struct scale_factors *scale_factors;
  int bit_depth;
  int use_hbd_buf;
  INTERINTER_COMPOUND_DATA mask_comp;
  BLOCK_SIZE sb_type;
  int is_intrabc;
  // Lower clamp bounds for pos_y / pos_x in init_subpel_params(); set to
  // -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_y / subsampling_x).
  int top;
  int left;
} InterPredParams;
+ subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; + subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; + subpel_params->xs = sf->x_step_q4; + subpel_params->ys = sf->y_step_q4; +} + +// Initialize interp filter required for inter prediction. +static AOM_INLINE void init_interp_filter_params( + const InterpFilterParams *interp_filter_params[2], + const InterpFilters *filter, int block_width, int block_height, + int is_intrabc) { + if (UNLIKELY(is_intrabc)) { + interp_filter_params[0] = &av1_intrabc_filter_params; + interp_filter_params[1] = &av1_intrabc_filter_params; + } else { + interp_filter_params[0] = av1_get_interp_filter_params_with_block_size( + (InterpFilter)filter->x_filter, block_width); + interp_filter_params[1] = av1_get_interp_filter_params_with_block_size( + (InterpFilter)filter->y_filter, block_height); + } +} + +// Initialize parameters required for inter prediction at mode level. +static AOM_INLINE void init_inter_mode_params( + const MV *const src_mv, InterPredParams *const inter_pred_params, + SubpelParams *subpel_params, const struct scale_factors *sf, int width, + int height) { + inter_pred_params->scale_factors = sf; + init_subpel_params(src_mv, inter_pred_params, subpel_params, width, height); +} + +// Initialize parameters required for inter prediction at block level. 
+static AOM_INLINE void init_inter_block_params( + InterPredParams *inter_pred_params, int block_width, int block_height, + int pix_row, int pix_col, int subsampling_x, int subsampling_y, + int bit_depth, int use_hbd_buf, int is_intrabc) { + inter_pred_params->block_width = block_width; + inter_pred_params->block_height = block_height; + inter_pred_params->pix_row = pix_row; + inter_pred_params->pix_col = pix_col; + inter_pred_params->subsampling_x = subsampling_x; + inter_pred_params->subsampling_y = subsampling_y; + inter_pred_params->bit_depth = bit_depth; + inter_pred_params->use_hbd_buf = use_hbd_buf; + inter_pred_params->is_intrabc = is_intrabc; + inter_pred_params->mode = TRANSLATION_PRED; + inter_pred_params->comp_mode = UNIFORM_SINGLE; + inter_pred_params->top = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_y); + inter_pred_params->left = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_x); +} + +// Initialize params required for inter prediction. +static AOM_INLINE void av1_init_inter_params( + InterPredParams *inter_pred_params, int block_width, int block_height, + int pix_row, int pix_col, int subsampling_x, int subsampling_y, + int bit_depth, int use_hbd_buf, int is_intrabc, + const struct scale_factors *sf, const struct buf_2d *ref_buf, + int_interpfilters interp_filters) { + init_inter_block_params(inter_pred_params, block_width, block_height, pix_row, + pix_col, subsampling_x, subsampling_y, bit_depth, + use_hbd_buf, is_intrabc); + init_interp_filter_params(inter_pred_params->interp_filter_params, + &interp_filters.as_filters, block_width, + block_height, is_intrabc); + inter_pred_params->scale_factors = sf; + inter_pred_params->ref_frame_buf = *ref_buf; +} + +static AOM_INLINE void av1_init_comp_mode(InterPredParams *inter_pred_params) { + inter_pred_params->comp_mode = UNIFORM_COMP; +} + +void av1_init_warp_params(InterPredParams *inter_pred_params, + const WarpTypesAllowed *warp_types, int ref, + const MACROBLOCKD *xd, const MB_MODE_INFO *mi); + +static INLINE 
int has_scale(int xs, int ys) { + return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS; +} + +static INLINE void revert_scale_extra_bits(SubpelParams *sp) { + sp->subpel_x >>= SCALE_EXTRA_BITS; + sp->subpel_y >>= SCALE_EXTRA_BITS; + sp->xs >>= SCALE_EXTRA_BITS; + sp->ys >>= SCALE_EXTRA_BITS; + assert(sp->subpel_x < SUBPEL_SHIFTS); + assert(sp->subpel_y < SUBPEL_SHIFTS); + assert(sp->xs <= SUBPEL_SHIFTS); + assert(sp->ys <= SUBPEL_SHIFTS); +} + +static INLINE void inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, int w, int h, + ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2]) { + assert(conv_params->do_average == 0 || conv_params->do_average == 1); + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + if (is_scaled) { + av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params); + } else { + SubpelParams sp = *subpel_params; + revert_scale_extra_bits(&sp); + av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, sp.subpel_y, + sp.ys, 0, conv_params); + } +} + +static INLINE void highbd_inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, int w, int h, + ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2], + int bd) { + assert(conv_params->do_average == 0 || conv_params->do_average == 1); + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + if (is_scaled) { + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, bd); + } else { + SubpelParams sp = *subpel_params; + revert_scale_extra_bits(&sp); + 
av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, + sp.subpel_y, sp.ys, 0, conv_params, bd); + } +} + +void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi); +int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd, int dir); + +static INLINE int is_interinter_compound_used(COMPOUND_TYPE type, + BLOCK_SIZE sb_type) { + const int comp_allowed = is_comp_ref_allowed(sb_type); + switch (type) { + case COMPOUND_AVERAGE: + case COMPOUND_DISTWTD: + case COMPOUND_DIFFWTD: return comp_allowed; + case COMPOUND_WEDGE: + return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0; + default: assert(0); return 0; + } +} + +static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) { + COMPOUND_TYPE comp_type; + int i; + if (!is_comp_ref_allowed(sb_type)) return 0; + for (i = 0; i < COMPOUND_TYPES; i++) { + comp_type = (COMPOUND_TYPE)i; + if (is_masked_compound_type(comp_type) && + is_interinter_compound_used(comp_type, sb_type)) + return 1; + } + return 0; +} + +static INLINE int get_wedge_types_lookup(BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].wedge_types; +} + +static INLINE int av1_is_wedge_used(BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].wedge_types > 0; +} + +void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +// TODO(jkoleszar): yet another mv clamping function :-( +static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, + const MV *src_mv, int bw, int bh, + int ss_x, int ss_y) { + // If the MV points so far into the UMV border that no visible pixels + // are used for reconstruction, the subpel part of the 
MV can be + // discarded and the MV limited to 16 pixels with equivalent results. + const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS; + const int spel_right = spel_left - SUBPEL_SHIFTS; + const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS; + const int spel_bottom = spel_top - SUBPEL_SHIFTS; + MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))), + (int16_t)(src_mv->col * (1 << (1 - ss_x))) }; + assert(ss_x <= 1); + assert(ss_y <= 1); + const SubpelMvLimits mv_limits = { + xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, + xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, + xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, + xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom + }; + + clamp_mv(&clamped_mv, &mv_limits); + + return clamped_mv; +} + +static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, + int stride, + const struct scale_factors *sf) { + int x, y; + if (!sf) { + x = x_offset; + y = y_offset; + } else if (av1_is_scaled(sf)) { + x = av1_scaled_x(x_offset, sf) >> SCALE_EXTRA_BITS; + y = av1_scaled_y(y_offset, sf) >> SCALE_EXTRA_BITS; + } else { + x = av1_unscaled_value(x_offset, sf) >> SCALE_EXTRA_BITS; + y = av1_unscaled_value(y_offset, sf) >> SCALE_EXTRA_BITS; + } + return (int64_t)y * stride + x; +} + +static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, + uint8_t *src, int width, int height, + int stride, int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { + // Offset the buffer pointer + if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) + mi_row -= 1; + if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) + mi_col -= 1; + + const int x = (MI_SIZE * mi_col) >> subsampling_x; + const int y = (MI_SIZE * mi_row) >> subsampling_y; + dst->buf = src + scaled_buffer_offset(x, y, stride, scale); + dst->buf0 = src; + dst->width = width; + dst->height = height; + dst->stride = stride; +} + +void 
av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const int plane_start, const int plane_end); + +void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf, const int num_planes); + +static INLINE void set_default_interp_filters( + MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) { + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(frame_interp_filter)); +} + +static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + if (mbmi->skip_mode) return 0; + if (mbmi->motion_mode == WARPED_CAUSAL) return 0; + if (is_nontrans_global_motion(xd, xd->mi[0])) return 0; + return 1; +} + +// Sets up buffers 'dst_buf1' and 'dst_buf2' from relevant buffers in 'xd' for +// subsequent use in OBMC prediction. +void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1, + uint8_t **dst_buf2); + +void av1_setup_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, + const int num_planes); +void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, + uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes); +void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *above[MAX_MB_PLANE], + int above_stride[MAX_MB_PLANE], + uint8_t *left[MAX_MB_PLANE], + int left_stride[MAX_MB_PLANE]); + +const uint8_t *av1_get_obmc_mask(int length); +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd); + +#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) +#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) + +void av1_init_wedge_masks(void); + +static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, + 
// Combines the inter prediction 'inter_pred' and the intra prediction
// 'intra_pred' for one plane of the current block, using the inter-intra mode
// and wedge settings found in xd->mi[0]. The definition also hands
// xd->plane[plane].dst.buf / dst.stride to the combine helper.
// NOTE(review): presumably that is where the result is written -- confirm
// against combine_interintra().
void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                            const uint8_t *inter_pred, int inter_stride,
                            const uint8_t *intra_pred, int intra_stride);
// True if all of the following hold:
//  1. Not intrabc and not build_for_obmc
//  2. At least one dimension is size 4 with subsampling in that direction
//  3. Every mode-info unit scanned below (the current one, plus the unit
//     above and/or to the left when that dimension is sub-4) is an inter
//     block and is not an intrabc block
static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize,
                            int is_intrabc, int build_for_obmc) {
  if (is_intrabc || build_for_obmc) {
    return false;
  }

  const struct macroblockd_plane *const pd = &xd->plane[plane];
  const int ss_x = pd->subsampling_x;
  const int ss_y = pd->subsampling_y;
  const int is_sub4_x = (block_size_wide[bsize] == 4) && ss_x;
  const int is_sub4_y = (block_size_high[bsize] == 4) && ss_y;
  if (!is_sub4_x && !is_sub4_y) {
    return false;
  }

  // For sub8x8 chroma blocks, we may be covering more than one luma block's
  // worth of pixels, so the scan starts one mode-info unit above and/or to
  // the left of the current one (offset -1) in each sub-4 dimension.
  const int row_start = is_sub4_y ? -1 : 0;
  const int col_start = is_sub4_x ? -1 : 0;

  for (int row = row_start; row <= 0; ++row) {
    for (int col = col_start; col <= 0; ++col) {
      const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
      // Any non-inter or intrabc unit disqualifies the whole group.
      if (!is_inter_block(this_mbmi)) return false;
      if (is_intrabc_block(this_mbmi)) return false;
    }
  }
  return true;
}
// Builds the inter prediction of one plane of block 'mi' into pd->dst, one
// call to build_one_inter_predictor() per reference (two when the block has a
// second reference). The decoder variant (IS_DEC) additionally forwards
// mi_x, mi_y and mc_buf to build_one_inter_predictor().
#if IS_DEC
static AOM_INLINE void build_inter_predictors_8x8_and_bigger(
    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
    int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) {
#else
static AOM_INLINE void build_inter_predictors_8x8_and_bigger(
    const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi,
    int build_for_obmc, int bw, int bh, int mi_x, int mi_y) {
#endif  // IS_DEC
  const int is_compound = has_second_ref(mi);
  const int is_intrabc = is_intrabc_block(mi);
  assert(IMPLIES(is_intrabc, !is_compound));
  struct macroblockd_plane *const pd = &xd->plane[plane];
  struct buf_2d *const dst_buf = &pd->dst;
  uint8_t *const dst = dst_buf->buf;

  // Per reference: whether the block's motion vector is a global-motion one,
  // judged from the global motion type of that reference frame.
  int is_global[2] = { 0, 0 };
  for (int ref = 0; ref < 1 + is_compound; ++ref) {
    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
    is_global[ref] = is_global_mv_block(mi, wm->wmtype);
  }

  const BLOCK_SIZE bsize = mi->bsize;
  const int ss_x = pd->subsampling_x;
  const int ss_y = pd->subsampling_y;
  // For a size-4 dimension with subsampling (and not OBMC) the prediction
  // origin is moved back by one mode-info unit before the subsampling shift.
  const int row_start =
      (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
  const int col_start =
      (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
  const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
  const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;

  for (int ref = 0; ref < 1 + is_compound; ++ref) {
    // Intra block copy predicts from the destination buffer itself with
    // identity scaling; otherwise use the reference's buffer and scale factors.
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
    struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
    const MV mv = mi->mv[ref].as_mv;
    const WarpTypesAllowed warp_types = { is_global[ref],
                                          mi->motion_mode == WARPED_CAUSAL };

    InterPredParams inter_pred_params;
    av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x,
                          pd->subsampling_x, pd->subsampling_y, xd->bd,
                          is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf,
                          mi->interp_filters);
    if (is_compound) av1_init_comp_mode(&inter_pred_params);
    inter_pred_params.conv_params = get_conv_params_no_round(
        ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);

    av1_dist_wtd_comp_weight_assign(
        cm, mi, &inter_pred_params.conv_params.fwd_offset,
        &inter_pred_params.conv_params.bck_offset,
        &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);

    // Warp parameters are only set up for regular (non-OBMC) prediction.
    if (!build_for_obmc)
      av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);

    if (is_masked_compound_type(mi->interinter_comp.type)) {
      inter_pred_params.sb_type = mi->bsize;
      inter_pred_params.mask_comp = mi->interinter_comp;
      // Only the second reference is built in masked-compound mode.
      if (ref == 1) {
        inter_pred_params.conv_params.do_average = 0;
        inter_pred_params.comp_mode = MASK_COMP;
      }
      // Assign physical buffer.
      inter_pred_params.mask_comp.seg_mask = xd->seg_mask;
    }

#if IS_DEC
    build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, xd,
                              mi_x, mi_y, ref, mc_buf);
#else
    build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params);
#endif  // IS_DEC
  }
}
// Bitmask of NEED_* flags for each of the INTRA_MODES prediction modes, in
// the mode order given by the trailing comments.
// NOTE(review): presumably the neighbouring edges each mode reads -- confirm
// against the users of this table.
static const uint8_t extend_modes[INTRA_MODES] = {
  NEED_ABOVE | NEED_LEFT,                   // DC
  NEED_ABOVE,                               // V
  NEED_LEFT,                                // H
  NEED_ABOVE | NEED_ABOVERIGHT,             // D45
  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // D135
  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // D113
  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // D157
  NEED_LEFT | NEED_BOTTOMLEFT,              // D203
  NEED_ABOVE | NEED_ABOVERIGHT,             // D67
  NEED_LEFT | NEED_ABOVE,                   // SMOOTH
  NEED_LEFT | NEED_ABOVE,                   // SMOOTH_V
  NEED_LEFT | NEED_ABOVE,                   // SMOOTH_H
  NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // PAETH
};
// Top-right availability bit tables (one bit per block, raster order, packed
// LSB-first into bytes). They are lookup data only and are reached through
// `const uint8_t *` pointers, so they are declared const.
static const uint8_t has_tr_4x4[128] = {
  255, 255, 255, 255, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
  127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
  255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
  127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
  255, 255, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
  127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
  255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
  127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
};
static const uint8_t has_tr_4x8[64] = {
  255, 255, 255, 255, 119, 119, 119, 119, 127, 127, 127, 127, 119,
  119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127,
  127, 127, 119, 119, 119, 119, 255, 255, 255, 127, 119, 119, 119,
  119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127,
  119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119,
};
static const uint8_t has_tr_8x4[64] = {
  255, 255, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
  127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
  255, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
  127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
};
static const uint8_t has_tr_8x8[32] = {
  255, 255, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85,
  255, 127, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85,
};
static const uint8_t has_tr_8x16[16] = {
  255, 255, 119, 119, 127, 127, 119, 119,
  255, 127, 119, 119, 127, 127, 119, 119,
};
static const uint8_t has_tr_16x8[16] = {
  255, 0, 85, 0, 119, 0, 85, 0, 127, 0, 85, 0, 119, 0, 85, 0,
};
static const uint8_t has_tr_16x16[8] = {
  255, 85, 119, 85, 127, 85, 119, 85,
};
static const uint8_t has_tr_16x32[4] = { 255, 119, 127, 119 };
static const uint8_t has_tr_32x16[4] = { 15, 5, 7, 5 };
static const uint8_t has_tr_32x32[2] = { 95, 87 };
static const uint8_t has_tr_32x64[1] = { 127 };
// Top-right availability bit tables for square blocks visited in the
// PARTITION_VERT_A / PARTITION_VERT_B order. Read-only lookup data reached
// through `const uint8_t *` pointers, hence const.
static const uint8_t has_tr_vert_8x8[32] = {
  255, 255, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0,
  255, 127, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0,
};
static const uint8_t has_tr_vert_16x16[8] = {
  255, 0, 119, 0, 127, 0, 119, 0,
};
static const uint8_t has_tr_vert_32x32[2] = { 15, 7 };
static const uint8_t has_tr_vert_64x64[1] = { 3 };
We treat the rectangular block +// as a pair of squares, which means that these tables work correctly for both +// mixed vertical partition types. +// +// There are tables for each of the square sizes. Vertical rectangles (like +// BLOCK_16X32) use their respective "non-vert" table +static const uint8_t *const has_tr_vert_tables[BLOCK_SIZES] = { + // 4X4 + NULL, + // 4X8, 8X4, 8X8 + has_tr_4x8, NULL, has_tr_vert_8x8, + // 8X16, 16X8, 16X16 + has_tr_8x16, NULL, has_tr_vert_16x16, + // 16X32, 32X16, 32X32 + has_tr_16x32, NULL, has_tr_vert_32x32, + // 32X64, 64X32, 64X64 + has_tr_32x64, NULL, has_tr_vert_64x64, + // 64x128, 128x64, 128x128 + has_tr_64x128, NULL, has_tr_128x128 +}; + +static const uint8_t *get_has_tr_table(PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + const uint8_t *ret = NULL; + // If this is a mixed vertical partition, look up bsize in orders_vert. + if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { + assert(bsize < BLOCK_SIZES); + ret = has_tr_vert_tables[bsize]; + } else { + ret = has_tr_tables[bsize]; + } + assert(ret); + return ret; +} + +static int has_top_right(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row, + int mi_col, int top_available, int right_available, + PARTITION_TYPE partition, TX_SIZE txsz, int row_off, + int col_off, int ss_x, int ss_y) { + if (!top_available || !right_available) return 0; + + const int bw_unit = mi_size_wide[bsize]; + const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1); + const int top_right_count_unit = tx_size_wide_unit[txsz]; + + if (row_off > 0) { // Just need to check if enough pixels on the right. + if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) { + // Special case: For 128x128 blocks, the transform unit whose + // top-right corner is at the center of the block does in fact have + // pixels available at its top-right corner. 
+ if (row_off == mi_size_high[BLOCK_64X64] >> ss_y && + col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) { + return 1; + } + const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; + const int col_off_64 = col_off % plane_bw_unit_64; + return col_off_64 + top_right_count_unit < plane_bw_unit_64; + } + return col_off + top_right_count_unit < plane_bw_unit; + } else { + // All top-right pixels are in the block above, which is already available. + if (col_off + top_right_count_unit < plane_bw_unit) return 1; + + const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; + const int bh_in_mi_log2 = mi_size_high_log2[bsize]; + const int sb_mi_size = mi_size_high[sb_size]; + const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; + const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; + + // Top row of superblock: so top-right pixels are in the top and/or + // top-right superblocks, both of which are already available. + if (blk_row_in_sb == 0) return 1; + + // Rightmost column of superblock (and not the top row): so top-right pixels + // fall in the right superblock, which is not available yet. + if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) { + return 0; + } + + // General case (neither top row nor rightmost column): check if the + // top-right block is coded before the current block. + const int this_blk_index = + ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + + blk_col_in_sb + 0; + const int idx1 = this_blk_index / 8; + const int idx2 = this_blk_index % 8; + const uint8_t *has_tr_table = get_has_tr_table(partition, bsize); + return (has_tr_table[idx1] >> idx2) & 1; + } +} + +// Similar to the has_tr_* tables, but store if the bottom-left reference +// pixels are available. 
+static uint8_t has_bl_4x4[128] = { + 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, + 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17, + 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, + 85, 85, 85, 0, 0, 0, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, + 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, + 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, + 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0, +}; +static uint8_t has_bl_4x8[64] = { + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, +}; +static uint8_t has_bl_8x4[64] = { + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, +}; +static uint8_t has_bl_8x8[32] = { + 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, + 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, +}; +static uint8_t has_bl_8x16[16] = { + 16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0, +}; +static uint8_t has_bl_16x8[16] = { + 254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0, +}; +static uint8_t has_bl_16x16[8] = { + 84, 16, 84, 0, 84, 16, 84, 0, +}; +static uint8_t has_bl_16x32[4] = { 16, 0, 16, 0 }; +static uint8_t has_bl_32x16[4] = { 78, 14, 78, 14 }; +static uint8_t has_bl_32x32[2] = { 4, 4 }; +static uint8_t has_bl_32x64[1] = { 0 }; +static uint8_t has_bl_64x32[1] = { 34 }; +static uint8_t has_bl_64x64[1] = { 0 }; +static uint8_t has_bl_64x128[1] = { 0 }; +static uint8_t has_bl_128x64[1] = { 0 }; +static uint8_t has_bl_128x128[1] = { 0 }; +static uint8_t 
has_bl_4x16[32] = { + 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, + 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, +}; +static uint8_t has_bl_16x4[32] = { + 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, + 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, +}; +static uint8_t has_bl_8x32[8] = { + 0, 1, 0, 0, 0, 1, 0, 0, +}; +static uint8_t has_bl_32x8[8] = { + 238, 78, 238, 14, 238, 78, 238, 14, +}; +static uint8_t has_bl_16x64[2] = { 0, 0 }; +static uint8_t has_bl_64x16[2] = { 42, 42 }; + +static const uint8_t *const has_bl_tables[BLOCK_SIZES_ALL] = { + // 4X4 + has_bl_4x4, + // 4X8, 8X4, 8X8 + has_bl_4x8, has_bl_8x4, has_bl_8x8, + // 8X16, 16X8, 16X16 + has_bl_8x16, has_bl_16x8, has_bl_16x16, + // 16X32, 32X16, 32X32 + has_bl_16x32, has_bl_32x16, has_bl_32x32, + // 32X64, 64X32, 64X64 + has_bl_32x64, has_bl_64x32, has_bl_64x64, + // 64x128, 128x64, 128x128 + has_bl_64x128, has_bl_128x64, has_bl_128x128, + // 4x16, 16x4, 8x32 + has_bl_4x16, has_bl_16x4, has_bl_8x32, + // 32x8, 16x64, 64x16 + has_bl_32x8, has_bl_16x64, has_bl_64x16 +}; + +static uint8_t has_bl_vert_8x8[32] = { + 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, + 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, +}; +static uint8_t has_bl_vert_16x16[8] = { + 254, 16, 254, 0, 254, 16, 254, 0, +}; +static uint8_t has_bl_vert_32x32[2] = { 14, 14 }; +static uint8_t has_bl_vert_64x64[1] = { 2 }; + +// The _vert_* tables are like the ordinary tables above, but describe the +// order we visit square blocks when doing a PARTITION_VERT_A or +// PARTITION_VERT_B. This is the same order as normal except for on the last +// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block +// as a pair of squares, which means that these tables work correctly for both +// mixed vertical partition types. +// +// There are tables for each of the square sizes. 
Vertical rectangles (like +// BLOCK_16X32) use their respective "non-vert" table +static const uint8_t *const has_bl_vert_tables[BLOCK_SIZES] = { + // 4X4 + NULL, + // 4X8, 8X4, 8X8 + has_bl_4x8, NULL, has_bl_vert_8x8, + // 8X16, 16X8, 16X16 + has_bl_8x16, NULL, has_bl_vert_16x16, + // 16X32, 32X16, 32X32 + has_bl_16x32, NULL, has_bl_vert_32x32, + // 32X64, 64X32, 64X64 + has_bl_32x64, NULL, has_bl_vert_64x64, + // 64x128, 128x64, 128x128 + has_bl_64x128, NULL, has_bl_128x128 +}; + +static const uint8_t *get_has_bl_table(PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + const uint8_t *ret = NULL; + // If this is a mixed vertical partition, look up bsize in orders_vert. + if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { + assert(bsize < BLOCK_SIZES); + ret = has_bl_vert_tables[bsize]; + } else { + ret = has_bl_tables[bsize]; + } + assert(ret); + return ret; +} + +static int has_bottom_left(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row, + int mi_col, int bottom_available, int left_available, + PARTITION_TYPE partition, TX_SIZE txsz, int row_off, + int col_off, int ss_x, int ss_y) { + if (!bottom_available || !left_available) return 0; + + // Special case for 128x* blocks, when col_off is half the block width. + // This is needed because 128x* superblocks are divided into 64x* blocks in + // raster order + if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64] && col_off > 0) { + const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; + const int col_off_64 = col_off % plane_bw_unit_64; + if (col_off_64 == 0) { + // We are at the left edge of top-right or bottom-right 64x* block. + const int plane_bh_unit_64 = mi_size_high[BLOCK_64X64] >> ss_y; + const int row_off_64 = row_off % plane_bh_unit_64; + const int plane_bh_unit = + AOMMIN(mi_size_high[bsize] >> ss_y, plane_bh_unit_64); + // Check if all bottom-left pixels are in the left 64x* block (which is + // already coded). 
+ return row_off_64 + tx_size_high_unit[txsz] < plane_bh_unit; + } + } + + if (col_off > 0) { + // Bottom-left pixels are in the bottom-left block, which is not available. + return 0; + } else { + const int bh_unit = mi_size_high[bsize]; + const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1); + const int bottom_left_count_unit = tx_size_high_unit[txsz]; + + // All bottom-left pixels are in the left block, which is already available. + if (row_off + bottom_left_count_unit < plane_bh_unit) return 1; + + const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; + const int bh_in_mi_log2 = mi_size_high_log2[bsize]; + const int sb_mi_size = mi_size_high[sb_size]; + const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; + const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; + + // Leftmost column of superblock: so bottom-left pixels maybe in the left + // and/or bottom-left superblocks. But only the left superblock is + // available, so check if all required pixels fall in that superblock. + if (blk_col_in_sb == 0) { + const int blk_start_row_off = + blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >> + ss_y; + const int row_off_in_sb = blk_start_row_off + row_off; + const int sb_height_unit = sb_mi_size >> ss_y; + return row_off_in_sb + bottom_left_count_unit < sb_height_unit; + } + + // Bottom row of superblock (and not the leftmost column): so bottom-left + // pixels fall in the bottom superblock, which is not available yet. + if (((blk_row_in_sb + 1) << bh_in_mi_log2) >= sb_mi_size) return 0; + + // General case (neither leftmost column nor bottom row): check if the + // bottom-left block is coded before the current block. 
+ const int this_blk_index = + ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + + blk_col_in_sb + 0; + const int idx1 = this_blk_index / 8; + const int idx2 = this_blk_index % 8; + const uint8_t *has_bl_table = get_has_bl_table(partition, bsize); + return (has_bl_table[idx1] >> idx2) & 1; + } +} + +typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); + +static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL]; +static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL]; + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd); +static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL]; +static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL]; +#endif + +static void init_intra_predictors_internal(void) { + assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES); + +#define INIT_RECTANGULAR(p, type) \ + p[TX_4X8] = aom_##type##_predictor_4x8; \ + p[TX_8X4] = aom_##type##_predictor_8x4; \ + p[TX_8X16] = aom_##type##_predictor_8x16; \ + p[TX_16X8] = aom_##type##_predictor_16x8; \ + p[TX_16X32] = aom_##type##_predictor_16x32; \ + p[TX_32X16] = aom_##type##_predictor_32x16; \ + p[TX_32X64] = aom_##type##_predictor_32x64; \ + p[TX_64X32] = aom_##type##_predictor_64x32; \ + p[TX_4X16] = aom_##type##_predictor_4x16; \ + p[TX_16X4] = aom_##type##_predictor_16x4; \ + p[TX_8X32] = aom_##type##_predictor_8x32; \ + p[TX_32X8] = aom_##type##_predictor_32x8; \ + p[TX_16X64] = aom_##type##_predictor_16x64; \ + p[TX_64X16] = aom_##type##_predictor_64x16; + +#define INIT_NO_4X4(p, type) \ + p[TX_8X8] = aom_##type##_predictor_8x8; \ + p[TX_16X16] = aom_##type##_predictor_16x16; \ + p[TX_32X32] = aom_##type##_predictor_32x32; \ + p[TX_64X64] = aom_##type##_predictor_64x64; \ + INIT_RECTANGULAR(p, type) + +#define INIT_ALL_SIZES(p, type) \ + p[TX_4X4] = aom_##type##_predictor_4x4; \ + INIT_NO_4X4(p, type) + + 
INIT_ALL_SIZES(pred[V_PRED], v) + INIT_ALL_SIZES(pred[H_PRED], h) + INIT_ALL_SIZES(pred[PAETH_PRED], paeth) + INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth) + INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v) + INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h) + INIT_ALL_SIZES(dc_pred[0][0], dc_128) + INIT_ALL_SIZES(dc_pred[0][1], dc_top) + INIT_ALL_SIZES(dc_pred[1][0], dc_left) + INIT_ALL_SIZES(dc_pred[1][1], dc) +#if CONFIG_AV1_HIGHBITDEPTH + INIT_ALL_SIZES(pred_high[V_PRED], highbd_v) + INIT_ALL_SIZES(pred_high[H_PRED], highbd_h) + INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth) + INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth) + INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v) + INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h) + INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128) + INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top) + INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left) + INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc) +#endif +#undef intra_pred_allsizes +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { + int r, c, x, base, shift, val; + + (void)left; + (void)dy; + assert(dy == 1); + assert(dx > 0); + + const int max_base_x = ((bw + bh) - 1) << upsample_above; + const int frac_bits = 6 - upsample_above; + const int base_inc = 1 << upsample_above; + x = dx; + for (r = 0; r < bh; ++r, dst += stride, x += dx) { + base = x >> frac_bits; + shift = ((x << upsample_above) & 0x3F) >> 1; + + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + memset(dst, above[max_base_x], bw * sizeof(dst[0])); + dst += stride; + } + return; + } + + for (c = 0; c < bw; ++c, base += base_inc) { + if (base < max_base_x) { + val = above[base] * (32 - shift) + above[base + 1] * shift; + dst[c] = ROUND_POWER_OF_TWO(val, 5); + } else { + dst[c] = above[max_base_x]; + } + } + } +} + +// 
Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { + assert(dx > 0); + assert(dy > 0); + + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + (void)min_base_y; + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } else { + x = c + 1; + y = (r << 6) - x * dy; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } + dst[c] = val; + } + dst += stride; + } +} + +// Directional prediction, zone 3: 180 < angle < 270 +void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + int r, c, y, base, shift, val; + + (void)above; + (void)dx; + + assert(dx == 1); + assert(dy > 0); + + const int max_base_y = (bw + bh - 1) << upsample_left; + const int frac_bits = 6 - upsample_left; + const int base_inc = 1 << upsample_left; + y = dy; + for (c = 0; c < bw; ++c, y += dy) { + base = y >> frac_bits; + shift = ((y << upsample_left) & 0x3F) >> 1; + + for (r = 0; r < bh; ++r, base += base_inc) { + if (base < max_base_y) { + val = left[base] * (32 - shift) + left[base + 1] * shift; + dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5); + } else { + for (; r < bh; ++r) dst[r * stride + c] = 
left[max_base_y]; + break; + } + } + } +} + +static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int angle) { + const int dx = av1_get_dx(angle); + const int dy = av1_get_dy(angle); + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + assert(angle > 0 && angle < 270); + + if (angle > 0 && angle < 90) { + av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx, + dy); + } else if (angle > 90 && angle < 180) { + av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above, + upsample_left, dx, dy); + } else if (angle > 180 && angle < 270) { + av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx, + dy); + } else if (angle == 90) { + pred[V_PRED][tx_size](dst, stride, above, left); + } else if (angle == 180) { + pred[H_PRED][tx_size](dst, stride, above, left); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +// Directional prediction, zone 1: 0 < angle < 90 +void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int dx, int dy, int bd) { + int r, c, x, base, shift, val; + + (void)left; + (void)dy; + (void)bd; + assert(dy == 1); + assert(dx > 0); + + const int max_base_x = ((bw + bh) - 1) << upsample_above; + const int frac_bits = 6 - upsample_above; + const int base_inc = 1 << upsample_above; + x = dx; + for (r = 0; r < bh; ++r, dst += stride, x += dx) { + base = x >> frac_bits; + shift = ((x << upsample_above) & 0x3F) >> 1; + + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + aom_memset16(dst, above[max_base_x], bw); + dst += stride; + } + return; + } + + for (c = 0; c < bw; ++c, base += base_inc) { + if (base < max_base_x) { + val = above[base] * (32 - shift) + above[base + 1] * shift; + dst[c] = ROUND_POWER_OF_TWO(val, 5); + } else { + dst[c] = above[max_base_x]; + } + } + } +} + +// 
Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + (void)min_base_y; + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } else { + x = c + 1; + y = (r << 6) - x * dy; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } + dst[c] = val; + } + dst += stride; + } +} + +// Directional prediction, zone 3: 180 < angle < 270 +void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_left, + int dx, int dy, int bd) { + int r, c, y, base, shift, val; + + (void)above; + (void)dx; + (void)bd; + assert(dx == 1); + assert(dy > 0); + + const int max_base_y = (bw + bh - 1) << upsample_left; + const int frac_bits = 6 - upsample_left; + const int base_inc = 1 << upsample_left; + y = dy; + for (c = 0; c < bw; ++c, y += dy) { + base = y >> frac_bits; + shift = ((y << upsample_left) & 0x3F) >> 1; + + for (r = 0; r < bh; ++r, base += base_inc) { + if (base < max_base_y) { + val = left[base] * (32 - shift) + left[base + 1] * shift; + dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5); + 
} else { + for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y]; + break; + } + } + } +} + +static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int angle, int bd) { + const int dx = av1_get_dx(angle); + const int dy = av1_get_dy(angle); + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + assert(angle > 0 && angle < 270); + + if (angle > 0 && angle < 90) { + av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left, + upsample_above, dx, dy, bd); + } else if (angle > 90 && angle < 180) { + av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left, + upsample_above, upsample_left, dx, dy, bd); + } else if (angle > 180 && angle < 270) { + av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, + dx, dy, bd); + } else if (angle == 90) { + pred_high[V_PRED][tx_size](dst, stride, above, left, bd); + } else if (angle == 180) { + pred_high[H_PRED][tx_size](dst, stride, above, left, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +DECLARE_ALIGNED(16, const int8_t, + av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = { + { + { -6, 10, 0, 0, 0, 12, 0, 0 }, + { -5, 2, 10, 0, 0, 9, 0, 0 }, + { -3, 1, 1, 10, 0, 7, 0, 0 }, + { -3, 1, 1, 2, 10, 5, 0, 0 }, + { -4, 6, 0, 0, 0, 2, 12, 0 }, + { -3, 2, 6, 0, 0, 2, 9, 0 }, + { -3, 2, 2, 6, 0, 2, 7, 0 }, + { -3, 1, 2, 2, 6, 3, 5, 0 }, + }, + { + { -10, 16, 0, 0, 0, 10, 0, 0 }, + { -6, 0, 16, 0, 0, 6, 0, 0 }, + { -4, 0, 0, 16, 0, 4, 0, 0 }, + { -2, 0, 0, 0, 16, 2, 0, 0 }, + { -10, 16, 0, 0, 0, 0, 10, 0 }, + { -6, 0, 16, 0, 0, 0, 6, 0 }, + { -4, 0, 0, 16, 0, 0, 4, 0 }, + { -2, 0, 0, 0, 16, 0, 2, 0 }, + }, + { + { -8, 8, 0, 0, 0, 16, 0, 0 }, + { -8, 0, 8, 0, 0, 16, 0, 0 }, + { -8, 0, 0, 8, 0, 16, 0, 0 }, + { -8, 0, 0, 0, 8, 16, 0, 0 }, + { -4, 4, 0, 0, 0, 0, 16, 0 }, + { -4, 0, 4, 0, 0, 0, 16, 0 }, + { -4, 0, 0, 4, 0, 0, 16, 0 }, + { -4, 0, 0, 0, 4, 0, 16, 0 
}, + }, + { + { -2, 8, 0, 0, 0, 10, 0, 0 }, + { -1, 3, 8, 0, 0, 6, 0, 0 }, + { -1, 2, 3, 8, 0, 4, 0, 0 }, + { 0, 1, 2, 3, 8, 2, 0, 0 }, + { -1, 4, 0, 0, 0, 3, 10, 0 }, + { -1, 3, 4, 0, 0, 4, 6, 0 }, + { -1, 2, 3, 4, 0, 4, 4, 0 }, + { -1, 2, 2, 3, 4, 3, 3, 0 }, + }, + { + { -12, 14, 0, 0, 0, 14, 0, 0 }, + { -10, 0, 14, 0, 0, 12, 0, 0 }, + { -9, 0, 0, 14, 0, 11, 0, 0 }, + { -8, 0, 0, 0, 14, 10, 0, 0 }, + { -10, 12, 0, 0, 0, 0, 14, 0 }, + { -9, 1, 12, 0, 0, 0, 12, 0 }, + { -8, 0, 0, 12, 0, 1, 11, 0 }, + { -7, 0, 0, 1, 12, 1, 9, 0 }, + }, +}; + +void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { + int r, c; + uint8_t buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); + + for (r = 1; r < bh + 1; r += 2) + for (c = 1; c < bw + 1; c += 4) { + const uint8_t p0 = buffer[r - 1][c - 1]; + const uint8_t p1 = buffer[r - 1][c]; + const uint8_t p2 = buffer[r - 1][c + 1]; + const uint8_t p3 = buffer[r - 1][c + 2]; + const uint8_t p4 = buffer[r - 1][c + 3]; + const uint8_t p5 = buffer[r][c - 1]; + const uint8_t p6 = buffer[r + 1][c - 1]; + for (int k = 0; k < 8; ++k) { + int r_offset = k >> 2; + int c_offset = k & 0x03; + int pr = av1_filter_intra_taps[mode][k][0] * p0 + + av1_filter_intra_taps[mode][k][1] * p1 + + av1_filter_intra_taps[mode][k][2] * p2 + + av1_filter_intra_taps[mode][k][3] * p3 + + av1_filter_intra_taps[mode][k][4] * p4 + + av1_filter_intra_taps[mode][k][5] * p5 + + av1_filter_intra_taps[mode][k][6] * p6; + // Section 7.11.2.3 specifies the right-hand side of the assignment as + // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ). + // Since Clip1() clips a negative value to 0, it is safe to replace + // Round2Signed() with Round2(). 
+ buffer[r + r_offset][c + c_offset] = + clip_pixel(ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS)); + } + } + + for (r = 0; r < bh; ++r) { + memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); + dst += stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, + const uint16_t *above, + const uint16_t *left, int mode, + int bd) { + int r, c; + uint16_t buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0])); + + for (r = 1; r < bh + 1; r += 2) + for (c = 1; c < bw + 1; c += 4) { + const uint16_t p0 = buffer[r - 1][c - 1]; + const uint16_t p1 = buffer[r - 1][c]; + const uint16_t p2 = buffer[r - 1][c + 1]; + const uint16_t p3 = buffer[r - 1][c + 2]; + const uint16_t p4 = buffer[r - 1][c + 3]; + const uint16_t p5 = buffer[r][c - 1]; + const uint16_t p6 = buffer[r + 1][c - 1]; + for (int k = 0; k < 8; ++k) { + int r_offset = k >> 2; + int c_offset = k & 0x03; + int pr = av1_filter_intra_taps[mode][k][0] * p0 + + av1_filter_intra_taps[mode][k][1] * p1 + + av1_filter_intra_taps[mode][k][2] * p2 + + av1_filter_intra_taps[mode][k][3] * p3 + + av1_filter_intra_taps[mode][k][4] * p4 + + av1_filter_intra_taps[mode][k][5] * p5 + + av1_filter_intra_taps[mode][k][6] * p6; + // Section 7.11.2.3 specifies the right-hand side of the assignment as + // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ). + // Since Clip1() clips a negative value to 0, it is safe to replace + // Round2Signed() with Round2(). 
+ buffer[r + r_offset][c + c_offset] = clip_pixel_highbd( + ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS), bd); + } + } + + for (r = 0; r < bh; ++r) { + memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0])); + dst += stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static int is_smooth(const MB_MODE_INFO *mbmi, int plane) { + if (plane == 0) { + const PREDICTION_MODE mode = mbmi->mode; + return (mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED); + } else { + // uv_mode is not set for inter blocks, so need to explicitly + // detect that case. + if (is_inter_block(mbmi)) return 0; + + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + return (uv_mode == UV_SMOOTH_PRED || uv_mode == UV_SMOOTH_V_PRED || + uv_mode == UV_SMOOTH_H_PRED); + } +} + +static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) { + const MB_MODE_INFO *above; + const MB_MODE_INFO *left; + + if (plane == 0) { + above = xd->above_mbmi; + left = xd->left_mbmi; + } else { + above = xd->chroma_above_mbmi; + left = xd->chroma_left_mbmi; + } + + return (above && is_smooth(above, plane)) || (left && is_smooth(left, plane)); +} + +static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) { + const int d = abs(delta); + int strength = 0; + + const int blk_wh = bs0 + bs1; + if (type == 0) { + if (blk_wh <= 8) { + if (d >= 56) strength = 1; + } else if (blk_wh <= 12) { + if (d >= 40) strength = 1; + } else if (blk_wh <= 16) { + if (d >= 40) strength = 1; + } else if (blk_wh <= 24) { + if (d >= 8) strength = 1; + if (d >= 16) strength = 2; + if (d >= 32) strength = 3; + } else if (blk_wh <= 32) { + if (d >= 1) strength = 1; + if (d >= 4) strength = 2; + if (d >= 32) strength = 3; + } else { + if (d >= 1) strength = 3; + } + } else { + if (blk_wh <= 8) { + if (d >= 40) strength = 1; + if (d >= 64) strength = 2; + } else if (blk_wh <= 16) { + if (d >= 20) strength = 1; + if (d >= 48) strength = 2; + } else if (blk_wh <= 24) { + if (d >= 4) strength = 
3; + } else { + if (d >= 1) strength = 3; + } + } + return strength; +} + +void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) { + if (!strength) return; + + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; + const int filt = strength - 1; + uint8_t edge[129]; + + memcpy(edge, p, sz * sizeof(*p)); + for (int i = 1; i < sz; i++) { + int s = 0; + for (int j = 0; j < INTRA_EDGE_TAPS; j++) { + int k = i - 2 + j; + k = (k < 0) ? 0 : k; + k = (k > sz - 1) ? sz - 1 : k; + s += edge[k] * kernel[filt][j]; + } + s = (s + 8) >> 4; + p[i] = s; + } +} + +static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) { + const int kernel[3] = { 5, 6, 5 }; + + int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + + (p_above[0] * kernel[2]); + s = (s + 8) >> 4; + p_above[-1] = s; + p_left[-1] = s; +} + +void av1_upsample_intra_edge_c(uint8_t *p, int sz) { + // interpolate half-sample positions + assert(sz <= MAX_UPSAMPLE_SZ); + + uint8_t in[MAX_UPSAMPLE_SZ + 3]; + // copy p[-1..(sz-1)] and extend first and last samples + in[0] = p[-1]; + in[1] = p[-1]; + for (int i = 0; i < sz; i++) { + in[i + 2] = p[i]; + } + in[sz + 2] = p[sz - 1]; + + // interpolate half-sample edge positions + p[-2] = in[0]; + for (int i = 0; i < sz; i++) { + int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3]; + s = clip_pixel((s + 8) >> 4); + p[2 * i - 1] = s; + p[2 * i] = in[i + 2]; + } +} + +static void build_directional_and_filter_intra_predictors( + const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode, + TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, + int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) { + int i; + const uint8_t *above_ref = ref - ref_stride; + const uint8_t *left_ref = ref - 1; + DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + 
DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + uint8_t *const above_row = above_data + 16; + uint8_t *const left_col = left_data + 16; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + int need_left = extend_modes[mode] & NEED_LEFT; + int need_above = extend_modes[mode] & NEED_ABOVE; + int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + const int is_dr_mode = av1_is_directional_mode(mode); + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + assert(use_filter_intra || is_dr_mode); + // The left_data, above_data buffers must be zeroed to fix some intermittent + // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4 + // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to + // be the potential reason for this issue. + memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS); + memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS); + + // The default values if ref pixels are not available: + // 128 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + // .. + + if (is_dr_mode) { + if (p_angle <= 90) + need_above = 1, need_left = 0, need_above_left = 1; + else if (p_angle < 180) + need_above = 1, need_left = 1, need_above_left = 1; + else + need_above = 0, need_left = 1, need_above_left = 1; + } + if (use_filter_intra) need_left = need_above = need_above_left = 1; + + assert(n_top_px >= 0); + assert(n_topright_px >= -1); + assert(n_left_px >= 0); + assert(n_bottomleft_px >= -1); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : 129; + } else { + val = (n_left_px > 0) ? 
left_ref[0] : 127; + } + for (i = 0; i < txhpx; ++i) { + memset(dst, val, txwpx); + dst += dst_stride; + } + return; + } + + // NEED_LEFT + if (need_left) { + const int num_left_pixels_needed = + txhpx + (n_bottomleft_px >= 0 ? txwpx : 0); + i = 0; + if (n_left_px > 0) { + for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (n_bottomleft_px > 0) { + assert(i == txhpx); + for (; i < txhpx + n_bottomleft_px; i++) + left_col[i] = left_ref[i * ref_stride]; + } + if (i < num_left_pixels_needed) + memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i); + } else if (n_top_px > 0) { + memset(left_col, above_ref[0], num_left_pixels_needed); + } + } + + // NEED_ABOVE + if (need_above) { + const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px); + i = n_top_px; + if (n_topright_px > 0) { + assert(n_top_px == txwpx); + memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px); + i += n_topright_px; + } + if (i < num_top_pixels_needed) + memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i); + } else if (n_left_px > 0) { + memset(above_row, left_ref[0], num_top_pixels_needed); + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = 128; + } + left_col[-1] = above_row[-1]; + } + + if (use_filter_intra) { + av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, + filter_intra_mode); + return; + } + + assert(is_dr_mode); + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + if (p_angle != 90 && p_angle != 180) { + const int ab_le = need_above_left ? 
1 : 0; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + filter_intra_edge_corner(above_row, left_col); + } + if (need_above && n_top_px > 0) { + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); + const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); + av1_filter_intra_edge(above_row - ab_le, n_px, strength); + } + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); + const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); + av1_filter_intra_edge(left_col - ab_le, n_px, strength); + } + } + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_upsample_intra_edge(above_row, n_px); + } + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_upsample_intra_edge(left_col, n_px); + } + } + dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above, + upsample_left, p_angle); +} + +// This function generates the pred data of a given block for non-directional +// intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H, SMOOTH_V and PAETH). 
+static void build_non_directional_intra_predictors( + const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px) { + const uint8_t *above_ref = ref - ref_stride; + const uint8_t *left_ref = ref - 1; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + const int need_left = extend_modes[mode] & NEED_LEFT; + const int need_above = extend_modes[mode] & NEED_ABOVE; + const int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + int i = 0; + assert(n_top_px >= 0); + assert(n_left_px >= 0); + assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED || mode == PAETH_PRED); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val = 0; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : 129; + } else { + val = (n_left_px > 0) ? left_ref[0] : 127; + } + for (i = 0; i < txhpx; ++i) { + memset(dst, val, txwpx); + dst += dst_stride; + } + return; + } + + DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + uint8_t *const above_row = above_data + 16; + uint8_t *const left_col = left_data + 16; + + if (need_left) { + memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_left_px > 0) { + for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (i < txhpx) memset(&left_col[i], left_col[i - 1], txhpx - i); + } else if (n_top_px > 0) { + memset(left_col, above_ref[0], txhpx); + } + } + + if (need_above) { + memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px); + i = n_top_px; + if (i < txwpx) memset(&above_row[i], above_row[i - 1], txwpx - i); + } else if (n_left_px > 0) { + memset(above_row, left_ref[0], txwpx); + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = 
above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = 128; + } + left_col[-1] = above_row[-1]; + } + + if (mode == DC_PRED) { + dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row, + left_col); + } else { + pred[mode][tx_size](dst, dst_stride, above_row, left_col); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_filter_intra_edge_c(uint16_t *p, int sz, int strength) { + if (!strength) return; + + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; + const int filt = strength - 1; + uint16_t edge[129]; + + memcpy(edge, p, sz * sizeof(*p)); + for (int i = 1; i < sz; i++) { + int s = 0; + for (int j = 0; j < INTRA_EDGE_TAPS; j++) { + int k = i - 2 + j; + k = (k < 0) ? 0 : k; + k = (k > sz - 1) ? sz - 1 : k; + s += edge[k] * kernel[filt][j]; + } + s = (s + 8) >> 4; + p[i] = s; + } +} + +static void highbd_filter_intra_edge_corner(uint16_t *p_above, + uint16_t *p_left) { + const int kernel[3] = { 5, 6, 5 }; + + int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + + (p_above[0] * kernel[2]); + s = (s + 8) >> 4; + p_above[-1] = s; + p_left[-1] = s; +} + +void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd) { + // interpolate half-sample positions + assert(sz <= MAX_UPSAMPLE_SZ); + + uint16_t in[MAX_UPSAMPLE_SZ + 3]; + // copy p[-1..(sz-1)] and extend first and last samples + in[0] = p[-1]; + in[1] = p[-1]; + for (int i = 0; i < sz; i++) { + in[i + 2] = p[i]; + } + in[sz + 2] = p[sz - 1]; + + // interpolate half-sample edge positions + p[-2] = in[0]; + for (int i = 0; i < sz; i++) { + int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3]; + s = (s + 8) >> 4; + s = clip_pixel_highbd(s, bd); + p[2 * i - 1] = s; + p[2 * i] = in[i + 2]; + } +} + +static void highbd_build_intra_predictors( + const uint8_t *ref8, int ref_stride, uint8_t *dst8, 
int dst_stride, + PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode, + TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, + int n_left_px, int n_bottomleft_px, int intra_edge_filter_type, + int bit_depth) { + int i; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + uint16_t *const above_row = above_data + 16; + uint16_t *const left_col = left_data + 16; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + int need_left = extend_modes[mode] & NEED_LEFT; + int need_above = extend_modes[mode] & NEED_ABOVE; + int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + const uint16_t *above_ref = ref - ref_stride; + const uint16_t *left_ref = ref - 1; + const int is_dr_mode = av1_is_directional_mode(mode); + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + int base = 128 << (bit_depth - 8); + // The left_data, above_data buffers must be zeroed to fix some intermittent + // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4 + // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are + // seen to be the potential reason for this issue. + aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS); + aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS); + + // The default values if ref pixels are not available: + // base base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1 + // base+1 A B .. Y Z + // base+1 C D .. W X + // base+1 E F .. U V + // base+1 G H .. 
S T T T T T + + if (is_dr_mode) { + if (p_angle <= 90) + need_above = 1, need_left = 0, need_above_left = 1; + else if (p_angle < 180) + need_above = 1, need_left = 1, need_above_left = 1; + else + need_above = 0, need_left = 1, need_above_left = 1; + } + if (use_filter_intra) need_left = need_above = need_above_left = 1; + + assert(n_top_px >= 0); + assert(n_topright_px >= -1); + assert(n_left_px >= 0); + assert(n_bottomleft_px >= -1); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : base + 1; + } else { + val = (n_left_px > 0) ? left_ref[0] : base - 1; + } + for (i = 0; i < txhpx; ++i) { + aom_memset16(dst, val, txwpx); + dst += dst_stride; + } + return; + } + + // NEED_LEFT + if (need_left) { + const int num_left_pixels_needed = + txhpx + (n_bottomleft_px >= 0 ? txwpx : 0); + i = 0; + if (n_left_px > 0) { + for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (n_bottomleft_px > 0) { + assert(i == txhpx); + for (; i < txhpx + n_bottomleft_px; i++) + left_col[i] = left_ref[i * ref_stride]; + } + if (i < num_left_pixels_needed) + aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i); + } else if (n_top_px > 0) { + aom_memset16(left_col, above_ref[0], num_left_pixels_needed); + } + } + + // NEED_ABOVE + if (need_above) { + const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? 
txhpx : 0); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0])); + i = n_top_px; + if (n_topright_px > 0) { + assert(n_top_px == txwpx); + memcpy(above_row + txwpx, above_ref + txwpx, + n_topright_px * sizeof(above_ref[0])); + i += n_topright_px; + } + if (i < num_top_pixels_needed) + aom_memset16(&above_row[i], above_row[i - 1], + num_top_pixels_needed - i); + } else if (n_left_px > 0) { + aom_memset16(above_row, left_ref[0], num_top_pixels_needed); + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = base; + } + left_col[-1] = above_row[-1]; + } + + if (use_filter_intra) { + highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, + filter_intra_mode, bit_depth); + return; + } + + if (is_dr_mode) { + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + if (p_angle != 90 && p_angle != 180) { + const int ab_le = need_above_left ? 1 : 0; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + highbd_filter_intra_edge_corner(above_row, left_col); + } + if (need_above && n_top_px > 0) { + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); + const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); + av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength); + } + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); + const int n_px = n_left_px + ab_le + (need_bottom ? 
txwpx : 0); + av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength); + } + } + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth); + } + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth); + } + } + highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, + upsample_above, upsample_left, p_angle, bit_depth); + return; + } + + // predict + if (mode == DC_PRED) { + dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size]( + dst, dst_stride, above_row, left_col, bit_depth); + } else { + pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, + int subsampling_y) { + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); + BLOCK_SIZE bs = bsize; + switch (bsize) { + case BLOCK_4X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_4X8: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X8; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_8X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_8X8; + break; + case BLOCK_4X16: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X16; + else if (subsampling_x == 1) + bs = BLOCK_8X16; + else if 
(subsampling_y == 1) + bs = BLOCK_4X16; + break; + case BLOCK_16X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_16X8; + else if (subsampling_x == 1) + bs = BLOCK_16X4; + else if (subsampling_y == 1) + bs = BLOCK_16X8; + break; + default: break; + } + return bs; +} + +void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, + int enable_intra_edge_filter, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, + int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, + const uint8_t *ref, int ref_stride, uint8_t *dst, + int dst_stride, int col_off, int row_off, + int plane) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + const int x = col_off << MI_SIZE_LOG2; + const int y = row_off << MI_SIZE_LOG2; + const int is_hbd = is_cur_buf_hbd(xd); + + assert(mode < INTRA_MODES); + + if (use_palette) { + int r, c; + const uint8_t *const map = xd->plane[plane != 0].color_index_map + + xd->color_index_map_offset[plane != 0]; + const uint16_t *const palette = + mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE; + if (is_hbd) { + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (r = 0; r < txhpx; ++r) { + for (c = 0; c < txwpx; ++c) { + dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]]; + } + } + } else { + for (r = 0; r < txhpx; ++r) { + for (c = 0; c < txwpx; ++c) { + dst[r * dst_stride + c] = + (uint8_t)palette[map[(r + y) * wpx + c + x]]; + } + } + } + return; + } + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int have_top = + row_off || (ss_y ? xd->chroma_up_available : xd->up_available); + const int have_left = + col_off || (ss_x ? 
xd->chroma_left_available : xd->left_available); + + // Distance between the right edge of this prediction block to + // the frame right edge + const int xr = (xd->mb_to_right_edge >> (3 + ss_x)) + wpx - x - txwpx; + // Distance between the bottom edge of this prediction block to + // the frame bottom edge + const int yd = (xd->mb_to_bottom_edge >> (3 + ss_y)) + hpx - y - txhpx; + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + const int is_dr_mode = av1_is_directional_mode(mode); + + // The computations in this function, as well as in build_intra_predictors(), + // are generalized for all intra modes. Some of these operations are not + // required since non-directional intra modes (i.e., DC, SMOOTH, SMOOTH_H, + // SMOOTH_V, and PAETH) specifically require left and top neighbors. Hence, a + // separate function build_non_directional_intra_predictors() is introduced + // for these modes to avoid redundant computations while generating pred data. + + // TODO(aomedia:3532): Enable this refactoring for high bd path as well. + if (!is_hbd && !use_filter_intra && !is_dr_mode) { + build_non_directional_intra_predictors( + ref, ref_stride, dst, dst_stride, mode, tx_size, + have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0); + return; + } + + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + const int right_available = + mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end; + const int bottom_available = + (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end); + + const PARTITION_TYPE partition = mbmi->partition; + + BLOCK_SIZE bsize = mbmi->bsize; + // force 4x4 chroma component block size. 
+ if (ss_x || ss_y) { + bsize = scale_chroma_bsize(bsize, ss_x, ss_y); + } + + int p_angle = 0; + int need_top_right = extend_modes[mode] & NEED_ABOVERIGHT; + int need_bottom_left = extend_modes[mode] & NEED_BOTTOMLEFT; + + if (use_filter_intra) { + need_top_right = 0; + need_bottom_left = 0; + } + if (is_dr_mode) { + p_angle = mode_to_angle_map[mode] + angle_delta; + need_top_right = p_angle < 90; + need_bottom_left = p_angle > 180; + } + + // Possible states for have_top_right(TR) and have_bottom_left(BL) + // -1 : TR and BL are not needed + // 0 : TR and BL are needed but not available + // > 0 : TR and BL are needed and pixels are available + const int have_top_right = + need_top_right ? has_top_right(sb_size, bsize, mi_row, mi_col, have_top, + right_available, partition, tx_size, + row_off, col_off, ss_x, ss_y) + : -1; + const int have_bottom_left = + need_bottom_left ? has_bottom_left(sb_size, bsize, mi_row, mi_col, + bottom_available, have_left, partition, + tx_size, row_off, col_off, ss_x, ss_y) + : -1; + + const int disable_edge_filter = !enable_intra_edge_filter; + const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + highbd_build_intra_predictors( + ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, + tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0, + have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left, + intra_edge_filter_type, xd->bd); + return; + } +#endif + build_directional_and_filter_intra_predictors( + ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, + tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0, + have_bottom_left > 0 ? 
AOMMIN(txhpx, yd) : have_bottom_left, + intra_edge_filter_type); +} + +void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, int blk_col, int blk_row, + TX_SIZE tx_size) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + const PREDICTION_MODE mode = + (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode); + const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0; + const FILTER_INTRA_MODE filter_intra_mode = + (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra) + ? mbmi->filter_intra_mode_info.filter_intra_mode + : FILTER_INTRA_MODES; + const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP; + const SequenceHeader *seq_params = cm->seq_params; + + if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) { +#if CONFIG_DEBUG + assert(is_cfl_allowed(xd)); + const BLOCK_SIZE plane_bsize = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + (void)plane_bsize; + assert(plane_bsize < BLOCK_SIZES_ALL); + if (!xd->lossless[mbmi->segment_id]) { + assert(blk_col == 0); + assert(blk_row == 0); + assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]); + assert(block_size_high[plane_bsize] == tx_size_high[tx_size]); + } +#endif + CFL_CTX *const cfl = &xd->cfl; + CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane); + if (!cfl->dc_pred_is_cached[pred_plane]) { + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mode, angle_delta, + use_palette, filter_intra_mode, dst, dst_stride, + dst, dst_stride, blk_col, blk_row, plane); + if (cfl->use_dc_pred_cache) { + cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]); + cfl->dc_pred_is_cached[pred_plane] = true; + } + } else { + 
cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane); + } + av1_cfl_predict_block(xd, dst, dst_stride, tx_size, plane); + return; + } + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mode, angle_delta, use_palette, filter_intra_mode, + dst, dst_stride, dst, dst_stride, blk_col, blk_row, plane); +} + +void av1_init_intra_predictors(void) { + aom_once(init_intra_predictors_internal); +} diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h new file mode 100644 index 0000000000..fa66ccd541 --- /dev/null +++ b/third_party/aom/av1/common/reconintra.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_RECONINTRA_H_ +#define AOM_AV1_COMMON_RECONINTRA_H_ + +#include + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_init_intra_predictors(void); +void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, int blk_col, int blk_row, + TX_SIZE tx_size); +void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, + int enable_intra_edge_filter, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, + int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, + const uint8_t *ref, int ref_stride, uint8_t *dst, + int dst_stride, int col_off, int row_off, + int plane); + +// Mapping of interintra to intra mode for use in the intra component +static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = { + DC_PRED, V_PRED, H_PRED, SMOOTH_PRED +}; + +// Mapping of intra mode to the interintra mode +static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = { + II_DC_PRED, II_V_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_V_PRED, + II_H_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_SMOOTH_PRED +}; + +#define FILTER_INTRA_SCALE_BITS 4 + +static INLINE int av1_is_directional_mode(PREDICTION_MODE mode) { + return mode >= V_PRED && mode <= D67_PRED; +} + +static INLINE int av1_is_diagonal_mode(PREDICTION_MODE mode) { + return mode >= D45_PRED && mode <= D67_PRED; +} + +static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) { + return bsize >= BLOCK_8X8; +} + +static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) { + return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools && + cm->features.allow_intrabc; +} + +static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm, + BLOCK_SIZE bs) { + if (!cm->seq_params->enable_filter_intra || bs == BLOCK_INVALID) return 0; + + return block_size_wide[bs] <= 32 && 
block_size_high[bs] <= 32; +} + +static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm, + const MB_MODE_INFO *mbmi) { + return mbmi->mode == DC_PRED && + mbmi->palette_mode_info.palette_size[0] == 0 && + av1_filter_intra_allowed_bsize(cm, mbmi->bsize); +} + +extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]; + +static const int16_t dr_intra_derivative[90] = { + // More evenly spread out angles and limited to 10-bit + // Values that are 0 will never be used + // Approx angle + 0, 0, 0, // + 1023, 0, 0, // 3, ... + 547, 0, 0, // 6, ... + 372, 0, 0, 0, 0, // 9, ... + 273, 0, 0, // 14, ... + 215, 0, 0, // 17, ... + 178, 0, 0, // 20, ... + 151, 0, 0, // 23, ... (113 & 203 are base angles) + 132, 0, 0, // 26, ... + 116, 0, 0, // 29, ... + 102, 0, 0, 0, // 32, ... + 90, 0, 0, // 36, ... + 80, 0, 0, // 39, ... + 71, 0, 0, // 42, ... + 64, 0, 0, // 45, ... (45 & 135 are base angles) + 57, 0, 0, // 48, ... + 51, 0, 0, // 51, ... + 45, 0, 0, 0, // 54, ... + 40, 0, 0, // 58, ... + 35, 0, 0, // 61, ... + 31, 0, 0, // 64, ... + 27, 0, 0, // 67, ... (67 & 157 are base angles) + 23, 0, 0, // 70, ... + 19, 0, 0, // 73, ... + 15, 0, 0, 0, 0, // 76, ... + 11, 0, 0, // 81, ... + 7, 0, 0, // 84, ... + 3, 0, 0, // 87, ... +}; + +// Get the shift (up-scaled by 256) in X w.r.t a unit change in Y. +// If angle > 0 && angle < 90, dx = -((int)(256 / t)); +// If angle > 90 && angle < 180, dx = (int)(256 / t); +// If angle > 180 && angle < 270, dx = 1; +static INLINE int av1_get_dx(int angle) { + if (angle > 0 && angle < 90) { + return dr_intra_derivative[angle]; + } else if (angle > 90 && angle < 180) { + return dr_intra_derivative[180 - angle]; + } else { + // In this case, we are not really going to use dx. We may return any value. + return 1; + } +} + +// Get the shift (up-scaled by 256) in Y w.r.t a unit change in X. 
+// If angle > 0 && angle < 90, dy = 1; +// If angle > 90 && angle < 180, dy = (int)(256 * t); +// If angle > 180 && angle < 270, dy = -((int)(256 * t)); +static INLINE int av1_get_dy(int angle) { + if (angle > 90 && angle < 180) { + return dr_intra_derivative[angle - 90]; + } else if (angle > 180 && angle < 270) { + return dr_intra_derivative[270 - angle]; + } else { + // In this case, we are not really going to use dy. We may return any value. + return 1; + } +} + +static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta, + int type) { + const int d = abs(delta); + const int blk_wh = bs0 + bs1; + if (d == 0 || d >= 40) return 0; + return type ? (blk_wh <= 8) : (blk_wh <= 16); +} +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_COMMON_RECONINTRA_H_ diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c new file mode 100644 index 0000000000..1b348836a5 --- /dev/null +++ b/third_party/aom/av1/common/resize.c @@ -0,0 +1,1452 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_ports/mem.h" +#include "aom_scale/aom_scale.h" +#include "av1/common/common.h" +#include "av1/common/resize.h" + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +// Filters for interpolation (0.5-band) - note this also filters integer pels. +static const InterpKernel filteredinterp_filters500[(1 << RS_SUBPEL_BITS)] = { + { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, 0, 34, 64, 36, 0, -3, 0 }, + { -3, -1, 34, 64, 36, 1, -3, 0 }, { -3, -1, 33, 64, 37, 1, -3, 0 }, + { -3, -1, 32, 64, 38, 1, -3, 0 }, { -3, -1, 31, 64, 39, 1, -3, 0 }, + { -3, -1, 31, 63, 39, 2, -3, 0 }, { -2, -2, 30, 63, 40, 2, -3, 0 }, + { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 29, 63, 41, 3, -4, 0 }, + { -2, -2, 28, 63, 42, 3, -4, 0 }, { -2, -2, 27, 63, 43, 3, -4, 0 }, + { -2, -3, 27, 63, 43, 4, -4, 0 }, { -2, -3, 26, 62, 44, 5, -4, 0 }, + { -2, -3, 25, 62, 45, 5, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 }, + { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 }, + { -2, -3, 23, 61, 47, 6, -4, 0 }, { -2, -3, 22, 61, 48, 7, -4, -1 }, + { -2, -3, 21, 60, 49, 7, -4, 0 }, { -1, -4, 20, 60, 49, 8, -4, 0 }, + { -1, -4, 20, 60, 50, 8, -4, -1 }, { -1, -4, 19, 59, 51, 9, -4, -1 }, + { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 18, 58, 52, 10, -4, -1 }, + { -1, -4, 17, 58, 52, 11, -4, -1 }, { -1, -4, 16, 58, 53, 11, -4, -1 }, + { -1, -4, 16, 57, 53, 12, -4, -1 }, { -1, -4, 15, 57, 54, 12, -4, -1 }, + { -1, -4, 15, 56, 54, 13, -4, -1 }, { -1, -4, 14, 56, 55, 13, -4, -1 }, + { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 13, 55, 56, 14, -4, -1 }, + { -1, -4, 13, 54, 56, 15, -4, -1 }, { -1, -4, 12, 54, 57, 15, -4, -1 }, + { -1, -4, 12, 53, 57, 16, -4, -1 }, { -1, -4, 11, 53, 58, 16, -4, -1 }, + { -1, -4, 11, 52, 58, 17, -4, -1 }, { -1, -4, 10, 52, 58, 18, -4, -1 }, + { 
-1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 9, 51, 59, 19, -4, -1 }, + { -1, -4, 8, 50, 60, 20, -4, -1 }, { 0, -4, 8, 49, 60, 20, -4, -1 }, + { 0, -4, 7, 49, 60, 21, -3, -2 }, { -1, -4, 7, 48, 61, 22, -3, -2 }, + { 0, -4, 6, 47, 61, 23, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 }, + { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 }, + { 0, -4, 5, 45, 62, 25, -3, -2 }, { 0, -4, 5, 44, 62, 26, -3, -2 }, + { 0, -4, 4, 43, 63, 27, -3, -2 }, { 0, -4, 3, 43, 63, 27, -2, -2 }, + { 0, -4, 3, 42, 63, 28, -2, -2 }, { 0, -4, 3, 41, 63, 29, -2, -2 }, + { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 2, 40, 63, 30, -2, -2 }, + { 0, -3, 2, 39, 63, 31, -1, -3 }, { 0, -3, 1, 39, 64, 31, -1, -3 }, + { 0, -3, 1, 38, 64, 32, -1, -3 }, { 0, -3, 1, 37, 64, 33, -1, -3 }, + { 0, -3, 1, 36, 64, 34, -1, -3 }, { 0, -3, 0, 36, 64, 34, 0, -3 }, +}; + +// Filters for interpolation (0.625-band) - note this also filters integer pels. +static const InterpKernel filteredinterp_filters625[(1 << RS_SUBPEL_BITS)] = { + { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 31, 80, 34, -8, -1, 1 }, + { -1, -8, 30, 80, 35, -8, -1, 1 }, { -1, -8, 29, 80, 36, -7, -2, 1 }, + { -1, -8, 28, 80, 37, -7, -2, 1 }, { -1, -8, 27, 80, 38, -7, -2, 1 }, + { 0, -8, 26, 79, 39, -7, -2, 1 }, { 0, -8, 25, 79, 40, -7, -2, 1 }, + { 0, -8, 24, 79, 41, -7, -2, 1 }, { 0, -8, 23, 78, 42, -6, -2, 1 }, + { 0, -8, 22, 78, 43, -6, -2, 1 }, { 0, -8, 21, 78, 44, -6, -2, 1 }, + { 0, -8, 20, 78, 45, -5, -3, 1 }, { 0, -8, 19, 77, 47, -5, -3, 1 }, + { 0, -8, 18, 77, 48, -5, -3, 1 }, { 0, -8, 17, 77, 49, -5, -3, 1 }, + { 0, -8, 16, 76, 50, -4, -3, 1 }, { 0, -8, 15, 76, 51, -4, -3, 1 }, + { 0, -8, 15, 75, 52, -3, -4, 1 }, { 0, -7, 14, 74, 53, -3, -4, 1 }, + { 0, -7, 13, 74, 54, -3, -4, 1 }, { 0, -7, 12, 73, 55, -2, -4, 1 }, + { 0, -7, 11, 73, 56, -2, -4, 1 }, { 0, -7, 10, 72, 57, -1, -4, 1 }, + { 1, -7, 10, 71, 58, -1, -5, 1 }, { 0, -7, 9, 71, 59, 0, -5, 1 }, + { 1, -7, 8, 70, 60, 0, -5, 1 }, { 1, -7, 7, 69, 61, 1, -5, 1 }, + { 1, -6, 
6, 68, 62, 1, -5, 1 }, { 0, -6, 6, 68, 62, 2, -5, 1 }, + { 1, -6, 5, 67, 63, 2, -5, 1 }, { 1, -6, 5, 66, 64, 3, -6, 1 }, + { 1, -6, 4, 65, 65, 4, -6, 1 }, { 1, -6, 3, 64, 66, 5, -6, 1 }, + { 1, -5, 2, 63, 67, 5, -6, 1 }, { 1, -5, 2, 62, 68, 6, -6, 0 }, + { 1, -5, 1, 62, 68, 6, -6, 1 }, { 1, -5, 1, 61, 69, 7, -7, 1 }, + { 1, -5, 0, 60, 70, 8, -7, 1 }, { 1, -5, 0, 59, 71, 9, -7, 0 }, + { 1, -5, -1, 58, 71, 10, -7, 1 }, { 1, -4, -1, 57, 72, 10, -7, 0 }, + { 1, -4, -2, 56, 73, 11, -7, 0 }, { 1, -4, -2, 55, 73, 12, -7, 0 }, + { 1, -4, -3, 54, 74, 13, -7, 0 }, { 1, -4, -3, 53, 74, 14, -7, 0 }, + { 1, -4, -3, 52, 75, 15, -8, 0 }, { 1, -3, -4, 51, 76, 15, -8, 0 }, + { 1, -3, -4, 50, 76, 16, -8, 0 }, { 1, -3, -5, 49, 77, 17, -8, 0 }, + { 1, -3, -5, 48, 77, 18, -8, 0 }, { 1, -3, -5, 47, 77, 19, -8, 0 }, + { 1, -3, -5, 45, 78, 20, -8, 0 }, { 1, -2, -6, 44, 78, 21, -8, 0 }, + { 1, -2, -6, 43, 78, 22, -8, 0 }, { 1, -2, -6, 42, 78, 23, -8, 0 }, + { 1, -2, -7, 41, 79, 24, -8, 0 }, { 1, -2, -7, 40, 79, 25, -8, 0 }, + { 1, -2, -7, 39, 79, 26, -8, 0 }, { 1, -2, -7, 38, 80, 27, -8, -1 }, + { 1, -2, -7, 37, 80, 28, -8, -1 }, { 1, -2, -7, 36, 80, 29, -8, -1 }, + { 1, -1, -8, 35, 80, 30, -8, -1 }, { 1, -1, -8, 34, 80, 31, -8, -1 }, +}; + +// Filters for interpolation (0.75-band) - note this also filters integer pels. 
+static const InterpKernel filteredinterp_filters750[(1 << RS_SUBPEL_BITS)] = { + { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 24, 96, 26, -11, 2, 0 }, + { 2, -11, 22, 96, 28, -11, 2, 0 }, { 2, -10, 21, 96, 29, -12, 2, 0 }, + { 2, -10, 19, 96, 31, -12, 2, 0 }, { 2, -10, 18, 95, 32, -11, 2, 0 }, + { 2, -10, 17, 95, 34, -12, 2, 0 }, { 2, -9, 15, 95, 35, -12, 2, 0 }, + { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -9, 13, 94, 38, -12, 2, 0 }, + { 2, -8, 12, 93, 40, -12, 1, 0 }, { 2, -8, 11, 93, 41, -12, 1, 0 }, + { 2, -8, 9, 92, 43, -12, 1, 1 }, { 2, -8, 8, 92, 44, -12, 1, 1 }, + { 2, -7, 7, 91, 46, -12, 1, 0 }, { 2, -7, 6, 90, 47, -12, 1, 1 }, + { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -6, 4, 89, 50, -12, 1, 0 }, + { 2, -6, 3, 88, 52, -12, 0, 1 }, { 2, -6, 2, 87, 54, -12, 0, 1 }, + { 2, -5, 1, 86, 55, -12, 0, 1 }, { 2, -5, 0, 85, 57, -12, 0, 1 }, + { 2, -5, -1, 84, 58, -11, 0, 1 }, { 2, -5, -2, 83, 60, -11, 0, 1 }, + { 2, -4, -2, 82, 61, -11, -1, 1 }, { 1, -4, -3, 81, 63, -10, -1, 1 }, + { 2, -4, -4, 80, 64, -10, -1, 1 }, { 1, -4, -4, 79, 66, -10, -1, 1 }, + { 1, -3, -5, 77, 67, -9, -1, 1 }, { 1, -3, -6, 76, 69, -9, -1, 1 }, + { 1, -3, -6, 75, 70, -8, -2, 1 }, { 1, -2, -7, 74, 71, -8, -2, 1 }, + { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -2, -8, 71, 74, -7, -2, 1 }, + { 1, -2, -8, 70, 75, -6, -3, 1 }, { 1, -1, -9, 69, 76, -6, -3, 1 }, + { 1, -1, -9, 67, 77, -5, -3, 1 }, { 1, -1, -10, 66, 79, -4, -4, 1 }, + { 1, -1, -10, 64, 80, -4, -4, 2 }, { 1, -1, -10, 63, 81, -3, -4, 1 }, + { 1, -1, -11, 61, 82, -2, -4, 2 }, { 1, 0, -11, 60, 83, -2, -5, 2 }, + { 1, 0, -11, 58, 84, -1, -5, 2 }, { 1, 0, -12, 57, 85, 0, -5, 2 }, + { 1, 0, -12, 55, 86, 1, -5, 2 }, { 1, 0, -12, 54, 87, 2, -6, 2 }, + { 1, 0, -12, 52, 88, 3, -6, 2 }, { 0, 1, -12, 50, 89, 4, -6, 2 }, + { 0, 1, -12, 49, 90, 5, -7, 2 }, { 1, 1, -12, 47, 90, 6, -7, 2 }, + { 0, 1, -12, 46, 91, 7, -7, 2 }, { 1, 1, -12, 44, 92, 8, -8, 2 }, + { 1, 1, -12, 43, 92, 9, -8, 2 }, { 0, 1, -12, 41, 93, 11, -8, 2 }, + { 0, 1, -12, 40, 93, 12, -8, 2 
}, { 0, 2, -12, 38, 94, 13, -9, 2 }, + { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -12, 35, 95, 15, -9, 2 }, + { 0, 2, -12, 34, 95, 17, -10, 2 }, { 0, 2, -11, 32, 95, 18, -10, 2 }, + { 0, 2, -12, 31, 96, 19, -10, 2 }, { 0, 2, -12, 29, 96, 21, -10, 2 }, + { 0, 2, -11, 28, 96, 22, -11, 2 }, { 0, 2, -11, 26, 96, 24, -11, 2 }, +}; + +// Filters for interpolation (0.875-band) - note this also filters integer pels. +static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = { + { 3, -8, 13, 112, 13, -8, 3, 0 }, { 2, -7, 12, 112, 15, -8, 3, -1 }, + { 3, -7, 10, 112, 17, -9, 3, -1 }, { 2, -6, 8, 112, 19, -9, 3, -1 }, + { 2, -6, 7, 112, 21, -10, 3, -1 }, { 2, -5, 6, 111, 22, -10, 3, -1 }, + { 2, -5, 4, 111, 24, -10, 3, -1 }, { 2, -4, 3, 110, 26, -11, 3, -1 }, + { 2, -4, 1, 110, 28, -11, 3, -1 }, { 2, -4, 0, 109, 30, -12, 4, -1 }, + { 1, -3, -1, 108, 32, -12, 4, -1 }, { 1, -3, -2, 108, 34, -13, 4, -1 }, + { 1, -2, -4, 107, 36, -13, 4, -1 }, { 1, -2, -5, 106, 38, -13, 4, -1 }, + { 1, -1, -6, 105, 40, -14, 4, -1 }, { 1, -1, -7, 104, 42, -14, 4, -1 }, + { 1, -1, -7, 103, 44, -15, 4, -1 }, { 1, 0, -8, 101, 46, -15, 4, -1 }, + { 1, 0, -9, 100, 48, -15, 4, -1 }, { 1, 0, -10, 99, 50, -15, 4, -1 }, + { 1, 1, -11, 97, 53, -16, 4, -1 }, { 0, 1, -11, 96, 55, -16, 4, -1 }, + { 0, 1, -12, 95, 57, -16, 4, -1 }, { 0, 2, -13, 93, 59, -16, 4, -1 }, + { 0, 2, -13, 91, 61, -16, 4, -1 }, { 0, 2, -14, 90, 63, -16, 4, -1 }, + { 0, 2, -14, 88, 65, -16, 4, -1 }, { 0, 2, -15, 86, 67, -16, 4, 0 }, + { 0, 3, -15, 84, 69, -17, 4, 0 }, { 0, 3, -16, 83, 71, -17, 4, 0 }, + { 0, 3, -16, 81, 73, -16, 3, 0 }, { 0, 3, -16, 79, 75, -16, 3, 0 }, + { 0, 3, -16, 77, 77, -16, 3, 0 }, { 0, 3, -16, 75, 79, -16, 3, 0 }, + { 0, 3, -16, 73, 81, -16, 3, 0 }, { 0, 4, -17, 71, 83, -16, 3, 0 }, + { 0, 4, -17, 69, 84, -15, 3, 0 }, { 0, 4, -16, 67, 86, -15, 2, 0 }, + { -1, 4, -16, 65, 88, -14, 2, 0 }, { -1, 4, -16, 63, 90, -14, 2, 0 }, + { -1, 4, -16, 61, 91, -13, 2, 0 }, { -1, 4, -16, 59, 93, -13, 2, 0 }, + 
{ -1, 4, -16, 57, 95, -12, 1, 0 }, { -1, 4, -16, 55, 96, -11, 1, 0 }, + { -1, 4, -16, 53, 97, -11, 1, 1 }, { -1, 4, -15, 50, 99, -10, 0, 1 }, + { -1, 4, -15, 48, 100, -9, 0, 1 }, { -1, 4, -15, 46, 101, -8, 0, 1 }, + { -1, 4, -15, 44, 103, -7, -1, 1 }, { -1, 4, -14, 42, 104, -7, -1, 1 }, + { -1, 4, -14, 40, 105, -6, -1, 1 }, { -1, 4, -13, 38, 106, -5, -2, 1 }, + { -1, 4, -13, 36, 107, -4, -2, 1 }, { -1, 4, -13, 34, 108, -2, -3, 1 }, + { -1, 4, -12, 32, 108, -1, -3, 1 }, { -1, 4, -12, 30, 109, 0, -4, 2 }, + { -1, 3, -11, 28, 110, 1, -4, 2 }, { -1, 3, -11, 26, 110, 3, -4, 2 }, + { -1, 3, -10, 24, 111, 4, -5, 2 }, { -1, 3, -10, 22, 111, 6, -5, 2 }, + { -1, 3, -10, 21, 112, 7, -6, 2 }, { -1, 3, -9, 19, 112, 8, -6, 2 }, + { -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 }, +}; + +const int16_t av1_resize_filter_normative[( + 1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = { +#if UPSCALE_NORMATIVE_TAPS == 8 + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 }, + { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 }, + { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 }, + { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 }, + { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 }, + { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 }, + { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 }, + { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 }, + { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 }, + { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 }, + { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 }, + { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 }, + { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 }, + { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 }, + { -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, 
-20, 86, 71, -19, 6, -1 }, + { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 }, + { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 }, + { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 }, + { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 }, + { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 }, + { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 }, + { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 }, + { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 }, + { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 }, + { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 }, + { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 }, + { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 }, + { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 }, + { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 }, + { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 }, + { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 }, + { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 }, +#else +#error "Invalid value of UPSCALE_NORMATIVE_TAPS" +#endif // UPSCALE_NORMATIVE_TAPS == 8 +}; + +// Filters for interpolation (full-band) - no filtering for integer pixels +#define filteredinterp_filters1000 av1_resize_filter_normative + +// Filters for factor of 2 downsampling. 
+static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; +static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; + +static const InterpKernel *choose_interp_filter(int in_length, int out_length) { + int out_length16 = out_length * 16; + if (out_length16 >= in_length * 16) + return filteredinterp_filters1000; + else if (out_length16 >= in_length * 13) + return filteredinterp_filters875; + else if (out_length16 >= in_length * 11) + return filteredinterp_filters750; + else if (out_length16 >= in_length * 9) + return filteredinterp_filters625; + else + return filteredinterp_filters500; +} + +static void interpolate_core(const uint8_t *const input, int in_length, + uint8_t *output, int out_length, + const int16_t *interp_filters, int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + uint8_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 
1), 0)]; + } + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } else { + // Initial part. + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // End part. + for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } +} + +static void interpolate(const uint8_t *const input, int in_length, + uint8_t *output, int out_length) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + interpolate_core(input, in_length, output, out_length, &interp_filters[0][0], + SUBPEL_TAPS); +} + +int32_t av1_get_upscale_convolve_step(int in_length, int out_length) { + return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; +} + +static int32_t get_upscale_convolve_x0(int in_length, int out_length, + int32_t x_step_qn) { + const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS); + const int32_t x0 = 
+ (-((out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + + RS_SCALE_EXTRA_OFF - err / 2; + return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK); +} + +static void down2_symeven(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half. + const int16_t *filter = av1_down2_symeven_half_filter; + const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + 1 + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static void down2_symodd(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half - 1. 
+ const int16_t *filter = av1_down2_symodd_half_filter; + const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + + input[(i + j >= length ? length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[(i + j >= length ? 
length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static int get_down2_length(int length, int steps) { + for (int s = 0; s < steps; ++s) length = (length + 1) >> 1; + return length; +} + +static int get_down2_steps(int in_length, int out_length) { + int steps = 0; + int proj_in_length; + while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { + ++steps; + in_length = proj_in_length; + if (in_length == 1) { + // Special case: we break because any further calls to get_down2_length() + // with be with length == 1, which return 1, resulting in an infinite + // loop. + break; + } + } + return steps; +} + +static void resize_multistep(const uint8_t *const input, int length, + uint8_t *output, int olength, uint8_t *otmp) { + if (length == olength) { + memcpy(output, input, sizeof(output[0]) * length); + return; + } + const int steps = get_down2_steps(length, olength); + + if (steps > 0) { + uint8_t *out = NULL; + int filteredlength = length; + + assert(otmp != NULL); + uint8_t *otmp2 = otmp + get_down2_length(length, 1); + for (int s = 0; s < steps; ++s) { + const int proj_filteredlength = get_down2_length(filteredlength, 1); + const uint8_t *const in = (s == 0 ? input : out); + if (s == steps - 1 && proj_filteredlength == olength) + out = output; + else + out = (s & 1 ? 
otmp2 : otmp); + if (filteredlength & 1) + down2_symodd(in, filteredlength, out); + else + down2_symeven(in, filteredlength, out); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + interpolate(out, filteredlength, output, olength); + } + } else { + interpolate(input, length, output, olength); + } +} + +static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +bool av1_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, int width2, + int out_stride) { + int i; + bool mem_status = true; + uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height); + uint8_t *tmpbuf = + (uint8_t *)aom_malloc(sizeof(uint8_t) * AOMMAX(width, height)); + uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * height); + uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) { + mem_status = false; + goto Error; + } + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + for (i = 0; i < height; ++i) + resize_multistep(input + in_stride * i, width, intbuf + width2 * i, width2, + tmpbuf); + for (i = 0; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf); + fill_arr_to_col(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(tmpbuf); + aom_free(arrbuf); + aom_free(arrbuf2); + return mem_status; +} + +static bool upscale_normative_rect(const uint8_t *const input, int height, + int width, int 
in_stride, uint8_t *output, + int height2, int width2, int out_stride, + int x_step_qn, int x0_qn, int pad_left, + int pad_right) { + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + assert(height2 == height); + + // Extend the left/right pixels of the tile column if needed + // (either because we can't sample from other tiles, or because we're at + // a frame edge). + // Save the overwritten pixels into tmp_left and tmp_right. + // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra + // column of border pixels compared to what we'd naively think. + const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; + uint8_t *tmp_left = + NULL; // Silence spurious "may be used uninitialized" warnings + uint8_t *tmp_right = NULL; + uint8_t *const in_tl = (uint8_t *)(input - border_cols); // Cast off 'const' + uint8_t *const in_tr = (uint8_t *)(input + width); + if (pad_left) { + tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); + if (!tmp_left) return false; + for (int i = 0; i < height; i++) { + memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols); + memset(in_tl + i * in_stride, input[i * in_stride], border_cols); + } + } + if (pad_right) { + tmp_right = + (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); + if (!tmp_right) { + aom_free(tmp_left); + return false; + } + for (int i = 0; i < height; i++) { + memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols); + memset(in_tr + i * in_stride, input[i * in_stride + width - 1], + border_cols); + } + } + + av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2, + height2, &av1_resize_filter_normative[0][0], x0_qn, + x_step_qn); + + // Restore the left/right border pixels + if (pad_left) { + for (int i = 0; i < height; i++) { + memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols); + } + aom_free(tmp_left); + } + if (pad_right) { + for (int i = 0; i < height; 
i++) { + memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols); + } + aom_free(tmp_right); + } + return true; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_interpolate_core(const uint16_t *const input, int in_length, + uint16_t *output, int out_length, int bd, + const int16_t *interp_filters, + int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + uint16_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; + } + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + } else { + // Initial part. 
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + // End part. + for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + } +} + +static void highbd_interpolate(const uint16_t *const input, int in_length, + uint16_t *output, int out_length, int bd) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + highbd_interpolate_core(input, in_length, output, out_length, bd, + &interp_filters[0][0], SUBPEL_TAPS); +} + +static void highbd_down2_symeven(const uint16_t *const input, int length, + uint16_t *output, int bd) { + // Actual filter len = 2 * filter_len_half. 
+ static const int16_t *filter = av1_down2_symeven_half_filter; + const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; + int i, j; + uint16_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[AOMMAX(0, i - j)] + input[AOMMIN(i + 1 + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[AOMMAX(0, i - j)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } +} + +static void highbd_down2_symodd(const uint16_t *const input, int length, + uint16_t *output, int bd) { + // Actual filter len = 2 * filter_len_half - 1. + static const int16_t *filter = av1_down2_symodd_half_filter; + const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2; + int i, j; + uint16_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. 
+ for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[AOMMIN(i + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } +} + +static void highbd_resize_multistep(const uint16_t *const input, int length, + uint16_t *output, int olength, + uint16_t *otmp, int bd) { + if (length == olength) { + memcpy(output, input, sizeof(output[0]) * length); + return; + } + const int steps = get_down2_steps(length, olength); + + if (steps > 0) { + uint16_t *out = NULL; + int filteredlength = length; + + assert(otmp != NULL); + uint16_t *otmp2 = otmp + get_down2_length(length, 1); + for (int s = 0; s < steps; ++s) { + const int proj_filteredlength = get_down2_length(filteredlength, 1); + const uint16_t *const in = (s == 0 ? input : out); + if (s == steps - 1 && proj_filteredlength == olength) + out = output; + else + out = (s & 1 ? 
otmp2 : otmp); + if (filteredlength & 1) + highbd_down2_symodd(in, filteredlength, out, bd); + else + highbd_down2_symeven(in, filteredlength, out, bd); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + highbd_interpolate(out, filteredlength, output, olength, bd); + } + } else { + highbd_interpolate(input, length, output, olength, bd); + } +} + +static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len, + uint16_t *arr) { + int i; + uint16_t *iptr = img; + uint16_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len, + uint16_t *arr) { + int i; + uint16_t *iptr = img; + uint16_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride, int bd) { + int i; + uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height); + uint16_t *tmpbuf = + (uint16_t *)aom_malloc(sizeof(uint16_t) * AOMMAX(width, height)); + uint16_t *arrbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * height); + uint16_t *arrbuf2 = (uint16_t *)aom_malloc(sizeof(uint16_t) * height2); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) + goto Error; + for (i = 0; i < height; ++i) { + highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, + intbuf + width2 * i, width2, tmpbuf, bd); + } + for (i = 0; i < width2; ++i) { + highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf); + highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf, bd); + highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2, + arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(tmpbuf); + aom_free(arrbuf); + aom_free(arrbuf2); +} + +static bool highbd_upscale_normative_rect(const uint8_t *const input, + int 
height, int width, int in_stride, + uint8_t *output, int height2, + int width2, int out_stride, + int x_step_qn, int x0_qn, + int pad_left, int pad_right, int bd) { + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + assert(height2 == height); + + // Extend the left/right pixels of the tile column if needed + // (either because we can't sample from other tiles, or because we're at + // a frame edge). + // Save the overwritten pixels into tmp_left and tmp_right. + // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra + // column of border pixels compared to what we'd naively think. + const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; + const int border_size = border_cols * sizeof(uint16_t); + uint16_t *tmp_left = + NULL; // Silence spurious "may be used uninitialized" warnings + uint16_t *tmp_right = NULL; + uint16_t *const input16 = CONVERT_TO_SHORTPTR(input); + uint16_t *const in_tl = input16 - border_cols; + uint16_t *const in_tr = input16 + width; + if (pad_left) { + tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); + if (!tmp_left) return false; + for (int i = 0; i < height; i++) { + memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size); + aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols); + } + } + if (pad_right) { + tmp_right = + (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); + if (!tmp_right) { + aom_free(tmp_left); + return false; + } + for (int i = 0; i < height; i++) { + memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size); + aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1], + border_cols); + } + } + + av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride, + CONVERT_TO_SHORTPTR(output), out_stride, width2, + height2, &av1_resize_filter_normative[0][0], + x0_qn, x_step_qn, bd); + + // Restore the left/right border pixels + if (pad_left) { + for (int 
i = 0; i < height; i++) { + memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_size); + } + aom_free(tmp_left); + } + if (pad_right) { + for (int i = 0; i < height; i++) { + memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_size); + } + aom_free(tmp_right); + } + return true; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride)) + abort(); + if (!av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride)) + abort(); + if (!av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride)) + abort(); +} + +bool av1_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride)) + return false; + if (!av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight, + owidth / 2, ouv_stride)) + return false; + if (!av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight, + owidth / 2, ouv_stride)) + return false; + return true; +} + +bool av1_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride)) + return false; + if (!av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride)) + return 
false; + if (!av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride)) + return false; + return true; +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride, bd); +} + +void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, + owidth / 2, ouv_stride, bd); +} + +void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride, bd); + av1_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride, bd); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, + 
YV12_BUFFER_CONFIG *dst, + const InterpFilter filter, + const int phase_scaler, + const int num_planes) { + assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH || + filter == EIGHTTAP_REGULAR); + const InterpKernel *const kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter].filter_ptr; + + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; + const int src_w = src->crop_widths[is_uv]; + const int src_h = src->crop_heights[is_uv]; + const uint8_t *src_buffer = src->buffers[i]; + const int src_stride = src->strides[is_uv]; + const int dst_w = dst->crop_widths[is_uv]; + const int dst_h = dst->crop_heights[is_uv]; + uint8_t *dst_buffer = dst->buffers[i]; + const int dst_stride = dst->strides[is_uv]; + for (int y = 0; y < dst_h; y += 16) { + const int y_q4 = + src_h == dst_h ? 0 : y * 16 * src_h / dst_h + phase_scaler; + for (int x = 0; x < dst_w; x += 16) { + const int x_q4 = + src_w == dst_w ? 0 : x * 16 * src_w / dst_w + phase_scaler; + const uint8_t *src_ptr = + src_buffer + y * src_h / dst_h * src_stride + x * src_w / dst_w; + uint8_t *dst_ptr = dst_buffer + y * dst_stride + x; + + // Width and height of the actual working area. + const int work_w = AOMMIN(16, dst_w - x); + const int work_h = AOMMIN(16, dst_h - y); + // SIMD versions of aom_scaled_2d() have some trouble handling + // nonstandard sizes, so fall back on the C version to handle borders. 
+ if (work_w != 16 || work_h != 16) { + aom_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, work_w, work_h); + } else { + aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16, 16); + } + } + } + } + aom_extend_frame_borders(dst, num_planes); +} + +bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd, + const int num_planes) { + // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; +#if CONFIG_AV1_HIGHBITDEPTH + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv], bd); + } else if (!av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], + dst->strides[is_uv])) { + return false; + } +#else + (void)bd; + if (!av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv])) + return false; +#endif + } + aom_extend_frame_borders(dst, num_planes); + return true; +} + +void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int plane, int rows) { + const int is_uv = (plane > 0); + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x); + const int 
upscaled_plane_width = + ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); + const int superres_denom = cm->superres_scale_denominator; + + TileInfo tile_col; + const int32_t x_step_qn = av1_get_upscale_convolve_step( + downscaled_plane_width, upscaled_plane_width); + int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width, + upscaled_plane_width, x_step_qn); + + for (int j = 0; j < cm->tiles.cols; j++) { + av1_tile_set_col(&tile_col, cm, j); + // Determine the limits of this tile column in both the source + // and destination images. + // Note: The actual location which we start sampling from is + // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases + // by exactly dst_width * (x_step_qn/2^14) pixels each iteration. + const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x); + const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x); + const int src_width = downscaled_x1 - downscaled_x0; + + const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR; + int upscaled_x1; + if (j == cm->tiles.cols - 1) { + // Note that we can't just use AOMMIN here - due to rounding, + // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than + // upscaled_plane_width. 
+ upscaled_x1 = upscaled_plane_width; + } else { + upscaled_x1 = (downscaled_x1 * superres_denom) / SCALE_NUMERATOR; + } + + const uint8_t *const src_ptr = src + downscaled_x0; + uint8_t *const dst_ptr = dst + upscaled_x0; + const int dst_width = upscaled_x1 - upscaled_x0; + + const int pad_left = (j == 0); + const int pad_right = (j == cm->tiles.cols - 1); + + bool success; +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params->use_highbitdepth) + success = highbd_upscale_normative_rect( + src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, + dst_stride, x_step_qn, x0_qn, pad_left, pad_right, + cm->seq_params->bit_depth); + else + success = upscale_normative_rect(src_ptr, rows, src_width, src_stride, + dst_ptr, rows, dst_width, dst_stride, + x_step_qn, x0_qn, pad_left, pad_right); +#else + success = upscale_normative_rect(src_ptr, rows, src_width, src_stride, + dst_ptr, rows, dst_width, dst_stride, + x_step_qn, x0_qn, pad_left, pad_right); +#endif + if (!success) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error upscaling frame"); + } + // Update the fractional pixel offset to prepare for the next tile column. 
+ x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS); + } +} + +void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + const int num_planes = av1_num_planes(cm); + for (int i = 0; i < num_planes; ++i) { + const int is_uv = (i > 0); + av1_upscale_normative_rows(cm, src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], i, + src->crop_heights[is_uv]); + } + + aom_extend_frame_borders(dst, num_planes); +} + +YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required( + AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + const InterpFilter filter, const int phase, const bool use_optimized_scaler, + const bool for_psnr, const int border_in_pixels, + const int num_pyramid_levels) { + // If scaling is performed for the sole purpose of calculating PSNR, then our + // target dimensions are superres upscaled width/height. Otherwise our target + // dimensions are coded width/height. + const int scaled_width = for_psnr ? cm->superres_upscaled_width : cm->width; + const int scaled_height = + for_psnr ? cm->superres_upscaled_height : cm->height; + const bool scaling_required = (scaled_width != unscaled->y_crop_width) || + (scaled_height != unscaled->y_crop_height); + + if (scaling_required) { + const int num_planes = av1_num_planes(cm); + const SequenceHeader *seq_params = cm->seq_params; + + // Reallocate the frame buffer based on the target dimensions when scaling + // is required. 
+ if (aom_realloc_frame_buffer( + scaled, scaled_width, scaled_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, + num_pyramid_levels, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate scaled buffer"); + + bool has_optimized_scaler = av1_has_optimized_scaler( + unscaled->y_crop_width, unscaled->y_crop_height, scaled_width, + scaled_height); + if (num_planes > 1) { + has_optimized_scaler = has_optimized_scaler && + av1_has_optimized_scaler(unscaled->uv_crop_width, + unscaled->uv_crop_height, + scaled->uv_crop_width, + scaled->uv_crop_height); + } + +#if CONFIG_AV1_HIGHBITDEPTH + if (use_optimized_scaler && has_optimized_scaler && + cm->seq_params->bit_depth == AOM_BITS_8) { + av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes); + } else { + if (!av1_resize_and_extend_frame_nonnormative( + unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate buffers during resize"); + } +#else + if (use_optimized_scaler && has_optimized_scaler) { + av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes); + } else { + if (!av1_resize_and_extend_frame_nonnormative( + unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate buffers during resize"); + } +#endif + return scaled; + } + return unscaled; +} + +// Calculates the scaled dimension given the original dimension and the scale +// denominator. +static void calculate_scaled_size_helper(int *dim, int denom) { + if (denom != SCALE_NUMERATOR) { + // We need to ensure the constraint in "Appendix A" of the spec: + // * FrameWidth is greater than or equal to 16 + // * FrameHeight is greater than or equal to 16 + // For this, we clamp the downscaled dimension to at least 16. 
One + // exception: if original dimension itself was < 16, then we keep the + // downscaled dimension to be same as the original, to ensure that resizing + // is valid. + const int min_dim = AOMMIN(16, *dim); + // Use this version if we need *dim to be even + // *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom); + // *width <<= 1; + *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom); + *dim = AOMMAX(*dim, min_dim); + } +} + +void av1_calculate_scaled_size(int *width, int *height, int resize_denom) { + calculate_scaled_size_helper(width, resize_denom); + calculate_scaled_size_helper(height, resize_denom); +} + +void av1_calculate_scaled_superres_size(int *width, int *height, + int superres_denom) { + (void)height; + calculate_scaled_size_helper(width, superres_denom); +} + +void av1_calculate_unscaled_superres_size(int *width, int *height, int denom) { + if (denom != SCALE_NUMERATOR) { + // Note: av1_calculate_scaled_superres_size() rounds *up* after division + // when the resulting dimensions are odd. So here, we round *down*. + *width = *width * denom / SCALE_NUMERATOR; + (void)height; + } +} + +// Copy only the config data from 'src' to 'dst'. +static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src, + YV12_BUFFER_CONFIG *const dst) { + dst->bit_depth = src->bit_depth; + dst->color_primaries = src->color_primaries; + dst->transfer_characteristics = src->transfer_characteristics; + dst->matrix_coefficients = src->matrix_coefficients; + dst->monochrome = src->monochrome; + dst->chroma_sample_position = src->chroma_sample_position; + dst->color_range = src->color_range; +} + +// TODO(afergs): Look for in-place upscaling +// TODO(afergs): aom_ vs av1_ functions? Which can I use? +// Upscale decoded image. 
+// Replaces the current frame (cm->cur_frame->buf) with its superres-upscaled
+// version. Does nothing when av1_superres_scaled(cm) is false.
+//
+// Steps:
+//  1. Copy the current frame into a temporary buffer whose width is
+//     ALIGN_POWER_OF_TWO(cm->width, 3) (presumably cm->width rounded up to a
+//     multiple of 8 - TODO confirm) and whose height is cm->height.
+//  2. Reallocate the current frame buffer at cm->superres_upscaled_width x
+//     cm->superres_upscaled_height. When 'pool' is non-NULL the pool's
+//     release/get callbacks are used while the pool is locked; otherwise the
+//     buffer is allocated directly and the config data is saved/restored
+//     around the allocation with copy_buffer_config().
+//  3. Upscale from the temporary copy back into the current frame and free
+//     the temporary copy.
+//
+// Any allocation or release failure is reported through aom_internal_error()
+// with AOM_CODEC_MEM_ERROR.
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool,
+                          int num_pyramid_levels) {
+  const int num_planes = av1_num_planes(cm);
+  if (!av1_superres_scaled(cm)) return;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  const int byte_alignment = cm->features.byte_alignment;
+
+  YV12_BUFFER_CONFIG copy_buffer;
+  memset(&copy_buffer, 0, sizeof(copy_buffer));
+
+  YV12_BUFFER_CONFIG *const frame_to_show = &cm->cur_frame->buf;
+
+  const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
+  if (aom_alloc_frame_buffer(
+          &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          AOM_BORDER_IN_PIXELS, byte_alignment, 0, 0))
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate copy buffer for superres upscaling");
+
+  // Copy function assumes the frames are the same size.
+  // Note that it does not copy YV12_BUFFER_CONFIG config data.
+  aom_yv12_copy_frame(frame_to_show, &copy_buffer, num_planes);
+
+  assert(copy_buffer.y_crop_width == aligned_width);
+  assert(copy_buffer.y_crop_height == cm->height);
+
+  // Realloc the current frame buffer at a higher resolution in place.
+  if (pool != NULL) {
+    // Use callbacks if on the decoder.
+    aom_codec_frame_buffer_t *fb = &cm->cur_frame->raw_frame_buffer;
+    aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb;
+    aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb;
+    void *cb_priv = pool->cb_priv;
+
+    lock_buffer_pool(pool);
+    // Realloc with callback does not release the frame buffer - release first.
+    if (release_fb_cb(cb_priv, fb)) {
+      unlock_buffer_pool(pool);
+      aom_internal_error(
+          cm->error, AOM_CODEC_MEM_ERROR,
+          "Failed to free current frame buffer before superres upscaling");
+    }
+    // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
+    if (aom_realloc_frame_buffer(
+            frame_to_show, cm->superres_upscaled_width,
+            cm->superres_upscaled_height, seq_params->subsampling_x,
+            seq_params->subsampling_y, seq_params->use_highbitdepth,
+            AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv,
+            num_pyramid_levels, 0)) {
+      unlock_buffer_pool(pool);
+      aom_internal_error(
+          cm->error, AOM_CODEC_MEM_ERROR,
+          "Failed to allocate current frame buffer for superres upscaling");
+    }
+    unlock_buffer_pool(pool);
+  } else {
+    // Make a copy of the config data for frame_to_show in copy_buffer
+    copy_buffer_config(frame_to_show, &copy_buffer);
+
+    // Don't use callbacks on the encoder.
+    // aom_alloc_frame_buffer() clears the config data for frame_to_show
+    if (aom_alloc_frame_buffer(
+            frame_to_show, cm->superres_upscaled_width,
+            cm->superres_upscaled_height, seq_params->subsampling_x,
+            seq_params->subsampling_y, seq_params->use_highbitdepth,
+            AOM_BORDER_IN_PIXELS, byte_alignment, num_pyramid_levels, 0))
+      aom_internal_error(
+          cm->error, AOM_CODEC_MEM_ERROR,
+          "Failed to reallocate current frame buffer for superres upscaling");
+
+    // Restore config data back to frame_to_show
+    copy_buffer_config(&copy_buffer, frame_to_show);
+  }
+  // TODO(afergs): verify frame_to_show is correct after realloc
+  //               encoder:
+  //               decoder:
+
+  assert(frame_to_show->y_crop_width == cm->superres_upscaled_width);
+  assert(frame_to_show->y_crop_height == cm->superres_upscaled_height);
+
+  // Scale up and back into frame_to_show.
+  assert(frame_to_show->y_crop_width != cm->width);
+  av1_upscale_normative_and_extend_frame(cm, &copy_buffer, frame_to_show);
+
+  // Free the copy buffer
+  aom_free_frame_buffer(&copy_buffer);
+}
diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h
new file mode 100644
index 0000000000..0ba3108f72
--- /dev/null
+++ b/third_party/aom/av1/common/resize.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RESIZE_H_
+#define AOM_AV1_COMMON_RESIZE_H_
+
+// NOTE(review): the header name of the next #include appears to have been
+// lost in extraction (presumably an angle-bracket system header) - restore it
+// from upstream.
+#include
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bool av1_resize_plane(const uint8_t *const input, int height, int width,
+                      int in_stride, uint8_t *output, int height2, int width2,
+                      int out_stride);
+// TODO(aomedia:3228): In libaom 4.0.0, remove av1_resize_frame420 from
+// av1/exports_com and delete this function.
+// Resizes a 4:2:0 frame: the Y plane at full size and the U/V planes at half
+// width and half height, each through av1_resize_plane(). Calls abort() if
+// resizing any plane fails.
+void av1_resize_frame420(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth);
+// Resizes a 4:2:2 frame: the U/V planes use half the luma width and the full
+// luma height. Returns false as soon as av1_resize_plane() fails for a plane,
+// true otherwise.
+bool av1_resize_frame422(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth);
+// Resizes a 4:4:4 frame: all three planes use the full width and height.
+// Returns false as soon as av1_resize_plane() fails for a plane, true
+// otherwise.
+bool av1_resize_frame444(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth);
+
+// High bit depth variants; 'bd' is the bit depth. These return void, so a
+// failure inside them is not reported to the caller.
+void av1_highbd_resize_plane(const uint8_t *const input, int height, int width,
+                             int in_stride, uint8_t *output, int height2,
+                             int width2, int out_stride, int bd);
+// Same plane layout as av1_resize_frame420(), using
+// av1_highbd_resize_plane() for each plane.
+void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd);
+// Same plane layout as av1_resize_frame422(), using
+// av1_highbd_resize_plane() for each plane.
+void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd);
+// Same plane layout as av1_resize_frame444(), using
+// av1_highbd_resize_plane() for each plane.
+void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd);
+
+// Horizontally upscales 'rows' rows of plane 'plane' from 'src' into 'dst',
+// one tile column at a time. A failure in the per-tile upscale is reported
+// through aom_internal_error() with AOM_CODEC_MEM_ERROR.
+void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
+                                int src_stride, uint8_t *dst, int dst_stride,
+                                int plane, int rows);
+// Runs av1_upscale_normative_rows() over every plane of 'src' (using the
+// plane's crop height as the row count) and then extends the borders of
+// 'dst'.
+void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
+                                            const YV12_BUFFER_CONFIG *src,
+                                            YV12_BUFFER_CONFIG *dst);
+
+// Returns 'unscaled' when its luma crop size already matches the target size
+// (cm->width x cm->height, or the superres upscaled size when 'for_psnr' is
+// set). Otherwise reallocates 'scaled' at the target size, resizes 'unscaled'
+// into it and returns 'scaled'. Allocation failures are reported through
+// aom_internal_error().
+YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required(
+    AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    const InterpFilter filter, const int phase, const bool use_optimized_scaler,
+    const bool for_psnr, const int border_in_pixels,
+    const int num_pyramid_levels);
+
+// Resizes every plane of 'src' into 'dst' and extends the borders of 'dst'.
+// Returns false if av1_resize_plane() fails for a plane, true otherwise. The
+// high bit depth path (av1_highbd_resize_plane()) reports no failures.
+bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                              YV12_BUFFER_CONFIG *dst, int bd,
+                                              const int num_planes);
+
+// Calculates the scaled dimensions from the given original dimensions and the
+// resize scale denominator.
+void av1_calculate_scaled_size(int *width, int *height, int resize_denom);
+
+// Similar to above, but calculates scaled dimensions after superres from the
+// given original dimensions and superres scale denominator.
+void av1_calculate_scaled_superres_size(int *width, int *height,
+                                        int superres_denom);
+
+// Inverse of av1_calculate_scaled_superres_size() above: calculates the
+// original dimensions from the given scaled dimensions and the scale
+// denominator.
+void av1_calculate_unscaled_superres_size(int *width, int *height, int denom);
+
+// Upscales the current frame of 'cm' to its superres upscaled size in place.
+// A non-NULL 'pool' selects reallocation through the pool's frame buffer
+// callbacks; a NULL 'pool' selects direct allocation.
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool,
+                          int num_pyramid_levels);
+
+// Returns 1 if a superres upscaled frame is scaled and 0 otherwise.
+static INLINE int av1_superres_scaled(const AV1_COMMON *cm) {
+  // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
+  // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
+  // So, the following check is more accurate.
+  return (cm->width != cm->superres_upscaled_width);
+}
+
+// The optimized scaler av1_resize_and_extend_frame() can only handle scaling
+// ratios >= 1/4 and <= 16. See comment in aom_convolve8_c() for detail.
+// Visual assessment shows that if the scaling ratio or its reciprocal is not a
+// multiple of 1/16, there are some artifacts in the output of the optimized
+// scaler, especially on lines, due to non-exact ratio representation. SSSE3
+// and NEON have a specialized 3/4 version of av1_resize_and_extend_frame()
+// that does not have this issue.
+//
+// Use the non-normative scaler av1_resize_and_extend_frame_nonnormative()
+// for other scaling ratios.
+static INLINE bool av1_has_optimized_scaler(const int src_width,
+                                            const int src_height,
+                                            const int dst_width,
+                                            const int dst_height) {
+  // General case: the ratio is within [1/4, 16] in both dimensions, and both
+  // the ratio and its reciprocal are multiples of 1/16. The && chain
+  // short-circuits, so the modulo checks only run after the range checks
+  // have passed.
+  bool has_optimized_scaler =
+      (dst_width * 4 >= src_width && dst_height * 4 >= src_height) &&
+      (dst_width <= src_width * 16 && dst_height <= src_height * 16) &&
+      (16 * dst_width % src_width == 0) && (16 * src_width % dst_width == 0) &&
+      (16 * dst_height % src_height == 0) &&
+      (16 * src_height % dst_height == 0);
+#if HAVE_SSSE3 || HAVE_NEON
+  // Additionally accept an exact 3/4 ratio in both dimensions.
+  has_optimized_scaler =
+      has_optimized_scaler ||
+      (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height);
+#endif
+  return has_optimized_scaler;
+}
+
+// Second dimension of av1_resize_filter_normative, declared below.
+#define UPSCALE_NORMATIVE_TAPS 8
+extern const int16_t av1_resize_filter_normative[1 << RS_SUBPEL_BITS]
+                                                [UPSCALE_NORMATIVE_TAPS];
+
+int32_t av1_get_upscale_convolve_step(int in_length, int out_length);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_RESIZE_H_
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
new file mode 100644
index 0000000000..0be126fa65
--- /dev/null
+++ b/third_party/aom/av1/common/restoration.c
@@ -0,0 +1,1494 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ */
+
+// NOTE(review): the header name of the next #include appears to have been
+// lost in extraction (presumably an angle-bracket system header) - restore it
+// from upstream.
+#include
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "av1/common/thread_common.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+
+#include "aom_ports/mem.h"
+
+// The 's' values are calculated based on original 'r' and 'e' values in the
+// spec using GenSgrprojVtable().
+// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
+const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
+  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
+  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
+  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
+  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
+  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
+  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
+  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
+  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
+};
+
+// Writes the dimensions of a plane to *plane_w and *plane_h. The width is
+// derived from cm->superres_upscaled_width and the height from cm->height;
+// for a chroma plane (is_uv != 0) each is passed through ROUND_POWER_OF_TWO
+// with the sequence's subsampling_x / subsampling_y, and for luma the shift
+// is 0.
+void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w,
+                                  int *plane_h) {
+  int ss_x = is_uv && cm->seq_params->subsampling_x;
+  int ss_y = is_uv && cm->seq_params->subsampling_y;
+  *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+  *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y);
+}
+
+// Count horizontal or vertical units in a plane (use a width or height for
+// plane_size, respectively). We basically want to divide the plane size by the
+// size of a restoration unit.
Rather than rounding up unconditionally as you +// might expect, we round to nearest, which models the way a right or bottom +// restoration unit can extend to up to 150% its normal width or height. +// +// The max with 1 is to deal with small frames, which may be smaller than +// half of an LR unit in size. +int av1_lr_count_units(int unit_size, int plane_size) { + return AOMMAX((plane_size + (unit_size >> 1)) / unit_size, 1); +} + +void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi, + int is_uv) { + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + const int unit_size = rsi->restoration_unit_size; + const int horz_units = av1_lr_count_units(unit_size, plane_w); + const int vert_units = av1_lr_count_units(unit_size, plane_h); + + rsi->num_rest_units = horz_units * vert_units; + rsi->horz_units = horz_units; + rsi->vert_units = vert_units; + + aom_free(rsi->unit_info); + CHECK_MEM_ERROR(cm, rsi->unit_info, + (RestorationUnitInfo *)aom_memalign( + 16, sizeof(*rsi->unit_info) * rsi->num_rest_units)); +} + +void av1_free_restoration_struct(RestorationInfo *rst_info) { + aom_free(rst_info->unit_info); + rst_info->unit_info = NULL; +} + +#if 0 +// Pair of values for each sgrproj parameter: +// Index 0 corresponds to r[0], e[0] +// Index 1 corresponds to r[1], e[1] +int sgrproj_mtable[SGRPROJ_PARAMS][2]; + +static void GenSgrprojVtable(void) { + for (int i = 0; i < SGRPROJ_PARAMS; ++i) { + const sgr_params_type *const params = &av1_sgr_params[i]; + for (int j = 0; j < 2; ++j) { + const int e = params->e[j]; + const int r = params->r[j]; + if (r == 0) { // filter is disabled + sgrproj_mtable[i][j] = -1; // mark invalid + } else { // filter is enabled + const int n = (2 * r + 1) * (2 * r + 1); + const int n2e = n * n * e; + assert(n2e != 0); + sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e); + } + } + } +} +#endif + +void av1_loop_restoration_precal(void) { +#if 0 + GenSgrprojVtable(); +#endif 
+} + +static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert) { + uint8_t *data_p; + int i; + for (i = 0; i < height; ++i) { + data_p = data + i * stride; + memset(data_p - border_horz, data_p[0], border_horz); + memset(data_p + width, data_p[width - 1], border_horz); + } + data_p = data - border_horz; + for (i = -border_vert; i < 0; ++i) { + memcpy(data_p + i * stride, data_p, width + 2 * border_horz); + } + for (i = height; i < height + border_vert; ++i) { + memcpy(data_p + i * stride, data_p + (height - 1) * stride, + width + 2 * border_horz); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void extend_frame_highbd(uint16_t *data, int width, int height, + int stride, int border_horz, int border_vert) { + uint16_t *data_p; + int i, j; + for (i = 0; i < height; ++i) { + data_p = data + i * stride; + for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0]; + for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1]; + } + data_p = data - border_horz; + for (i = -border_vert; i < 0; ++i) { + memcpy(data_p + i * stride, data_p, + (width + 2 * border_horz) * sizeof(uint16_t)); + } + for (i = height; i < height + border_vert; ++i) { + memcpy(data_p + i * stride, data_p + (height - 1) * stride, + (width + 2 * border_horz) * sizeof(uint16_t)); + } +} + +static void copy_rest_unit_highbd(int width, int height, const uint16_t *src, + int src_stride, uint16_t *dst, + int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst)); +} +#endif + +void av1_extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride, + border_horz, border_vert); + return; + } +#endif + (void)highbd; + extend_frame_lowbd(data, width, height, stride, border_horz, border_vert); +} + +static 
void copy_rest_unit_lowbd(int width, int height, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width); +} + +static void copy_rest_unit(int width, int height, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride, + CONVERT_TO_SHORTPTR(dst), dst_stride); + return; + } +#endif + (void)highbd; + copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride); +} + +#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d)) + +// With striped loop restoration, the filtering for each 64-pixel stripe gets +// most of its input from the output of CDEF (stored in data8), but we need to +// fill out a border of 3 pixels above/below the stripe according to the +// following rules: +// +// * At the top and bottom of the frame, we copy the outermost row of CDEF +// pixels three times. This extension is done by a call to av1_extend_frame() +// at the start of the loop restoration process, so the value of +// copy_above/copy_below doesn't strictly matter. +// +// * All other boundaries are stripe boundaries within the frame. In that case, +// we take 2 rows of deblocked pixels and extend them to 3 rows of context. +static void get_stripe_boundary_info(const RestorationTileLimits *limits, + int plane_w, int plane_h, int ss_y, + int *copy_above, int *copy_below) { + (void)plane_w; + + *copy_above = 1; + *copy_below = 1; + + const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; + + const int first_stripe_in_plane = (limits->v_start == 0); + const int this_stripe_height = + full_stripe_height - (first_stripe_in_plane ? 
runit_offset : 0); + const int last_stripe_in_plane = + (limits->v_start + this_stripe_height >= plane_h); + + if (first_stripe_in_plane) *copy_above = 0; + if (last_stripe_in_plane) *copy_below = 0; +} + +// Overwrite the border pixels around a processing stripe so that the conditions +// listed above get_stripe_boundary_info() are preserved. +// We save the pixels which get overwritten into a temporary buffer, so that +// they can be restored by restore_processing_stripe_boundary() after we've +// processed the stripe. +// +// limits gives the rectangular limits of the remaining stripes for the current +// restoration unit. rsb is the stored stripe boundaries (taken from either +// deblock or CDEF output as necessary). +static void setup_processing_stripe_boundary( + const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb, + int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride, + RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) { + // Offsets within the line buffers. The buffer logically starts at column + // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ) + // has column x0 in the buffer. + const int buf_stride = rsb->stripe_boundary_stride; + const int buf_x0_off = limits->h_start; + const int line_width = + (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ; + const int line_size = line_width << use_highbd; + + const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ; + + // Replace RESTORATION_BORDER pixels above the top of the stripe + // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above + // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by + // duplicating the topmost of the 2 lines (see the AOMMAX call when + // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1). 
+ if (!opt) { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + for (int i = -RESTORATION_BORDER; i < 0; ++i) { + const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0); + const int buf_off = buf_x0_off + buf_row * buf_stride; + const uint8_t *buf = + rsb->stripe_boundary_above + (buf_off << use_highbd); + uint8_t *dst8 = data8_tl + i * data_stride; + // Save old pixels, then replace with data from stripe_boundary_above + memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER], + REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), buf, line_size); + } + } + + // Replace RESTORATION_BORDER pixels below the bottom of the stripe. + // The second buffer row is repeated, so src_row gets the values 0, 1, 1 + // for i = 0, 1, 2. + if (copy_below) { + const int stripe_end = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; + + for (int i = 0; i < RESTORATION_BORDER; ++i) { + const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1); + const int buf_off = buf_x0_off + buf_row * buf_stride; + const uint8_t *src = + rsb->stripe_boundary_below + (buf_off << use_highbd); + + uint8_t *dst8 = data8_bl + i * data_stride; + // Save old pixels, then replace with data from stripe_boundary_below + memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), src, line_size); + } + } + } else { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + // Only save and overwrite i=-RESTORATION_BORDER line. 
+ uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride; + // Save old pixels, then replace with data from stripe_boundary_above + memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), + REAL_PTR(use_highbd, + data8_tl + (-RESTORATION_BORDER + 1) * data_stride), + line_size); + } + + if (copy_below) { + const int stripe_end = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; + + // Only save and overwrite i=2 line. + uint8_t *dst8 = data8_bl + 2 * data_stride; + // Save old pixels, then replace with data from stripe_boundary_below + memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), + REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size); + } + } +} + +// Once a processing stripe is finished, this function sets the boundary +// pixels which were overwritten by setup_processing_stripe_boundary() +// back to their original values +static void restore_processing_stripe_boundary( + const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs, + int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above, + int copy_below, int opt) { + const int line_width = + (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ; + const int line_size = line_width << use_highbd; + + const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ; + + if (!opt) { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + for (int i = -RESTORATION_BORDER; i < 0; ++i) { + uint8_t *dst8 = data8_tl + i * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), + rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size); + } + } + + if (copy_below) { + const int stripe_bottom = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride; + + for (int i = 0; i < RESTORATION_BORDER; ++i) { + if (stripe_bottom + i >= limits->v_end + 
RESTORATION_BORDER) break; + + uint8_t *dst8 = data8_bl + i * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size); + } + } + } else { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + // Only restore i=-RESTORATION_BORDER line. + uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size); + } + + if (copy_below) { + const int stripe_bottom = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride; + + // Only restore i=2 line. + if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) { + uint8_t *dst8 = data8_bl + 2 * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size); + } + } + } +} + +static void wiener_filter_stripe(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int32_t *tmpbuf, int bit_depth, + struct aom_internal_error_info *error_info) { + (void)tmpbuf; + (void)bit_depth; + (void)error_info; + assert(bit_depth == 8); + const WienerConvolveParams conv_params = get_conv_params_wiener(8); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15); + const uint8_t *src_p = src + j; + uint8_t *dst_p = dst + j; + av1_wiener_convolve_add_src( + src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16, + rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params); + } +} + +/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1) + over the input. The window is of size (2r + 1)x(2r + 1), and we + specialize to r = 1, 2, 3. A default function is used for r > 3. + + Each loop follows the same format: We keep a window's worth of input + in individual variables and select data out of that as appropriate. 
+*/ +static void boxsum1(int32_t *src, int width, int height, int src_stride, + int sqr, int32_t *dst, int dst_stride) { + int i, j, a, b, c; + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + // Vertical sum over 3-pixel regions, from src into dst. + if (!sqr) { + for (j = 0; j < width; ++j) { + a = src[j]; + b = src[src_stride + j]; + c = src[2 * src_stride + j]; + + dst[j] = a + b; + for (i = 1; i < height - 2; ++i) { + // Loop invariant: At the start of each iteration, + // a = src[(i - 1) * src_stride + j] + // b = src[(i ) * src_stride + j] + // c = src[(i + 1) * src_stride + j] + dst[i * dst_stride + j] = a + b + c; + a = b; + b = c; + c = src[(i + 2) * src_stride + j]; + } + dst[i * dst_stride + j] = a + b + c; + dst[(i + 1) * dst_stride + j] = b + c; + } + } else { + for (j = 0; j < width; ++j) { + a = src[j] * src[j]; + b = src[src_stride + j] * src[src_stride + j]; + c = src[2 * src_stride + j] * src[2 * src_stride + j]; + + dst[j] = a + b; + for (i = 1; i < height - 2; ++i) { + dst[i * dst_stride + j] = a + b + c; + a = b; + b = c; + c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j]; + } + dst[i * dst_stride + j] = a + b + c; + dst[(i + 1) * dst_stride + j] = b + c; + } + } + + // Horizontal sum over 3-pixel regions of dst + for (i = 0; i < height; ++i) { + a = dst[i * dst_stride]; + b = dst[i * dst_stride + 1]; + c = dst[i * dst_stride + 2]; + + dst[i * dst_stride] = a + b; + for (j = 1; j < width - 2; ++j) { + // Loop invariant: At the start of each iteration, + // a = src[i * src_stride + (j - 1)] + // b = src[i * src_stride + (j )] + // c = src[i * src_stride + (j + 1)] + dst[i * dst_stride + j] = a + b + c; + a = b; + b = c; + c = dst[i * dst_stride + (j + 2)]; + } + dst[i * dst_stride + j] = a + b + c; + dst[i * dst_stride + (j + 1)] = b + c; + } +} + +static void boxsum2(int32_t *src, int width, int height, int src_stride, + int sqr, int32_t *dst, int dst_stride) { + int i, j, a, b, c, d, e; 
+ assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + // Vertical sum over 5-pixel regions, from src into dst. + if (!sqr) { + for (j = 0; j < width; ++j) { + a = src[j]; + b = src[src_stride + j]; + c = src[2 * src_stride + j]; + d = src[3 * src_stride + j]; + e = src[4 * src_stride + j]; + + dst[j] = a + b + c; + dst[dst_stride + j] = a + b + c + d; + for (i = 2; i < height - 3; ++i) { + // Loop invariant: At the start of each iteration, + // a = src[(i - 2) * src_stride + j] + // b = src[(i - 1) * src_stride + j] + // c = src[(i ) * src_stride + j] + // d = src[(i + 1) * src_stride + j] + // e = src[(i + 2) * src_stride + j] + dst[i * dst_stride + j] = a + b + c + d + e; + a = b; + b = c; + c = d; + d = e; + e = src[(i + 3) * src_stride + j]; + } + dst[i * dst_stride + j] = a + b + c + d + e; + dst[(i + 1) * dst_stride + j] = b + c + d + e; + dst[(i + 2) * dst_stride + j] = c + d + e; + } + } else { + for (j = 0; j < width; ++j) { + a = src[j] * src[j]; + b = src[src_stride + j] * src[src_stride + j]; + c = src[2 * src_stride + j] * src[2 * src_stride + j]; + d = src[3 * src_stride + j] * src[3 * src_stride + j]; + e = src[4 * src_stride + j] * src[4 * src_stride + j]; + + dst[j] = a + b + c; + dst[dst_stride + j] = a + b + c + d; + for (i = 2; i < height - 3; ++i) { + dst[i * dst_stride + j] = a + b + c + d + e; + a = b; + b = c; + c = d; + d = e; + e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j]; + } + dst[i * dst_stride + j] = a + b + c + d + e; + dst[(i + 1) * dst_stride + j] = b + c + d + e; + dst[(i + 2) * dst_stride + j] = c + d + e; + } + } + + // Horizontal sum over 5-pixel regions of dst + for (i = 0; i < height; ++i) { + a = dst[i * dst_stride]; + b = dst[i * dst_stride + 1]; + c = dst[i * dst_stride + 2]; + d = dst[i * dst_stride + 3]; + e = dst[i * dst_stride + 4]; + + dst[i * dst_stride] = a + b + c; + dst[i * dst_stride + 1] = a + b + c + d; + for (j = 2; j < width - 3; ++j) { + // Loop 
invariant: At the start of each iteration, + // a = src[i * src_stride + (j - 2)] + // b = src[i * src_stride + (j - 1)] + // c = src[i * src_stride + (j )] + // d = src[i * src_stride + (j + 1)] + // e = src[i * src_stride + (j + 2)] + dst[i * dst_stride + j] = a + b + c + d + e; + a = b; + b = c; + c = d; + d = e; + e = dst[i * dst_stride + (j + 3)]; + } + dst[i * dst_stride + j] = a + b + c + d + e; + dst[i * dst_stride + (j + 1)] = b + c + d + e; + dst[i * dst_stride + (j + 2)] = c + d + e; + } +} + +static void boxsum(int32_t *src, int width, int height, int src_stride, int r, + int sqr, int32_t *dst, int dst_stride) { + if (r == 1) + boxsum1(src, width, height, src_stride, sqr, dst, dst_stride); + else if (r == 2) + boxsum2(src, width, height, src_stride, sqr, dst, dst_stride); + else + assert(0 && "Invalid value of r in self-guided filter"); +} + +void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) { + if (params->r[0] == 0) { + xq[0] = 0; + xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1]; + } else if (params->r[1] == 0) { + xq[0] = xqd[0]; + xq[1] = 0; + } else { + xq[0] = xqd[0]; + xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1]; + } +} + +const int32_t av1_x_by_xplus1[256] = { + // Special case: Map 0 -> 1 (corresponding to a value of 1/256) + // instead of 0. 
See comments in selfguided_restoration_internal() for why + 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, + 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, + 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, + 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 256, +}; + +const int32_t av1_one_by_x[MAX_NELEM] = { + 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315, + 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, +}; + +static void calculate_intermediate_result(int32_t *dgd, int width, int height, + int dgd_stride, int bit_depth, + int sgr_params_idx, int radius_idx, + int pass, int32_t *A, int32_t *B) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + // Adjusting the 
stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. + int buf_stride = ((width_ext + 3) & ~3) + 16; + const int step = pass == 0 ? 1 : 2; + int i, j; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 0, B, buf_stride); + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 1, A, buf_stride); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, + // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. + for (i = -1; i < height + 1; i += step) { + for (j = -1; j < width + 1; ++j) { + const int k = i * buf_stride + j; + const int n = (2 * r + 1) * (2 * r + 1); + + // a < 2^16 * n < 2^22 regardless of bit depth + uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8)); + // b < 2^8 * n < 2^14 regardless of bit depth + uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8); + + // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, + // and p itself satisfies p < 2^14 * n^2 < 2^26. + // This bound on p is due to: + // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances + // + // Note: Sometimes, in high bit depth, we can end up with a*n < b*b. + // This is an artefact of rounding, and can only happen if all pixels + // are (almost) identical, so in this case we saturate to p=0. + uint32_t p = (a * n < b * b) ? 
0 : a * n - b * b; + + const uint32_t s = params->s[radius_idx]; + + // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32 + // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12 + // (this holds even after accounting for the rounding in s) + const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS); + + // Note: We have to be quite careful about the value of A[k]. + // This is used as a blend factor between individual pixel values and the + // local mean. So it logically has a range of [0, 256], including both + // endpoints. + // + // This is a pain for hardware, as we'd like something which can be stored + // in exactly 8 bits. + // Further, in the calculation of B[k] below, if z == 0 and r == 2, + // then A[k] "should be" 0. But then we can end up setting B[k] to a value + // slightly above 2^(8 + bit depth), due to rounding in the value of + // av1_one_by_x[25-1]. + // + // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0. + // This fixes the above issues (256 - A[k] fits in a uint8, and we can't + // overflow), without significantly affecting the final result: z == 0 + // implies that the image is essentially "flat", so the local mean and + // individual pixel values are very similar. + // + // Note that saturating on the other side, ie. requring A[k] <= 255, + // would be a bad idea, as that corresponds to the case where the image + // is very variable, when we want to preserve the local pixel value as + // much as possible. + A[k] = av1_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] + + // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n, + // av1_one_by_x[n - 1] = round(2^12 / n) + // => the product here is < 2^(20 + bit_depth) <= 2^32, + // and B[k] is set to a value < 2^(8 + bit depth) + // This holds even with the rounding in av1_one_by_x and in the overall + // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8. 
+ B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) * + (uint32_t)B[k] * + (uint32_t)av1_one_by_x[n - 1], + SGRPROJ_RECIP_BITS); + } + } +} + +static void selfguided_restoration_fast_internal( + int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. + int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 1, A, B); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Use the A[] and B[] arrays to calculate the filtered image + (void)r; + assert(r == 2); + for (i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 5; + const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 5; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } else { // odd row + for (j 
= 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 4; + const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5; + const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } + } +} + +static void selfguided_restoration_internal(int32_t *dgd, int width, int height, + int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, + int sgr_params_idx, + int radius_idx) { + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. + int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 0, A, B); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Use the A[] and B[] arrays to calculate the filtered image + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = + (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) * + 4 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 3; + const int32_t b = + (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) * + 4 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 3; + const int32_t 
v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } +} + +int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { + int32_t dgd32_[RESTORATION_PROC_UNIT_PELS]; + const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ; + int32_t *dgd32 = + dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + + if (highbd) { + const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8); + for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { + for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { + dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j]; + } + } + } else { + for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { + for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { + dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j]; + } + } + } + + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + + if (params->r[0] > 0) + selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride, + flt0, flt_stride, bit_depth, + sgr_params_idx, 0); + if (params->r[1] > 0) + selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1, + flt_stride, bit_depth, sgr_params_idx, 1); + return 0; +} + +int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + + const int ret = av1_selfguided_restoration_c( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + if (ret != 0) return ret; + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + av1_decode_xq(xqd, xq, params); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int k = i * width + j; + uint8_t *dst8ij = dst8 + i * dst_stride + j; + const uint8_t *dat8ij = dat8 + i * stride + j; + + const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij; + const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS; + int32_t v = u << SGRPROJ_PRJ_BITS; + // If params->r == 0 then we skipped the filtering in + // av1_selfguided_restoration_c, i.e. 
flt[k] == u + if (params->r[0] > 0) v += xq[0] * (flt0[k] - u); + if (params->r[1] > 0) v += xq[1] * (flt1[k] - u); + const int16_t w = + (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + const uint16_t out = clip_pixel_highbd(w, bit_depth); + if (highbd) + *CONVERT_TO_SHORTPTR(dst8ij) = out; + else + *dst8ij = (uint8_t)out; + } + } + return 0; +} + +static void sgrproj_filter_stripe(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int32_t *tmpbuf, int bit_depth, + struct aom_internal_error_info *error_info) { + (void)bit_depth; + assert(bit_depth == 8); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, stripe_width - j); + if (av1_apply_selfguided_restoration( + src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, + rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, + 0) != 0) { + aom_internal_error( + error_info, AOM_CODEC_MEM_ERROR, + "Error allocating buffer in av1_apply_selfguided_restoration"); + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void wiener_filter_stripe_highbd( + const RestorationUnitInfo *rui, int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, int bit_depth, + struct aom_internal_error_info *error_info) { + (void)tmpbuf; + (void)error_info; + const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15); + const uint8_t *src8_p = src8 + j; + uint8_t *dst8_p = dst8 + j; + av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride, + rui->wiener_info.hfilter, 16, + rui->wiener_info.vfilter, 16, w, + stripe_height, &conv_params, bit_depth); + } +} + +static void sgrproj_filter_stripe_highbd( + const 
RestorationUnitInfo *rui, int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, int bit_depth, + struct aom_internal_error_info *error_info) { + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, stripe_width - j); + if (av1_apply_selfguided_restoration( + src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, + rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, + 1) != 0) { + aom_internal_error( + error_info, AOM_CODEC_MEM_ERROR, + "Error allocating buffer in av1_apply_selfguided_restoration"); + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int32_t *tmpbuf, int bit_depth, + struct aom_internal_error_info *error_info); + +#if CONFIG_AV1_HIGHBITDEPTH +#define NUM_STRIPE_FILTERS 4 +static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { + wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd, + sgrproj_filter_stripe_highbd +}; +#else +#define NUM_STRIPE_FILTERS 2 +static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { + wiener_filter_stripe, sgrproj_filter_stripe +}; +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Filter one restoration unit +void av1_loop_restoration_filter_unit( + const RestorationTileLimits *limits, const RestorationUnitInfo *rui, + const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs, + int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth, + uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, + int optimized_lr, struct aom_internal_error_info *error_info) { + RestorationType unit_rtype = rui->restoration_type; + + int unit_h = limits->v_end - limits->v_start; + int unit_w = limits->h_end - 
limits->h_start; + uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start; + uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start; + + if (unit_rtype == RESTORE_NONE) { + copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, + highbd); + return; + } + + const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ); + assert(filter_idx < NUM_STRIPE_FILTERS); + const stripe_filter_fun stripe_filter = stripe_filters[filter_idx]; + + const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; + + // Filter the whole image one stripe at a time + RestorationTileLimits remaining_stripes = *limits; + int i = 0; + while (i < unit_h) { + int copy_above, copy_below; + remaining_stripes.v_start = limits->v_start + i; + + get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y, + ©_above, ©_below); + + const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; + + // Work out where this stripe's boundaries are within + // rsb->stripe_boundary_{above,below} + const int frame_stripe = + (remaining_stripes.v_start + runit_offset) / full_stripe_height; + const int rsb_row = RESTORATION_CTX_VERT * frame_stripe; + + // Calculate this stripe's height, based on two rules: + // * The topmost stripe in the frame is 8 luma pixels shorter than usual. + // * We can't extend past the end of the current restoration unit + const int nominal_stripe_height = + full_stripe_height - ((frame_stripe == 0) ? 
runit_offset : 0); + const int h = AOMMIN(nominal_stripe_height, + remaining_stripes.v_end - remaining_stripes.v_start); + + setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd, + h, data8, stride, rlbs, copy_above, + copy_below, optimized_lr); + + stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride, + dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth, + error_info); + + restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h, + data8, stride, copy_above, copy_below, + optimized_lr); + + i += h; + } +} + +static void filter_frame_on_unit(const RestorationTileLimits *limits, + int rest_unit_idx, void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs, + struct aom_internal_error_info *error_info) { + FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv; + const RestorationInfo *rsi = ctxt->rsi; + + av1_loop_restoration_filter_unit( + limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, + ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, + ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8, + ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info); +} + +void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, + YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + int num_planes) { + const SequenceHeader *const seq_params = cm->seq_params; + const int bit_depth = seq_params->bit_depth; + const int highbd = seq_params->use_highbitdepth; + lr_ctxt->dst = &cm->rst_frame; + + const int frame_width = frame->crop_widths[0]; + const int frame_height = frame->crop_heights[0]; + if (aom_realloc_frame_buffer( + lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, + seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, + cm->features.byte_alignment, NULL, NULL, NULL, 0, 0) != AOM_CODEC_OK) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate restoration dst buffer"); + + lr_ctxt->on_rest_unit = 
filter_frame_on_unit; + lr_ctxt->frame = frame; + for (int plane = 0; plane < num_planes; ++plane) { + RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationType rtype = rsi->frame_restoration_type; + rsi->optimized_lr = optimized_lr; + lr_ctxt->ctxt[plane].rsi = rsi; + + if (rtype == RESTORE_NONE) { + continue; + } + + const int is_uv = plane > 0; + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + assert(plane_w == frame->crop_widths[is_uv]); + assert(plane_h == frame->crop_heights[is_uv]); + + av1_extend_frame(frame->buffers[plane], plane_w, plane_h, + frame->strides[is_uv], RESTORATION_BORDER, + RESTORATION_BORDER, highbd); + + FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane]; + lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x; + lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y; + lr_plane_ctxt->plane_w = plane_w; + lr_plane_ctxt->plane_h = plane_h; + lr_plane_ctxt->highbd = highbd; + lr_plane_ctxt->bit_depth = bit_depth; + lr_plane_ctxt->data8 = frame->buffers[plane]; + lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane]; + lr_plane_ctxt->data_stride = frame->strides[is_uv]; + lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv]; + } +} + +void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, + AV1_COMMON *cm, int num_planes) { + typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, + int vstart, int vend); + static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, + aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v }; + assert(num_planes <= 3); + for (int plane = 0; plane < num_planes; ++plane) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane]; + copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0, + lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h); + } +} + +static void 
foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm, + int num_planes) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + for (int plane = 0; plane < num_planes; ++plane) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) { + continue; + } + + av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, + &ctxt[plane], cm->rst_tmpbuf, cm->rlbs); + } +} + +void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + void *lr_ctxt) { + assert(!cm->features.all_lossless); + const int num_planes = av1_num_planes(cm); + + AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; + + av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm, + optimized_lr, num_planes); + + foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes); + + av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes); +} + +void av1_foreach_rest_unit_in_row( + RestorationTileLimits *limits, int plane_w, + rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, + int hnum_rest_units, int vnum_rest_units, int plane, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read, + sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync, + struct aom_internal_error_info *error_info) { + const int ext_size = unit_size * 3 / 2; + int x0 = 0, j = 0; + while (x0 < plane_w) { + int remaining_w = plane_w - x0; + int w = (remaining_w < ext_size) ? 
remaining_w : unit_size; + + limits->h_start = x0; + limits->h_end = x0 + w; + assert(limits->h_end <= plane_w); + + const int unit_idx = row_number * hnum_rest_units + j; + + // No sync for even numbered rows + // For odd numbered rows, Loop Restoration of current block requires the LR + // of top-right and bottom-right blocks to be completed + + // top-right sync + on_sync_read(lr_sync, row_number, j, plane); + if ((row_number + 1) < vnum_rest_units) + // bottom-right sync + on_sync_read(lr_sync, row_number + 2, j, plane); + +#if CONFIG_MULTITHREAD + if (lr_sync && lr_sync->num_workers > 1) { + pthread_mutex_lock(lr_sync->job_mutex); + const bool lr_mt_exit = lr_sync->lr_mt_exit; + pthread_mutex_unlock(lr_sync->job_mutex); + // Exit in case any worker has encountered an error. + if (lr_mt_exit) return; + } +#endif + + on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info); + + on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane); + + x0 += w; + ++j; + } +} + +void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) { + (void)lr_sync; + (void)r; + (void)c; + (void)plane; +} + +void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, + const int sb_cols, int plane) { + (void)lr_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +} + +void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, + rest_unit_visitor_t on_rest_unit, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + const RestorationInfo *rsi = &cm->rst_info[plane]; + const int hnum_rest_units = rsi->horz_units; + const int vnum_rest_units = rsi->vert_units; + const int unit_size = rsi->restoration_unit_size; + + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + const int ext_size = unit_size * 3 / 2; + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + int y0 = 0, i = 0; + while (y0 < plane_h) { + int remaining_h = plane_h - y0; + int h = 
(remaining_h < ext_size) ? remaining_h : unit_size; + + RestorationTileLimits limits; + limits.v_start = y0; + limits.v_end = y0 + h; + assert(limits.v_end <= plane_h); + // Offset upwards to align with the restoration processing stripe + const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; + limits.v_start = AOMMAX(0, limits.v_start - voffset); + if (limits.v_end < plane_h) limits.v_end -= voffset; + + av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size, + hnum_rest_units, vnum_rest_units, plane, priv, + tmpbuf, rlbs, av1_lr_sync_read_dummy, + av1_lr_sync_write_dummy, NULL, cm->error); + + y0 += h; + ++i; + } +} + +int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *rcol0, int *rcol1, int *rrow0, + int *rrow1) { + assert(rcol0 && rcol1 && rrow0 && rrow1); + + if (bsize != cm->seq_params->sb_size) return 0; + + assert(!cm->features.all_lossless); + + const int is_uv = plane > 0; + + // Compute the mi-unit corners of the superblock + const int mi_row0 = mi_row; + const int mi_col0 = mi_col; + const int mi_row1 = mi_row0 + mi_size_high[bsize]; + const int mi_col1 = mi_col0 + mi_size_wide[bsize]; + + const RestorationInfo *rsi = &cm->rst_info[plane]; + const int size = rsi->restoration_unit_size; + const int horz_units = rsi->horz_units; + const int vert_units = rsi->vert_units; + + // The size of an MI-unit on this plane of the image + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + const int mi_size_x = MI_SIZE >> ss_x; + const int mi_size_y = MI_SIZE >> ss_y; + + // Write m for the relative mi column or row, D for the superres denominator + // and N for the superres numerator. 
If u is the upscaled pixel offset then + // we can write the downscaled pixel offset in two ways as: + // + // MI_SIZE * m = (N / D) * u + // + // from which we get u = D * MI_SIZE * m / N + const int mi_to_num_x = av1_superres_scaled(cm) + ? mi_size_x * cm->superres_scale_denominator + : mi_size_x; + const int mi_to_num_y = mi_size_y; + const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size; + const int denom_y = size; + + const int rnd_x = denom_x - 1; + const int rnd_y = denom_y - 1; + + // rcol0/rrow0 should be the first column/row of restoration units that + // doesn't start left of/above mi_col/mi_row. For this calculation, we need + // to round up the division (if the sb starts at runit column 10.1, the first + // matching runit has column index 11) + *rcol0 = (mi_col0 * mi_to_num_x + rnd_x) / denom_x; + *rrow0 = (mi_row0 * mi_to_num_y + rnd_y) / denom_y; + + // rcol1/rrow1 is the equivalent calculation, but for the superblock + // below-right. If we're at the bottom or right of the frame, this restoration + // unit might not exist, in which case we'll clamp accordingly. 
+ *rcol1 = AOMMIN((mi_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units); + *rrow1 = AOMMIN((mi_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units); + + return *rcol0 < *rcol1 && *rrow0 < *rrow1; +} + +// Extend to left and right +static void extend_lines(uint8_t *buf, int width, int height, int stride, + int extend, int use_highbitdepth) { + for (int i = 0; i < height; ++i) { + if (use_highbitdepth) { + uint16_t *buf16 = (uint16_t *)buf; + aom_memset16(buf16 - extend, buf16[0], extend); + aom_memset16(buf16 + width, buf16[width - 1], extend); + } else { + memset(buf - extend, buf[0], extend); + memset(buf + width, buf[width - 1], extend); + } + buf += stride; + } +} + +static void save_deblock_boundary_lines( + const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row, + int stripe, int use_highbd, int is_above, + RestorationStripeBoundaries *boundaries) { + const int is_uv = plane > 0; + const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); + const int src_stride = frame->strides[is_uv] << use_highbd; + const uint8_t *src_rows = src_buf + row * src_stride; + + uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above + : boundaries->stripe_boundary_below; + uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); + const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; + uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; + + // There is a rare case in which a processing stripe can end 1px above the + // crop border. In this case, we do want to use deblocked pixels from below + // the stripe (hence why we ended up in this function), but instead of + // fetching 2 "below" rows we need to fetch one and duplicate it. 
+ // This is equivalent to clamping the sample locations against the crop border + const int lines_to_save = + AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row); + assert(lines_to_save == 1 || lines_to_save == 2); + + int upscaled_width; + int line_bytes; + if (av1_superres_scaled(cm)) { + const int ss_x = is_uv && cm->seq_params->subsampling_x; + upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x; + line_bytes = upscaled_width << use_highbd; + if (use_highbd) + av1_upscale_normative_rows( + cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv], + CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride, + plane, lines_to_save); + else + av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows, + boundaries->stripe_boundary_stride, plane, + lines_to_save); + } else { + upscaled_width = frame->crop_widths[is_uv]; + line_bytes = upscaled_width << use_highbd; + for (int i = 0; i < lines_to_save; i++) { + memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride, + line_bytes); + } + } + // If we only saved one line, then copy it into the second line buffer + if (lines_to_save == 1) + memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes); + + extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, + RESTORATION_EXTRA_HORZ, use_highbd); +} + +static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame, + const AV1_COMMON *cm, int plane, int row, + int stripe, int use_highbd, int is_above, + RestorationStripeBoundaries *boundaries) { + const int is_uv = plane > 0; + const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); + const int src_stride = frame->strides[is_uv] << use_highbd; + const uint8_t *src_rows = src_buf + row * src_stride; + + uint8_t *bdry_buf = is_above ? 
boundaries->stripe_boundary_above + : boundaries->stripe_boundary_below; + uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); + const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; + uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; + const int src_width = frame->crop_widths[is_uv]; + + // At the point where this function is called, we've already applied + // superres. So we don't need to upscale the lines here, we can just + // copy the requested row ('row') directly from the upscaled frame. + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int upscaled_width = av1_superres_scaled(cm) + ? (cm->superres_upscaled_width + ss_x) >> ss_x + : src_width; + const int line_bytes = upscaled_width << use_highbd; + for (int i = 0; i < RESTORATION_CTX_VERT; i++) { + // Copy the line at 'src_rows' into both context lines + memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes); + } + extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, + RESTORATION_EXTRA_HORZ, use_highbd); +} + +static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd, + int plane, AV1_COMMON *cm, int after_cdef) { + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y; + + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries; + + const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y); + + int stripe_idx; + for (stripe_idx = 0;; ++stripe_idx) { + const int rel_y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off); + const int y0 = rel_y0; + if (y0 >= plane_h) break; + + const int rel_y1 = (stripe_idx + 1) * stripe_height - stripe_off; + const int y1 = AOMMIN(rel_y1, plane_h); + + // Extend using CDEF pixels at the 
top and bottom of the frame, + // and deblocked pixels at internal stripe boundaries + const int use_deblock_above = (stripe_idx > 0); + const int use_deblock_below = (y1 < plane_height); + + if (!after_cdef) { + // Save deblocked context at internal stripe boundaries + if (use_deblock_above) { + save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT, + stripe_idx, use_highbd, 1, boundaries); + } + if (use_deblock_below) { + save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx, + use_highbd, 0, boundaries); + } + } else { + // Save CDEF context at frame boundaries + if (!use_deblock_above) { + save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd, + 1, boundaries); + } + if (!use_deblock_below) { + save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx, + use_highbd, 0, boundaries); + } + } + } +} + +// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan +// lines to be used as boundary in the loop restoration process. The lines +// are saved in cm->rst_info[plane].boundaries.stripe_boundary_{above,below} +void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int after_cdef) { + const int num_planes = av1_num_planes(cm); + const int use_highbd = cm->seq_params->use_highbitdepth; + for (int p = 0; p < num_planes; ++p) { + save_boundary_lines(frame, use_highbd, p, cm, after_cdef); + } +} diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h new file mode 100644 index 0000000000..644e06980f --- /dev/null +++ b/third_party/aom/av1/common/restoration.h @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_RESTORATION_H_ +#define AOM_AV1_COMMON_RESTORATION_H_ + +#include "aom_ports/mem.h" +#include "config/aom_config.h" + +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! @file */ + +/*!\cond */ + +// Border for Loop restoration buffer +#define AOM_RESTORATION_FRAME_BORDER 32 +#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x)) +#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5)) + +#define RESTORATION_PROC_UNIT_SIZE 64 + +// Filter stripe grid offset upwards compared to the superblock grid +#define RESTORATION_UNIT_OFFSET 8 + +#define SGRPROJ_BORDER_VERT 3 // Vertical border used for Sgr +#define SGRPROJ_BORDER_HORZ 3 // Horizontal border used for Sgr + +#define WIENER_BORDER_VERT 2 // Vertical border used for Wiener +#define WIENER_HALFWIN 3 +#define WIENER_BORDER_HORZ (WIENER_HALFWIN) // Horizontal border for Wiener + +// RESTORATION_BORDER_VERT determines line buffer requirement for LR. +// Should be set at the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT. +// Note the line buffer needed is twice the value of this macro. +#if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT +#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT) +#else +#define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT) +#endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT + +#if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ +#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ) +#else +#define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ) +#endif // SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ + +// How many border pixels do we need for each processing unit? +#define RESTORATION_BORDER 3 + +// How many rows of deblocked pixels do we save above/below each processing +// stripe? 
+#define RESTORATION_CTX_VERT 2 + +// Additional pixels to the left and right in above/below buffers +// It is RESTORATION_BORDER_HORZ rounded up to get nicer buffer alignment +#define RESTORATION_EXTRA_HORZ 4 + +// Pad up to 20 more (may be much less is needed) +#define RESTORATION_PADDING 20 +#define RESTORATION_PROC_UNIT_PELS \ + ((RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_HORZ * 2 + \ + RESTORATION_PADDING) * \ + (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \ + RESTORATION_PADDING)) + +#define RESTORATION_UNITSIZE_MAX 256 +#define RESTORATION_UNITPELS_HORZ_MAX \ + (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) +#define RESTORATION_UNITPELS_VERT_MAX \ + ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \ + RESTORATION_UNIT_OFFSET)) +#define RESTORATION_UNITPELS_MAX \ + (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX) + +// Two 32-bit buffers needed for the restored versions from two filters +// TODO(debargha, rupert): Refactor to not need the large tilesize to be stored +// on the decoder side. 
+#define SGRPROJ_TMPBUF_SIZE (RESTORATION_UNITPELS_MAX * 2 * sizeof(int32_t)) + +#define SGRPROJ_EXTBUF_SIZE (0) +#define SGRPROJ_PARAMS_BITS 4 +#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS) + +// Precision bits for projection +#define SGRPROJ_PRJ_BITS 7 +// Restoration precision bits generated higher than source before projection +#define SGRPROJ_RST_BITS 4 +// Internal precision bits for core selfguided_restoration +#define SGRPROJ_SGR_BITS 8 +#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS) + +#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4) +#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1) +#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4) +#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1) + +#define SGRPROJ_PRJ_SUBEXP_K 4 + +#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS) + +#define MAX_RADIUS 2 // Only 1, 2, 3 allowed +#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1)) +#define SGRPROJ_MTABLE_BITS 20 +#define SGRPROJ_RECIP_BITS 12 + +#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1) +#define WIENER_WIN (2 * WIENER_HALFWIN + 1) +#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN)) +#define WIENER_TMPBUF_SIZE (0) +#define WIENER_EXTBUF_SIZE (0) + +// If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for +// chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN. 
+#define WIENER_WIN_CHROMA (WIENER_WIN - 2) +#define WIENER_WIN_REDUCED (WIENER_WIN - 2) +#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA)) +#define WIENER_STATS_DOWNSAMPLE_FACTOR 4 + +#define WIENER_FILT_PREC_BITS 7 +#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS) + +// Central values for the taps +#define WIENER_FILT_TAP0_MIDV (3) +#define WIENER_FILT_TAP1_MIDV (-7) +#define WIENER_FILT_TAP2_MIDV (15) +#define WIENER_FILT_TAP3_MIDV \ + (WIENER_FILT_STEP - 2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \ + WIENER_FILT_TAP2_MIDV)) + +#define WIENER_FILT_TAP0_BITS 4 +#define WIENER_FILT_TAP1_BITS 5 +#define WIENER_FILT_TAP2_BITS 6 + +#define WIENER_FILT_BITS \ + ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2) + +#define WIENER_FILT_TAP0_MINV \ + (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2) +#define WIENER_FILT_TAP1_MINV \ + (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2) +#define WIENER_FILT_TAP2_MINV \ + (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2) + +#define WIENER_FILT_TAP0_MAXV \ + (WIENER_FILT_TAP0_MIDV - 1 + (1 << WIENER_FILT_TAP0_BITS) / 2) +#define WIENER_FILT_TAP1_MAXV \ + (WIENER_FILT_TAP1_MIDV - 1 + (1 << WIENER_FILT_TAP1_BITS) / 2) +#define WIENER_FILT_TAP2_MAXV \ + (WIENER_FILT_TAP2_MIDV - 1 + (1 << WIENER_FILT_TAP2_BITS) / 2) + +#define WIENER_FILT_TAP0_SUBEXP_K 1 +#define WIENER_FILT_TAP1_SUBEXP_K 2 +#define WIENER_FILT_TAP2_SUBEXP_K 3 + +// Max of SGRPROJ_TMPBUF_SIZE, DOMAINTXFMRF_TMPBUF_SIZE, WIENER_TMPBUF_SIZE +#define RESTORATION_TMPBUF_SIZE (SGRPROJ_TMPBUF_SIZE) + +// Max of SGRPROJ_EXTBUF_SIZE, WIENER_EXTBUF_SIZE +#define RESTORATION_EXTBUF_SIZE (WIENER_EXTBUF_SIZE) + +// Check the assumptions of the existing code +#if SUBPEL_TAPS != WIENER_WIN + 1 +#error "Wiener filter currently only works if SUBPEL_TAPS == WIENER_WIN + 1" +#endif +#if WIENER_FILT_PREC_BITS != 7 +#error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7" +#endif 
+ +typedef struct { + int r[2]; // radii + int s[2]; // sgr parameters for r[0] and r[1], based on GenSgrprojVtable() +} sgr_params_type; +/*!\endcond */ + +/*!\brief Parameters related to Restoration Unit Info */ +typedef struct { + /*! + * restoration type + */ + RestorationType restoration_type; + + /*! + * Wiener filter parameters if restoration_type indicates Wiener + */ + WienerInfo wiener_info; + + /*! + * Sgrproj filter parameters if restoration_type indicates Sgrproj + */ + SgrprojInfo sgrproj_info; +} RestorationUnitInfo; + +/*!\cond */ + +// Width of one row of a restoration line buffer: the widest restoration unit +// (RESTORATION_UNITSIZE_MAX * 3 / 2) plus RESTORATION_EXTRA_HORZ on each side. +#define RESTORATION_LINEBUFFER_WIDTH \ + (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ) + +typedef struct { + // Temporary buffers to save/restore 3 lines above/below the restoration + // stripe. + uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; + uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; +} RestorationLineBuffers; +/*!\endcond */ + +/*!\brief Parameters related to Restoration Stripe boundaries */ +typedef struct { + /*! + * stripe boundary above + */ + uint8_t *stripe_boundary_above; + + /*! + * stripe boundary below + */ + uint8_t *stripe_boundary_below; + + /*! + * strides for stripe boundaries above and below + */ + int stripe_boundary_stride; + + /*! + * size of stripe boundaries above and below + */ + int stripe_boundary_size; +} RestorationStripeBoundaries; + +/*!\brief Parameters related to Restoration Info */ +typedef struct { + /*! + * Restoration type for frame + */ + RestorationType frame_restoration_type; + + /*! + * Restoration unit size + */ + int restoration_unit_size; + + /** + * \name Fields allocated and initialised by av1_alloc_restoration_struct. + */ + /**@{*/ + /*! + * Total number of restoration units in this plane + */ + int num_rest_units; + + /*! 
+ * Number of vertical restoration units in this plane + */ + int vert_units; + + /*! + * Number of horizontal restoration units in this plane + */ + int horz_units; + /**@}*/ + + /*! + * Parameters for each restoration unit in this plane + */ + RestorationUnitInfo *unit_info; + + /*! + * Restoration Stripe boundary info + */ + RestorationStripeBoundaries boundaries; + + /*! + * Whether optimized lr can be used for speed. + * That includes cases of no cdef and no superres, or if fast trial runs + * are used on the encoder side. + */ + int optimized_lr; +} RestorationInfo; + +/*!\cond */ + +static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) { + sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2; + sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2; +} + +static INLINE void set_default_wiener(WienerInfo *wiener_info) { + wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV; + wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV; + wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV; + wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] = + -2 * + (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV); + wiener_info->vfilter[4] = wiener_info->hfilter[4] = WIENER_FILT_TAP2_MIDV; + wiener_info->vfilter[5] = wiener_info->hfilter[5] = WIENER_FILT_TAP1_MIDV; + wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV; +} + +typedef struct { + int h_start, h_end, v_start, v_end; +} RestorationTileLimits; + +typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs, + struct aom_internal_error_info *error_info); + +typedef struct FilterFrameCtxt { + const RestorationInfo *rsi; + int ss_x, ss_y; + int plane_w, plane_h; + int highbd, bit_depth; + uint8_t *data8, *dst8; + int data_stride, dst_stride; +} FilterFrameCtxt; 
+ +typedef struct AV1LrStruct { + rest_unit_visitor_t on_rest_unit; + FilterFrameCtxt ctxt[MAX_MB_PLANE]; + YV12_BUFFER_CONFIG *frame; + YV12_BUFFER_CONFIG *dst; +} AV1LrStruct; + +extern const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS]; +extern int sgrproj_mtable[SGRPROJ_PARAMS][2]; +extern const int32_t av1_x_by_xplus1[256]; +extern const int32_t av1_one_by_x[MAX_NELEM]; + +void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi, + int is_uv); +void av1_free_restoration_struct(RestorationInfo *rst_info); + +void av1_extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert, int highbd); +void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params); + +/*!\endcond */ + +/*!\brief Function for applying loop restoration filter to a single unit. + * + * \ingroup in_loop_restoration + * This function applies the loop restoration filter to a single + * loop restoration unit. + * + * \param[in] limits Limits of the unit + * \param[in] rui The parameters to use for this unit and its + * coefficients + * \param[in] rsb Deblocked pixels to use for stripe boundaries + * \param[in] rlbs Space to use as a scratch buffer + * \param[in] ss_x Horizontal subsampling for plane + * \param[in] ss_y Vertical subsampling for plane + * \param[in] plane_w Width of the current plane + * \param[in] plane_h Height of the current plane + * \param[in] highbd Whether high bitdepth pipeline is used + * \param[in] bit_depth Bit-depth of the video + * \param[in] data8 Frame data (pointing at the top-left corner of + * the frame, not the restoration unit). + * \param[in] stride Stride of \c data8 + * \param[out] dst8 Buffer where the results will be written. Like + * \c data8, \c dst8 should point at the top-left + * corner of the frame + * \param[in] dst_stride Stride of \c dst8 + * \param[in] tmpbuf Scratch buffer used by the sgrproj filter + * which should be at least SGRPROJ_TMPBUF_SIZE + * big. 
+ * \param[in] optimized_lr Whether to use fast optimized Loop Restoration + * \param[in,out] error_info Error info for reporting errors + * + * \remark Nothing is returned. Instead, the filtered unit is output in + * \c dst8 at the proper restoration unit offset. + */ +void av1_loop_restoration_filter_unit( + const RestorationTileLimits *limits, const RestorationUnitInfo *rui, + const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs, + int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth, + uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, + int optimized_lr, struct aom_internal_error_info *error_info); + +/*!\brief Function for applying loop restoration filter to a frame + * + * \ingroup in_loop_restoration + * This function applies the loop restoration filter to a frame. + * + * \param[in,out] frame Compressed frame buffer + * \param[in,out] cm Pointer to top level common structure + * \param[in] optimized_lr Whether to use fast optimized Loop Restoration + * \param[in] lr_ctxt Loop restoration context + * + * \remark Nothing is returned. Instead, the filtered frame is output in + * \c frame. + */ +void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, int optimized_lr, + void *lr_ctxt); +/*!\cond */ + +void av1_loop_restoration_precal(void); + +struct AV1LrSyncData; + +typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane); + +typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c, + const int sb_cols, int plane); + +// Call on_rest_unit for each loop restoration unit in the plane. +void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, + rest_unit_visitor_t on_rest_unit, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs); + +// Return 1 iff the block at mi_row, mi_col with size bsize is a +// top-level superblock containing the top-left corner of at least one +// loop restoration unit. 
+// +// If the block is a top-level superblock, the function writes to +// *rcol0, *rcol1, *rrow0, *rrow1. This means that the parameters for all +// restoration units in the rectangle [*rcol0, *rcol1) x [*rrow0, *rrow1) +// are signaled in this superblock. +int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *rcol0, int *rcol1, int *rrow0, + int *rrow1); + +void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int after_cdef); +void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, + YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int optimized_lr, int num_planes); +void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, + struct AV1Common *cm, int num_planes); +void av1_foreach_rest_unit_in_row( + RestorationTileLimits *limits, int plane_w, + rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, + int hnum_rest_units, int vnum_rest_units, int plane, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read, + sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync, + struct aom_internal_error_info *error_info); + +void av1_get_upsampled_plane_size(const struct AV1Common *cm, int is_uv, + int *plane_w, int *plane_h); +int av1_lr_count_units(int unit_size, int plane_size); +void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane); +void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, + const int sb_cols, int plane); + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_RESTORATION_H_ diff --git a/third_party/aom/av1/common/scale.c b/third_party/aom/av1/common/scale.c new file mode 100644 index 0000000000..d7c6a24378 --- /dev/null +++ b/third_party/aom/av1/common/scale.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/filter.h" +#include "av1/common/scale.h" +#include "aom_dsp/aom_filter.h" + +static int get_fixed_point_scale_factor(int other_size, int this_size) { + // Calculate scaling factor once for each reference frame + // and use fixed point scaling factors in decoding and encoding routines. + // Hardware implementations can calculate scale factor in device driver + // and use multiplication and shifting on hardware instead of division. + return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size; +} + +// Given the fixed point scale, calculate coarse point scale. +static int fixed_point_scale_to_coarse_point_scale(int scale_fp) { + return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS); +} + +// Note: x and y are integer precision, mvq4 is q4 precision. 
+MV32 av1_scale_mv(const MV *mvq4, int x, int y, + const struct scale_factors *sf) { + const int x_off_q4 = av1_scaled_x(x << SUBPEL_BITS, sf); + const int y_off_q4 = av1_scaled_y(y << SUBPEL_BITS, sf); + const MV32 res = { + av1_scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4, + av1_scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4 + }; + return res; +} + +void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h) { + if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) { + sf->x_scale_fp = REF_INVALID_SCALE; + sf->y_scale_fp = REF_INVALID_SCALE; + return; + } + + sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + + sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp); + sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp); +} diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h new file mode 100644 index 0000000000..d8481bfc2c --- /dev/null +++ b/third_party/aom/av1/common/scale.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_SCALE_H_ +#define AOM_AV1_COMMON_SCALE_H_ + +#include "av1/common/convolve.h" +#include "av1/common/mv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define SCALE_NUMERATOR 8 + +#define REF_SCALE_SHIFT 14 +#define REF_NO_SCALE (1 << REF_SCALE_SHIFT) +#define REF_INVALID_SCALE -1 + +struct scale_factors { + int x_scale_fp; // horizontal fixed point scale factor + int y_scale_fp; // vertical fixed point scale factor + int x_step_q4; + int y_step_q4; +}; + +// Note: Expect val to be in q4 precision +static INLINE int av1_scaled_x(int val, const struct scale_factors *sf) { + const int off = + (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); + const int64_t tval = (int64_t)val * sf->x_scale_fp + off; + return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, + REF_SCALE_SHIFT - SCALE_EXTRA_BITS); +} + +// Note: Expect val to be in q4 precision +static INLINE int av1_scaled_y(int val, const struct scale_factors *sf) { + const int off = + (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); + const int64_t tval = (int64_t)val * sf->y_scale_fp + off; + return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, + REF_SCALE_SHIFT - SCALE_EXTRA_BITS); +} + +// Note: Expect val to be in q4 precision +static INLINE int av1_unscaled_value(int val, const struct scale_factors *sf) { + (void)sf; + return val * (1 << SCALE_EXTRA_BITS); +} + +MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); + +void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h); + +static INLINE int av1_is_valid_scale(const struct scale_factors *sf) { + assert(sf != NULL); + return sf->x_scale_fp != REF_INVALID_SCALE && + sf->y_scale_fp != REF_INVALID_SCALE; +} + +static INLINE int av1_is_scaled(const struct scale_factors *sf) { + assert(sf != NULL); + return av1_is_valid_scale(sf) && + (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE); +} + +// See AV1 
spec, Section 6.8.6. Frame size with refs semantics. +static INLINE int valid_ref_frame_size(int ref_width, int ref_height, + int this_width, int this_height) { + return 2 * this_width >= ref_width && 2 * this_height >= ref_height && + this_width <= 16 * ref_width && this_height <= 16 * ref_height; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_SCALE_H_ diff --git a/third_party/aom/av1/common/scan.c b/third_party/aom/av1/common/scan.c new file mode 100644 index 0000000000..0943579db1 --- /dev/null +++ b/third_party/aom/av1/common/scan.c @@ -0,0 +1,2038 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "av1/common/common_data.h" +#include "av1/common/scan.h" + +DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = { + 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = { + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = { + 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19, + 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = { + 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29, + 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_4x16[64]) = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35, + 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39, + 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43, + 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, +}; + +DECLARE_ALIGNED(16, 
static const int16_t, default_scan_16x4[64]) = { + 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30, + 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46, + 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16[64]) = { + 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, + 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, + 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, + 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x4[64]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, + 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x16[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x4[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226, + 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10, + 228, 197, 166, 135, 104, 73, 42, 11, 229, 
198, 167, 136, 105, 74, 43, + 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, + 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109, + 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142, + 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175, + 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208, + 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, + 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, + 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58, + 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91, + 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124, + 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126, + 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x8[256]) = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, + 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125, + 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, + 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143, + 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, + 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209, + 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, + 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 
206, 213, 220, 227, + 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, + 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32[256]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225, + 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227, + 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229, + 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231, + 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233, + 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235, + 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237, + 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239, + 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241, + 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243, + 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245, + 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247, + 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249, + 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251, + 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253, + 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x8[256]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, + 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, + 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, + 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, + 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 
43, 51, 59, 67, + 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, + 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52, + 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, + 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22, + 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7, + 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, + 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x32[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 
253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x8[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = { + 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, + 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35, + 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30, + 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const 
int16_t, mrow_scan_8x8[64]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, + 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, + 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, + 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71, + 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73, + 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75, + 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, + 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110, + 117, 124, 111, 118, 125, 119, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 
97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x8[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x16[128]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, + 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, + 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, + 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, + 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, + 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, + 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, + 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x8[128]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, + 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, + 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, + 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, + 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, + 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, + 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 
103, 111, 119, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, + 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, + 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, + 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, + 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, + 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, + 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46, + 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, + 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, + 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142, + 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, + 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, + 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238, + 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270, + 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, + 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334, + 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, + 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, + 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, + 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, + 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494, + 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, + 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, + 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, + 
62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, + 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, + 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500, + 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408, + 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285, + 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411, + 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, + 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, + 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = { + 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, + 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, + 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, + 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, + 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, + 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, + 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, + 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, + 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, + 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232, + 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, + 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, + 249, 264, 279, 294, 
309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, + 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, + 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, + 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, + 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, + 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464, + 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, + 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466, + 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, + 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483, + 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, + 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396, + 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, + 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444, + 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, + 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 
158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x16[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 
28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 
431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x32[512]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, + 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, + 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, + 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, + 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, + 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, + 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, + 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, + 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, + 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, + 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, + 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, + 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, + 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, + 19, 51, 83, 115, 
147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, + 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, + 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, + 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, + 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, + 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, + 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, + 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, + 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, + 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, + 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, + 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, + 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, + 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, + 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131, + 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, + 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100, + 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, + 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69, + 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, + 325, 341, 357, 373, 389, 405, 421, 
437, 453, 469, 485, 501, 6, 22, 38, + 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, + 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7, + 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, + 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, + 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, + 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, + 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, + 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, + 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, + 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123, + 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, + 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92, + 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, + 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61, + 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, + 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30, + 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, + 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, + 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, + 495, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = { + 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, + 80, 65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, + 82, 67, 52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, + 144, 129, 114, 99, 84, 69, 54, 39, 24, 9, 10, 25, 40, 55, 70, + 85, 100, 115, 130, 145, 160, 176, 161, 146, 131, 116, 101, 86, 71, 56, 
+ 41, 26, 11, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, + 192, 208, 193, 178, 163, 148, 133, 118, 103, 88, 73, 58, 43, 28, 13, + 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, + 240, 225, 210, 195, 180, 165, 150, 135, 120, 105, 90, 75, 60, 45, 30, + 15, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, + 241, 242, 227, 212, 197, 182, 167, 152, 137, 122, 107, 92, 77, 62, 47, + 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 244, 229, + 214, 199, 184, 169, 154, 139, 124, 109, 94, 79, 95, 110, 125, 140, 155, + 170, 185, 200, 215, 230, 245, 246, 231, 216, 201, 186, 171, 156, 141, 126, + 111, 127, 142, 157, 172, 187, 202, 217, 232, 247, 248, 233, 218, 203, 188, + 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190, + 175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 
223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = {}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = {}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = { + 0, 2, 3, 9, 1, 4, 8, 10, 5, 7, 11, 14, 6, 12, 13, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, 
av1_mrow_iscan_4x4[16]) = { + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = { + 0, 2, 5, 9, 13, 17, 21, 25, 1, 4, 8, 12, 16, 20, 24, 28, + 3, 7, 11, 15, 19, 23, 27, 30, 6, 10, 14, 18, 22, 26, 29, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x8[32]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29, + 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = { + 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, + 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 29, 25, 28, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x16[64]) = { + 0, 2, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, + 1, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 62, + 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 61, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x4[64]) = { + 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, + 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 30, 25, 28, 31, 34, + 29, 32, 35, 38, 33, 36, 39, 42, 37, 40, 43, 46, 41, 44, 47, 50, + 45, 48, 51, 54, 49, 52, 55, 58, 53, 56, 59, 61, 57, 60, 62, 63, +}; + +DECLARE_ALIGNED(16, static const 
int16_t, av1_mrow_iscan_4x16[64]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, + 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x4[64]) = { + 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, + 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, + 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, + 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x16[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x4[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, + 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, + 219, 227, 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, + 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, + 201, 209, 217, 225, 233, 240, 6, 11, 17, 24, 32, 40, 48, 56, 64, + 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 239, 245, 10, 16, 23, 31, 39, 47, 55, + 63, 71, 
79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, + 183, 191, 199, 207, 215, 223, 231, 238, 244, 249, 15, 22, 30, 38, 46, + 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, + 174, 182, 190, 198, 206, 214, 222, 230, 237, 243, 248, 252, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 236, 242, 247, 251, 254, 28, + 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, + 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 235, 241, 246, 250, 253, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x8[256]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, + 36, 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, + 45, 52, 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, + 54, 61, 68, 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, + 63, 70, 77, 84, 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, + 72, 79, 86, 93, 100, 59, 66, 73, 80, 87, 94, 101, 108, 67, 74, + 81, 88, 95, 102, 109, 116, 75, 82, 89, 96, 103, 110, 117, 124, 83, + 90, 97, 104, 111, 118, 125, 132, 91, 98, 105, 112, 119, 126, 133, 140, + 99, 106, 113, 120, 127, 134, 141, 148, 107, 114, 121, 128, 135, 142, 149, + 156, 115, 122, 129, 136, 143, 150, 157, 164, 123, 130, 137, 144, 151, 158, + 165, 172, 131, 138, 145, 152, 159, 166, 173, 180, 139, 146, 153, 160, 167, + 174, 181, 188, 147, 154, 161, 168, 175, 182, 189, 196, 155, 162, 169, 176, + 183, 190, 197, 204, 163, 170, 177, 184, 191, 198, 205, 212, 171, 178, 185, + 192, 199, 206, 213, 220, 179, 186, 193, 200, 207, 214, 221, 228, 187, 194, + 201, 208, 215, 222, 229, 235, 195, 202, 209, 216, 223, 230, 236, 241, 203, + 210, 217, 224, 231, 237, 242, 246, 211, 218, 225, 232, 238, 243, 247, 250, + 219, 226, 233, 239, 244, 248, 251, 253, 227, 234, 240, 245, 249, 252, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x32[256]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 
104, 112, + 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, + 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, + 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, + 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67, + 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, + 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52, + 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, + 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22, + 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7, + 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, + 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x8[256]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225, + 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227, + 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229, + 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231, + 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233, + 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235, + 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237, + 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239, + 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241, + 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243, + 20, 52, 84, 116, 
148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245, + 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247, + 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249, + 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251, + 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253, + 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x32[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x8[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 
50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8[64]) = { + 0, 2, 3, 9, 10, 20, 21, 35, 1, 4, 8, 11, 19, 22, 34, 36, + 5, 7, 12, 18, 23, 33, 37, 48, 6, 13, 17, 24, 32, 38, 47, 49, + 14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58, + 27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 
63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, + 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, + 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 112, + 6, 11, 17, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 111, 117, + 10, 16, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 110, 116, 121, + 15, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 109, 115, 120, 124, + 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 108, 114, 119, 123, 126, + 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 107, 113, 118, 122, 125, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36, + 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52, + 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, 54, 61, 68, + 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, 63, 70, 77, 84, + 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, 72, 79, 86, 93, 100, + 59, 66, 73, 80, 87, 94, 101, 107, 67, 74, 81, 88, 95, 102, 108, 113, + 75, 82, 89, 96, 103, 109, 114, 118, 83, 90, 97, 104, 110, 115, 119, 122, + 91, 98, 105, 111, 116, 120, 123, 125, 99, 106, 112, 117, 121, 124, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x8[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x16[128]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, + 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, + 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, + 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, + 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, + 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, + 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x8[128]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, + 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, + 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, + 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, + 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, + 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, + 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, + 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90, 104, 119, + 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359, + 375, 391, 
1, 4, 8, 13, 19, 26, 34, 43, 53, 64, 76, 89, 103, + 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, + 358, 374, 390, 406, 3, 7, 12, 18, 25, 33, 42, 52, 63, 75, 88, + 102, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, + 341, 357, 373, 389, 405, 420, 6, 11, 17, 24, 32, 41, 51, 62, 74, + 87, 101, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, + 324, 340, 356, 372, 388, 404, 419, 433, 10, 16, 23, 31, 40, 50, 61, + 73, 86, 100, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, + 307, 323, 339, 355, 371, 387, 403, 418, 432, 445, 15, 22, 30, 39, 49, + 60, 72, 85, 99, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274, + 290, 306, 322, 338, 354, 370, 386, 402, 417, 431, 444, 456, 21, 29, 38, + 48, 59, 71, 84, 98, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, + 273, 289, 305, 321, 337, 353, 369, 385, 401, 416, 430, 443, 455, 466, 28, + 37, 47, 58, 70, 83, 97, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 415, 429, 442, 454, 465, + 475, 36, 46, 57, 69, 82, 96, 111, 127, 143, 159, 175, 191, 207, 223, + 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 414, 428, 441, 453, + 464, 474, 483, 45, 56, 68, 81, 95, 110, 126, 142, 158, 174, 190, 206, + 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 413, 427, 440, + 452, 463, 473, 482, 490, 55, 67, 80, 94, 109, 125, 141, 157, 173, 189, + 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 412, 426, + 439, 451, 462, 472, 481, 489, 496, 66, 79, 93, 108, 124, 140, 156, 172, + 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 411, + 425, 438, 450, 461, 471, 480, 488, 495, 501, 78, 92, 107, 123, 139, 155, + 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395, + 410, 424, 437, 449, 460, 470, 479, 487, 494, 500, 505, 91, 106, 122, 138, + 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, + 394, 409, 423, 436, 
448, 459, 469, 478, 486, 493, 499, 504, 508, 105, 121, + 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, + 377, 393, 408, 422, 435, 447, 458, 468, 477, 485, 492, 498, 503, 507, 510, + 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, + 360, 376, 392, 407, 421, 434, 446, 457, 467, 476, 484, 491, 497, 502, 506, + 509, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, + 120, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, 106, + 121, 136, 5, 8, 12, 17, 23, 30, 38, 47, 57, 68, 80, 93, 107, + 122, 137, 152, 9, 13, 18, 24, 31, 39, 48, 58, 69, 81, 94, 108, + 123, 138, 153, 168, 14, 19, 25, 32, 40, 49, 59, 70, 82, 95, 109, + 124, 139, 154, 169, 184, 20, 26, 33, 41, 50, 60, 71, 83, 96, 110, + 125, 140, 155, 170, 185, 200, 27, 34, 42, 51, 61, 72, 84, 97, 111, + 126, 141, 156, 171, 186, 201, 216, 35, 43, 52, 62, 73, 85, 98, 112, + 127, 142, 157, 172, 187, 202, 217, 232, 44, 53, 63, 74, 86, 99, 113, + 128, 143, 158, 173, 188, 203, 218, 233, 248, 54, 64, 75, 87, 100, 114, + 129, 144, 159, 174, 189, 204, 219, 234, 249, 264, 65, 76, 88, 101, 115, + 130, 145, 160, 175, 190, 205, 220, 235, 250, 265, 280, 77, 89, 102, 116, + 131, 146, 161, 176, 191, 206, 221, 236, 251, 266, 281, 296, 90, 103, 117, + 132, 147, 162, 177, 192, 207, 222, 237, 252, 267, 282, 297, 312, 104, 118, + 133, 148, 163, 178, 193, 208, 223, 238, 253, 268, 283, 298, 313, 328, 119, + 134, 149, 164, 179, 194, 209, 224, 239, 254, 269, 284, 299, 314, 329, 344, + 135, 150, 165, 180, 195, 210, 225, 240, 255, 270, 285, 300, 315, 330, 345, + 360, 151, 166, 181, 196, 211, 226, 241, 256, 271, 286, 301, 316, 331, 346, + 361, 376, 167, 182, 197, 212, 227, 242, 257, 272, 287, 302, 317, 332, 347, + 362, 377, 392, 183, 198, 213, 228, 243, 258, 273, 288, 303, 318, 333, 348, + 363, 378, 393, 407, 199, 214, 229, 244, 259, 274, 289, 304, 319, 334, 349, + 364, 379, 394, 408, 421, 215, 
230, 245, 260, 275, 290, 305, 320, 335, 350, + 365, 380, 395, 409, 422, 434, 231, 246, 261, 276, 291, 306, 321, 336, 351, + 366, 381, 396, 410, 423, 435, 446, 247, 262, 277, 292, 307, 322, 337, 352, + 367, 382, 397, 411, 424, 436, 447, 457, 263, 278, 293, 308, 323, 338, 353, + 368, 383, 398, 412, 425, 437, 448, 458, 467, 279, 294, 309, 324, 339, 354, + 369, 384, 399, 413, 426, 438, 449, 459, 468, 476, 295, 310, 325, 340, 355, + 370, 385, 400, 414, 427, 439, 450, 460, 469, 477, 484, 311, 326, 341, 356, + 371, 386, 401, 415, 428, 440, 451, 461, 470, 478, 485, 491, 327, 342, 357, + 372, 387, 402, 416, 429, 441, 452, 462, 471, 479, 486, 492, 497, 343, 358, + 373, 388, 403, 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 359, + 374, 389, 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506, + 375, 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507, + 509, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 
204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x16[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 
84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 
476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x32[512]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, + 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, + 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, + 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, + 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, + 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131, + 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, + 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100, + 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, + 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69, + 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, + 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38, + 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, + 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7, + 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, + 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, + 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, + 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, + 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, + 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, + 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, + 410, 426, 442, 458, 474, 490, 506, 11, 
27, 43, 59, 75, 91, 107, 123, + 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, + 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92, + 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, + 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61, + 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, + 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30, + 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, + 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, + 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, + 495, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, + 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, + 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, + 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, + 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, + 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, + 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, + 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, + 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, + 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, + 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, + 14, 46, 78, 110, 142, 174, 206, 238, 
270, 302, 334, 366, 398, 430, 462, 494, + 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, + 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, + 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, + 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, + 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, + 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, + 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, + 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, + 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, + 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, + 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, + 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, + 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 
125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, +}; + +DECLARE_ALIGNED(16, static const 
int16_t, av1_default_iscan_16x16[256]) = { + 0, 2, 3, 9, 10, 20, 21, 35, 36, 54, 55, 77, 78, 104, 105, + 135, 1, 4, 8, 11, 19, 22, 34, 37, 53, 56, 76, 79, 103, 106, + 134, 136, 5, 7, 12, 18, 23, 33, 38, 52, 57, 75, 80, 102, 107, + 133, 137, 164, 6, 13, 17, 24, 32, 39, 51, 58, 74, 81, 101, 108, + 132, 138, 163, 165, 14, 16, 25, 31, 40, 50, 59, 73, 82, 100, 109, + 131, 139, 162, 166, 189, 15, 26, 30, 41, 49, 60, 72, 83, 99, 110, + 130, 140, 161, 167, 188, 190, 27, 29, 42, 48, 61, 71, 84, 98, 111, + 129, 141, 160, 168, 187, 191, 210, 28, 43, 47, 62, 70, 85, 97, 112, + 128, 142, 159, 169, 186, 192, 209, 211, 44, 46, 63, 69, 86, 96, 113, + 127, 143, 158, 170, 185, 193, 208, 212, 227, 45, 64, 68, 87, 95, 114, + 126, 144, 157, 171, 184, 194, 207, 213, 226, 228, 65, 67, 88, 94, 115, + 125, 145, 156, 172, 183, 195, 206, 214, 225, 229, 240, 66, 89, 93, 116, + 124, 146, 155, 173, 182, 196, 205, 215, 224, 230, 239, 241, 90, 92, 117, + 123, 147, 154, 174, 181, 197, 204, 216, 223, 231, 238, 242, 249, 91, 118, + 122, 148, 153, 175, 180, 198, 203, 217, 222, 232, 237, 243, 248, 250, 119, + 121, 149, 152, 176, 179, 199, 202, 218, 221, 233, 236, 244, 247, 251, 254, + 120, 150, 151, 177, 178, 200, 201, 219, 220, 234, 235, 245, 246, 252, 253, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = {}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = {}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = {}; + +const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = { + { + // TX_4X4 + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { 
default_scan_4x4, av1_default_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + }, + { + // TX_8X8 + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + }, + { + // TX_16X16 + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + }, + { + // TX_32X32 + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, 
av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_64X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_4X8 + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, 
av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + }, + { + // TX_8X4 + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + }, + { + // TX_8X16 + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + }, + { + // TX_16X8 + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { 
default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + }, + { + // TX_16X32 + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + }, + { + // TX_32X16 + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, 
av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + }, + { + // TX_32X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_64X32 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
+ { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_4X16 + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + }, + { + // TX_16X4 + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, 
av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + }, + { + // TX_8X32 + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + }, + { + // TX_32X8 + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + }, + { + // TX_16X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. 
So tx32's scan order is used. + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + }, + { + // TX_64X16 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + }, +}; diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h new file mode 100644 index 0000000000..4f369786f2 --- /dev/null +++ b/third_party/aom/av1/common/scan.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SCAN_H_
+#define AOM_AV1_COMMON_SCAN_H_
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NEIGHBORS 2
+
+// Identifiers for the coefficient scan patterns. SCAN_MODES is the last
+// enumerator and therefore equals the number of patterns listed before it.
+enum {
+  SCAN_MODE_ZIG_ZAG,
+  SCAN_MODE_COL_DIAG,
+  SCAN_MODE_ROW_DIAG,
+  SCAN_MODE_COL_1D,
+  SCAN_MODE_ROW_1D,
+  SCAN_MODES
+} UENUM1BYTE(SCAN_MODE);
+
+// Scan-order lookup table, indexed as [tx_size][tx_type]. In scan.c each
+// entry is initialized with a scan table and the matching iscan table.
+extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
+
+void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+// Returns the address of the av1_scan_orders entry for the given transform
+// size and transform type. The indices are used as-is: no range checking is
+// performed here, so callers must pass valid enumerators.
+static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size,
+                                                 TX_TYPE tx_type) {
+  const SCAN_ORDER *const order = &av1_scan_orders[tx_size][tx_type];
+  return order;
+}
+
+// Returns the scan order for the given transform size and type. This yields
+// exactly the same table entry as get_default_scan().
+static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
+  return &av1_scan_orders[tx_size][tx_type];
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_SCAN_H_
diff --git a/third_party/aom/av1/common/seg_common.c b/third_party/aom/av1/common/seg_common.c
new file mode 100644
index 0000000000..60b185161c
--- /dev/null
+++ b/third_party/aom/av1/common/seg_common.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/quant_common.h"
+
+// Per-feature flag, indexed by SEG_LVL_FEATURES: non-zero when the feature's
+// data may be negative. av1_set_segdata() asserts this flag before accepting
+// a negative value. Entries 0-4 are the quantizer and the four loop-filter
+// features; entries 5-7 (reference frame, skip, global MV) are unsigned.
+static const int seg_feature_data_signed[SEG_LVL_MAX] = {
+  1, 1, 1, 1, 1, 0, 0, 0
+};
+
+// Per-feature magnitude limit, indexed by SEG_LVL_FEATURES. av1_set_segdata()
+// asserts that the absolute value of the data does not exceed this limit.
+static const int seg_feature_data_max[SEG_LVL_MAX] = {
+  MAXQ,             // SEG_LVL_ALT_Q
+  MAX_LOOP_FILTER,  // SEG_LVL_ALT_LF_Y_V
+  MAX_LOOP_FILTER,  // SEG_LVL_ALT_LF_Y_H
+  MAX_LOOP_FILTER,  // SEG_LVL_ALT_LF_U
+  MAX_LOOP_FILTER,  // SEG_LVL_ALT_LF_V
+  7,                // SEG_LVL_REF_FRAME
+  0,                // SEG_LVL_SKIP
+  0                 // SEG_LVL_GLOBALMV
+};
+
+// These functions provide access to new segment level features.
+// Eventually these functions may be "optimized out" but for the moment,
+// the coding mechanism is still subject to change so these provide a
+// convenient single point of change.
+
+// Resets the feature data and feature masks of every segment via av1_zero()
+// (presumably a zero-fill macro; its definition is not visible here). The
+// other members of *seg are left untouched.
+void av1_clearall_segfeatures(struct segmentation *seg) {
+  av1_zero(seg->feature_data);
+  av1_zero(seg->feature_mask);
+}
+
+// Recomputes the two summary fields of *seg from the feature masks:
+//  - last_active_segid: index of the highest-numbered segment that has at
+//    least one feature bit set (0 when no segment has any);
+//  - segid_preskip: 1 when any segment enables a feature whose id is
+//    SEG_LVL_REF_FRAME or greater, otherwise 0.
+void av1_calculate_segdata(struct segmentation *seg) {
+  seg->segid_preskip = 0;
+  seg->last_active_segid = 0;
+  for (int i = 0; i < MAX_SEGMENTS; i++) {
+    for (int j = 0; j < SEG_LVL_MAX; j++) {
+      if (seg->feature_mask[i] & (1 << j)) {
+        seg->segid_preskip |= (j >= SEG_LVL_REF_FRAME);
+        // Segments are visited in ascending order, so the last write wins
+        // and records the highest active segment id.
+        seg->last_active_segid = i;
+      }
+    }
+  }
+}
+
+// Marks feature_id as enabled for segment_id by setting bit feature_id in
+// that segment's feature mask. segment_id is not range-checked here.
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+                           SEG_LVL_FEATURES feature_id) {
+  seg->feature_mask[segment_id] |= 1 << feature_id;
+}
+
+// Returns the largest magnitude accepted for feature_id's data.
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_max[feature_id];
+}
+
+// Returns non-zero when feature_id's data may be negative.
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_signed[feature_id];
+}
+
+// The 'seg_data' given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF)
+// Valid range for delta values is (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF)
+//
+// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
+// the absolute values given).
+// The range is only checked by assert() against seg_feature_data_max[].
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+                     SEG_LVL_FEATURES feature_id, int seg_data) {
+  if (seg_data < 0) {
+    assert(seg_feature_data_signed[feature_id]);
+    assert(-seg_data <= seg_feature_data_max[feature_id]);
+  } else {
+    assert(seg_data <= seg_feature_data_max[feature_id]);
+  }
+
+  seg->feature_data[segment_id][feature_id] = seg_data;
+}
+
+// TBD? Functions to read and write segment data with range / validity checking
diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h
new file mode 100644
index 0000000000..44b508b146
--- /dev/null
+++ b/third_party/aom/av1/common/seg_common.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SEG_COMMON_H_
+#define AOM_AV1_COMMON_SEG_COMMON_H_
+
+#include <string.h>  // memset() in set_segment_id()
+
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_SEGMENTS 8
+#define SEG_TREE_PROBS (MAX_SEGMENTS - 1)
+
+#define SEG_TEMPORAL_PRED_CTXS 3
+#define SPATIAL_PREDICTION_PROBS 3
+
+// Segment-level features. A feature's enumerator is both its bit position in
+// feature_mask[] and its second index into feature_data[].
+enum {
+  SEG_LVL_ALT_Q,       // Use alternate Quantizer ....
+  SEG_LVL_ALT_LF_Y_V,  // Use alternate loop filter value on y plane vertical
+  SEG_LVL_ALT_LF_Y_H,  // Use alternate loop filter value on y plane horizontal
+  SEG_LVL_ALT_LF_U,    // Use alternate loop filter value on u plane
+  SEG_LVL_ALT_LF_V,    // Use alternate loop filter value on v plane
+  SEG_LVL_REF_FRAME,   // Optional Segment reference frame
+  SEG_LVL_SKIP,        // Optional Segment (0,0) + skip mode
+  SEG_LVL_GLOBALMV,
+  SEG_LVL_MAX  // Number of features; sizes the per-feature arrays.
+} UENUM1BYTE(SEG_LVL_FEATURES);
+
+// Segmentation flags plus the per-segment feature table.
+struct segmentation {
+  uint8_t enabled;  // When 0, segfeature_active() reports every feature off.
+  uint8_t update_map;
+  uint8_t update_data;
+  uint8_t temporal_update;
+
+  // Value of each feature for each segment.
+  int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
+  // Per segment: bit (1 << feature) is set when that feature is enabled.
+  unsigned int feature_mask[MAX_SEGMENTS];
+  int last_active_segid;  // The highest numbered segment id that has some
+                          // enabled feature.
+  uint8_t segid_preskip;  // Whether the segment id will be read before the
+                          // skip syntax element.
+                          // 1: the segment id will be read first.
+                          // 0: the skip syntax element will be read first.
+};
+
+// CDF tables, presumably for coding segment ids; not used in this header.
+struct segmentation_probs {
+  aom_cdf_prob pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)];
+  aom_cdf_prob spatial_pred_seg_cdf[SPATIAL_PREDICTION_PROBS]
+                                   [CDF_SIZE(MAX_SEGMENTS)];
+};
+
+// Returns nonzero iff segmentation is enabled and feature_id is enabled for
+// segment_id.
+static INLINE int segfeature_active(const struct segmentation *seg,
+                                    uint8_t segment_id,
+                                    SEG_LVL_FEATURES feature_id) {
+  return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
+}
+
+// Copies the feature masks, feature data, segid_preskip and last_active_segid
+// from src to dst. The enabled/update_*/temporal_update flags are not copied.
+static INLINE void segfeatures_copy(struct segmentation *dst,
+                                    const struct segmentation *src) {
+  int i, j;
+  for (i = 0; i < MAX_SEGMENTS; i++) {
+    dst->feature_mask[i] = src->feature_mask[i];
+    for (j = 0; j < SEG_LVL_MAX; j++) {
+      dst->feature_data[i][j] = src->feature_data[i][j];
+    }
+  }
+  dst->segid_preskip = src->segid_preskip;
+  dst->last_active_segid = src->last_active_segid;
+}
+
+// Zeroes the feature data and feature masks of every segment.
+void av1_clearall_segfeatures(struct segmentation *seg);
+
+// Enables feature_id for segment_id by setting its bit in feature_mask.
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+                           SEG_LVL_FEATURES feature_id);
+
+// Derives segid_preskip and last_active_segid from the feature masks.
+void av1_calculate_segdata(struct segmentation *seg);
+
+// Returns the largest magnitude av1_set_segdata() accepts for feature_id.
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
+
+// Returns 1 if data for feature_id may be negative, 0 otherwise.
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
+
+// Stores seg_data as the value of feature_id for segment_id. The range is
+// only checked with assert(), and the feature is not enabled by this call.
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+                     SEG_LVL_FEATURES feature_id, int seg_data);
+
+// Returns the stored value of feature_id for segment_id. Does not check
+// whether the feature is enabled.
+static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
+                              SEG_LVL_FEATURES feature_id) {
+  return seg->feature_data[segment_id][feature_id];
+}
+
+// Fills an x_mis by y_mis rectangle of segment_ids with segment_id. The
+// rectangle starts at mi_offset and its rows are mi_stride entries apart.
+static AOM_INLINE void set_segment_id(uint8_t *segment_ids, int mi_offset,
+                                      int x_mis, int y_mis, int mi_stride,
+                                      uint8_t segment_id) {
+  segment_ids += mi_offset;
+  for (int y = 0; y < y_mis; ++y) {
+    memset(&segment_ids[y * mi_stride], segment_id,
+           x_mis * sizeof(segment_ids[0]));
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_SEG_COMMON_H_
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
new file mode 100644
index 0000000000..45695147ff
--- /dev/null
+++ b/third_party/aom/av1/common/thread_common.c
@@ -0,0 +1,1250 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_image.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+// Set up nsync by width.
+static INLINE int get_sync_range(int width) { + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +} + +static INLINE int get_lr_sync_range(int width) { +#if 0 + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +#else + (void)width; + return 1; +#endif +} + +// Allocate memory for lf row synchronization +void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, + int width, int num_workers) { + lf_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i, j; + + for (j = 0; j < MAX_MB_PLANE; j++) { + CHECK_MEM_ERROR(cm, lf_sync->mutex_[j], + aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows)); + if (lf_sync->mutex_[j]) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex_[j][i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lf_sync->cond_[j], + aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows)); + if (lf_sync->cond_[j]) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->cond_[j][i], NULL); + } + } + } + + CHECK_MEM_ERROR(cm, lf_sync->job_mutex, + aom_malloc(sizeof(*(lf_sync->job_mutex)))); + if (lf_sync->job_mutex) { + pthread_mutex_init(lf_sync->job_mutex, NULL); + } + } +#endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, lf_sync->lfdata, + aom_malloc(num_workers * sizeof(*(lf_sync->lfdata)))); + lf_sync->num_workers = num_workers; + + for (int j = 0; j < MAX_MB_PLANE; j++) { + CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j], + aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows)); + } + CHECK_MEM_ERROR( + cm, lf_sync->job_queue, + aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2)); + // Set up nsync. 
+ lf_sync->sync_range = get_sync_range(width); +} + +// Deallocate lf synchronization related mutex and data +void av1_loop_filter_dealloc(AV1LfSync *lf_sync) { + if (lf_sync != NULL) { + int j; +#if CONFIG_MULTITHREAD + int i; + for (j = 0; j < MAX_MB_PLANE; j++) { + if (lf_sync->mutex_[j] != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex_[j][i]); + } + aom_free(lf_sync->mutex_[j]); + } + if (lf_sync->cond_[j] != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->cond_[j][i]); + } + aom_free(lf_sync->cond_[j]); + } + } + if (lf_sync->job_mutex != NULL) { + pthread_mutex_destroy(lf_sync->job_mutex); + aom_free(lf_sync->job_mutex); + } +#endif // CONFIG_MULTITHREAD + aom_free(lf_sync->lfdata); + for (j = 0; j < MAX_MB_PLANE; j++) { + aom_free(lf_sync->cur_sb_col[j]); + } + + aom_free(lf_sync->job_queue); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. 
+ av1_zero(*lf_sync); + } +} + +void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync, + int num_workers) { + if (num_workers < 1) return; +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, cdef_sync->mutex_, + aom_malloc(sizeof(*(cdef_sync->mutex_)))); + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + } +#else + (void)cm; + (void)cdef_sync; +#endif // CONFIG_MULTITHREAD +} + +void av1_free_cdef_sync(AV1CdefSync *cdef_sync) { + if (cdef_sync == NULL) return; +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_ != NULL) { + pthread_mutex_destroy(cdef_sync->mutex_); + aom_free(cdef_sync->mutex_); + } +#endif // CONFIG_MULTITHREAD +} + +static INLINE void cdef_row_mt_sync_read(AV1CdefSync *const cdef_sync, + int row) { + if (!row) return; +#if CONFIG_MULTITHREAD + AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt; + pthread_mutex_lock(cdef_row_mt[row - 1].row_mutex_); + while (cdef_row_mt[row - 1].is_row_done != 1) + pthread_cond_wait(cdef_row_mt[row - 1].row_cond_, + cdef_row_mt[row - 1].row_mutex_); + cdef_row_mt[row - 1].is_row_done = 0; + pthread_mutex_unlock(cdef_row_mt[row - 1].row_mutex_); +#else + (void)cdef_sync; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void cdef_row_mt_sync_write(AV1CdefSync *const cdef_sync, + int row) { +#if CONFIG_MULTITHREAD + AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt; + pthread_mutex_lock(cdef_row_mt[row].row_mutex_); + pthread_cond_signal(cdef_row_mt[row].row_cond_); + cdef_row_mt[row].is_row_done = 1; + pthread_mutex_unlock(cdef_row_mt[row].row_mutex_); +#else + (void)cdef_sync; + (void)row; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c, + int plane) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1]; + pthread_mutex_lock(mutex); + + while (c > 
lf_sync->cur_sb_col[plane][r - 1] - nsync) { + pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)lf_sync; + (void)r; + (void)c; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, + const int sb_cols, int plane) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. + int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&lf_sync->mutex_[plane][r]); + + // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // cur_sb_col[plane][r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. + lf_sync->cur_sb_col[plane][r] = AOMMAX(lf_sync->cur_sb_col[plane][r], cur); + + pthread_cond_broadcast(&lf_sync->cond_[plane][r]); + pthread_mutex_unlock(&lf_sync->mutex_[plane][r]); + } +#else + (void)lf_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +// One job of row loopfiltering. +void av1_thread_loop_filter_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, + struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane, + int dir, int lpf_opt_level, AV1LfSync *const lf_sync, + struct aom_internal_error_info *error_info, + AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, + int num_mis_in_lpf_unit_height_log2) { + // TODO(aomedia:3276): Pass error_info to the low-level functions as required + // in future to handle error propagation. 
+ (void)error_info; + const int sb_cols = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2); + const int r = mi_row >> num_mis_in_lpf_unit_height_log2; + int mi_col, c; + + const bool joint_filter_chroma = (lpf_opt_level == 2) && plane > AOM_PLANE_Y; + const int num_planes = joint_filter_chroma ? 2 : 1; + assert(IMPLIES(joint_filter_chroma, plane == AOM_PLANE_U)); + + if (dir == 0) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) { + c = mi_col >> MAX_MIB_SIZE_LOG2; + + av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer, + mi_row, mi_col, plane, plane + num_planes); + if (lpf_opt_level) { + if (plane == AOM_PLANE_Y) { + av1_filter_block_plane_vert_opt(cm, xd, &planes[plane], mi_row, + mi_col, params_buf, tx_buf, + num_mis_in_lpf_unit_height_log2); + } else { + av1_filter_block_plane_vert_opt_chroma( + cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane, + joint_filter_chroma, num_mis_in_lpf_unit_height_log2); + } + } else { + av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row, + mi_col); + } + if (lf_sync != NULL) { + sync_write(lf_sync, r, c, sb_cols, plane); + } + } + } else if (dir == 1) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) { + c = mi_col >> MAX_MIB_SIZE_LOG2; + + if (lf_sync != NULL) { + // Wait for vertical edge filtering of the top-right block to be + // completed + sync_read(lf_sync, r, c, plane); + + // Wait for vertical edge filtering of the right block to be completed + sync_read(lf_sync, r + 1, c, plane); + } + +#if CONFIG_MULTITHREAD + if (lf_sync && lf_sync->num_workers > 1) { + pthread_mutex_lock(lf_sync->job_mutex); + const bool lf_mt_exit = lf_sync->lf_mt_exit; + pthread_mutex_unlock(lf_sync->job_mutex); + // Exit in case any worker has encountered an error. 
+ if (lf_mt_exit) return; + } +#endif + + av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer, + mi_row, mi_col, plane, plane + num_planes); + if (lpf_opt_level) { + if (plane == AOM_PLANE_Y) { + av1_filter_block_plane_horz_opt(cm, xd, &planes[plane], mi_row, + mi_col, params_buf, tx_buf, + num_mis_in_lpf_unit_height_log2); + } else { + av1_filter_block_plane_horz_opt_chroma( + cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane, + joint_filter_chroma, num_mis_in_lpf_unit_height_log2); + } + } else { + av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row, + mi_col); + } + } + } +} + +void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync, + int num_mis_in_lpf_unit_height_log2) { + int plane, sb_row; + const int sb_cols = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, num_mis_in_lpf_unit_height_log2); + const int sb_rows = + CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2); + + // In case of loopfilter row-multithreading, the worker on an SB row waits for + // the vertical edge filtering of the right and top-right SBs. Hence, in case + // a thread (main/worker) encounters an error, update that vertical + // loopfiltering of every SB row in the frame is complete in order to avoid + // dependent workers waiting indefinitely. + for (sb_row = 0; sb_row < sb_rows; ++sb_row) + for (plane = 0; plane < MAX_MB_PLANE; ++plane) + sync_write(lf_sync, sb_row, sb_cols - 1, sb_cols, plane); +} + +static AOM_INLINE void sync_lf_workers(AVxWorker *const workers, + AV1_COMMON *const cm, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int had_error = workers[0].had_error; + struct aom_internal_error_info error_info; + + // Read the error_info of main thread. + if (had_error) { + AVxWorker *const worker = &workers[0]; + error_info = ((LFWorkerData *)worker->data2)->error_info; + } + + // Wait till all rows are finished. 
+ for (int i = num_workers - 1; i > 0; --i) { + AVxWorker *const worker = &workers[i]; + if (!winterface->sync(worker)) { + had_error = 1; + error_info = ((LFWorkerData *)worker->data2)->error_info; + } + } + if (had_error) aom_internal_error_copy(cm->error, &error_info); +} + +// Row-based multi-threaded loopfilter hook +static int loop_filter_row_worker(void *arg1, void *arg2) { + AV1LfSync *const lf_sync = (AV1LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; + AV1LfMTInfo *cur_job_info; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex_ = lf_sync->job_mutex; +#endif + + struct aom_internal_error_info *const error_info = &lf_data->error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(job_mutex_); + lf_sync->lf_mt_exit = true; + pthread_mutex_unlock(job_mutex_); +#endif + av1_set_vert_loop_filter_done(lf_data->cm, lf_sync, MAX_MIB_SIZE_LOG2); + return 0; + } + error_info->setjmp = 1; + + while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) { + const int lpf_opt_level = cur_job_info->lpf_opt_level; + av1_thread_loop_filter_rows( + lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd, + cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir, + lpf_opt_level, lf_sync, error_info, lf_data->params_buf, + lf_data->tx_buf, MAX_MIB_SIZE_LOG2); + } + error_info->setjmp = 0; + return 1; +} + +static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, int start, int stop, + const int planes_to_lf[MAX_MB_PLANE], + AVxWorker *workers, int num_workers, + AV1LfSync *lf_sync, int lpf_opt_level) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int i; + loop_filter_frame_mt_init(cm, start, stop, planes_to_lf, num_workers, lf_sync, + 
lpf_opt_level, MAX_MIB_SIZE_LOG2); + + // Set up loopfilter thread data. + for (i = num_workers - 1; i >= 0; --i) { + AVxWorker *const worker = &workers[i]; + LFWorkerData *const lf_data = &lf_sync->lfdata[i]; + + worker->hook = loop_filter_row_worker; + worker->data1 = lf_sync; + worker->data2 = lf_data; + + // Loopfilter data + loop_filter_data_reset(lf_data, frame, cm, xd); + + // Start loopfiltering + worker->had_error = 0; + if (i == 0) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + sync_lf_workers(workers, cm, num_workers); +} + +static void loop_filter_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, int start, int stop, + const int planes_to_lf[MAX_MB_PLANE], + int lpf_opt_level) { + // Filter top rows of all planes first, in case the output can be partially + // reconstructed row by row. + int mi_row, plane, dir; + + AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE]; + TX_SIZE tx_buf[MAX_MIB_SIZE]; + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) { + continue; + } + + for (dir = 0; dir < 2; ++dir) { + av1_thread_loop_filter_rows(frame, cm, xd->plane, xd, mi_row, plane, + dir, lpf_opt_level, /*lf_sync=*/NULL, + xd->error_info, params_buf, tx_buf, + MAX_MIB_SIZE_LOG2); + } + } + } +} + +void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, int plane_start, int plane_end, + int partial_frame, AVxWorker *workers, + int num_workers, AV1LfSync *lf_sync, + int lpf_opt_level) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + int planes_to_lf[MAX_MB_PLANE]; + + if (!check_planes_to_loop_filter(&cm->lf, planes_to_lf, plane_start, + plane_end)) + return; + + start_mi_row = 0; + mi_rows_to_filter = cm->mi_params.mi_rows; + if (partial_frame && cm->mi_params.mi_rows > 8) { + start_mi_row = cm->mi_params.mi_rows >> 1; + start_mi_row &= 
0xfffffff8; + mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + av1_loop_filter_frame_init(cm, plane_start, plane_end); + + if (num_workers > 1) { + // Enqueue and execute loopfiltering jobs. + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf, + workers, num_workers, lf_sync, lpf_opt_level); + } else { + // Directly filter in the main thread. + loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf, + lpf_opt_level); + } +} + +static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) { +#if CONFIG_MULTITHREAD + AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; + const int nsync = loop_res_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1]; + pthread_mutex_lock(mutex); + + while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) { + pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)lr_sync; + (void)r; + (void)c; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void lr_sync_write(void *const lr_sync, int r, int c, + const int sb_cols, int plane) { +#if CONFIG_MULTITHREAD + AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; + const int nsync = loop_res_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. + int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]); + + // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // cur_sb_col[plane][r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. 
+ loop_res_sync->cur_sb_col[plane][r] = + AOMMAX(loop_res_sync->cur_sb_col[plane][r], cur); + + pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]); + pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]); + } +#else + (void)lr_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +// Allocate memory for loop restoration row synchronization +void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm, + int num_workers, int num_rows_lr, + int num_planes, int width) { + lr_sync->rows = num_rows_lr; + lr_sync->num_planes = num_planes; +#if CONFIG_MULTITHREAD + { + int i, j; + + for (j = 0; j < num_planes; j++) { + CHECK_MEM_ERROR(cm, lr_sync->mutex_[j], + aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr)); + if (lr_sync->mutex_[j]) { + for (i = 0; i < num_rows_lr; ++i) { + pthread_mutex_init(&lr_sync->mutex_[j][i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lr_sync->cond_[j], + aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr)); + if (lr_sync->cond_[j]) { + for (i = 0; i < num_rows_lr; ++i) { + pthread_cond_init(&lr_sync->cond_[j][i], NULL); + } + } + } + + CHECK_MEM_ERROR(cm, lr_sync->job_mutex, + aom_malloc(sizeof(*(lr_sync->job_mutex)))); + if (lr_sync->job_mutex) { + pthread_mutex_init(lr_sync->job_mutex, NULL); + } + } +#endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata, + aom_calloc(num_workers, sizeof(*(lr_sync->lrworkerdata)))); + lr_sync->num_workers = num_workers; + + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + if (worker_idx < num_workers - 1) { + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf, + (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE)); + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs, + aom_malloc(sizeof(RestorationLineBuffers))); + + } else { + lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf; + lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs; + } + } + + for (int j = 0; j < num_planes; j++) 
{ + CHECK_MEM_ERROR( + cm, lr_sync->cur_sb_col[j], + aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr)); + } + CHECK_MEM_ERROR( + cm, lr_sync->job_queue, + aom_malloc(sizeof(*(lr_sync->job_queue)) * num_rows_lr * num_planes)); + // Set up nsync. + lr_sync->sync_range = get_lr_sync_range(width); +} + +// Deallocate loop restoration synchronization related mutex and data +void av1_loop_restoration_dealloc(AV1LrSync *lr_sync) { + if (lr_sync != NULL) { + int j; +#if CONFIG_MULTITHREAD + int i; + for (j = 0; j < MAX_MB_PLANE; j++) { + if (lr_sync->mutex_[j] != NULL) { + for (i = 0; i < lr_sync->rows; ++i) { + pthread_mutex_destroy(&lr_sync->mutex_[j][i]); + } + aom_free(lr_sync->mutex_[j]); + } + if (lr_sync->cond_[j] != NULL) { + for (i = 0; i < lr_sync->rows; ++i) { + pthread_cond_destroy(&lr_sync->cond_[j][i]); + } + aom_free(lr_sync->cond_[j]); + } + } + if (lr_sync->job_mutex != NULL) { + pthread_mutex_destroy(lr_sync->job_mutex); + aom_free(lr_sync->job_mutex); + } +#endif // CONFIG_MULTITHREAD + for (j = 0; j < MAX_MB_PLANE; j++) { + aom_free(lr_sync->cur_sb_col[j]); + } + + aom_free(lr_sync->job_queue); + + if (lr_sync->lrworkerdata) { + for (int worker_idx = 0; worker_idx < lr_sync->num_workers - 1; + worker_idx++) { + LRWorkerData *const workerdata_data = + lr_sync->lrworkerdata + worker_idx; + + aom_free(workerdata_data->rst_tmpbuf); + aom_free(workerdata_data->rlbs); + } + aom_free(lr_sync->lrworkerdata); + } + + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. 
+ av1_zero(*lr_sync); + } +} + +static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt, + AV1_COMMON *cm) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + const int num_planes = av1_num_planes(cm); + AV1LrMTInfo *lr_job_queue = lr_sync->job_queue; + int32_t lr_job_counter[2], num_even_lr_jobs = 0; + lr_sync->jobs_enqueued = 0; + lr_sync->jobs_dequeued = 0; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + num_even_lr_jobs = + num_even_lr_jobs + ((ctxt[plane].rsi->vert_units + 1) >> 1); + } + lr_job_counter[0] = 0; + lr_job_counter[1] = num_even_lr_jobs; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + const int unit_size = ctxt[plane].rsi->restoration_unit_size; + const int plane_h = ctxt[plane].plane_h; + const int ext_size = unit_size * 3 / 2; + + int y0 = 0, i = 0; + while (y0 < plane_h) { + int remaining_h = plane_h - y0; + int h = (remaining_h < ext_size) ? 
remaining_h : unit_size; + + RestorationTileLimits limits; + limits.v_start = y0; + limits.v_end = y0 + h; + assert(limits.v_end <= plane_h); + // Offset upwards to align with the restoration processing stripe + const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; + limits.v_start = AOMMAX(0, limits.v_start - voffset); + if (limits.v_end < plane_h) limits.v_end -= voffset; + + assert(lr_job_counter[0] <= num_even_lr_jobs); + + lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i; + lr_job_queue[lr_job_counter[i & 1]].plane = plane; + lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start; + lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end; + lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1; + if ((i & 1) == 0) { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + limits.v_start + RESTORATION_BORDER; + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + limits.v_end - RESTORATION_BORDER; + if (i == 0) { + assert(limits.v_start == 0); + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = 0; + } + if (i == (ctxt[plane].rsi->vert_units - 1)) { + assert(limits.v_end == plane_h); + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = plane_h; + } + } else { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + AOMMAX(limits.v_start - RESTORATION_BORDER, 0); + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + AOMMIN(limits.v_end + RESTORATION_BORDER, plane_h); + } + lr_job_counter[i & 1]++; + lr_sync->jobs_enqueued++; + + y0 += h; + ++i; + } + } +} + +static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { + AV1LrMTInfo *cur_job_info = NULL; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lr_sync->job_mutex); + + if (!lr_sync->lr_mt_exit && lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) { + cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued; + lr_sync->jobs_dequeued++; + } + + pthread_mutex_unlock(lr_sync->job_mutex); +#else + (void)lr_sync; +#endif + + return cur_job_info; +} + +static void set_loop_restoration_done(AV1LrSync *const lr_sync, + 
FilterFrameCtxt *const ctxt) { + for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + if (ctxt[plane].rsi->frame_restoration_type == RESTORE_NONE) continue; + int y0 = 0, row_number = 0; + const int unit_size = ctxt[plane].rsi->restoration_unit_size; + const int plane_h = ctxt[plane].plane_h; + const int ext_size = unit_size * 3 / 2; + const int hnum_rest_units = ctxt[plane].rsi->horz_units; + while (y0 < plane_h) { + const int remaining_h = plane_h - y0; + const int h = (remaining_h < ext_size) ? remaining_h : unit_size; + lr_sync_write(lr_sync, row_number, hnum_rest_units - 1, hnum_rest_units, + plane); + y0 += h; + ++row_number; + } + } +} + +// Implement row loop restoration for each thread. +static int loop_restoration_row_worker(void *arg1, void *arg2) { + AV1LrSync *const lr_sync = (AV1LrSync *)arg1; + LRWorkerData *lrworkerdata = (LRWorkerData *)arg2; + AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt; + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + int lr_unit_row; + int plane; + int plane_w; +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex_ = lr_sync->job_mutex; +#endif + struct aom_internal_error_info *const error_info = &lrworkerdata->error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(job_mutex_); + lr_sync->lr_mt_exit = true; + pthread_mutex_unlock(job_mutex_); +#endif + // In case of loop restoration multithreading, the worker on an even lr + // block row waits for the completion of the filtering of the top-right and + // bottom-right blocks. Hence, in case a thread (main/worker) encounters an + // error, update that filtering of every row in the frame is complete in + // order to avoid the dependent workers from waiting indefinitely. 
+ set_loop_restoration_done(lr_sync, lr_ctxt->ctxt); + return 0; + } + error_info->setjmp = 1; + + typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, + int vstart, int vend); + static const copy_fun copy_funs[MAX_MB_PLANE] = { + aom_yv12_partial_coloc_copy_y, aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v + }; + + while (1) { + AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync); + if (cur_job_info != NULL) { + RestorationTileLimits limits; + sync_read_fn_t on_sync_read; + sync_write_fn_t on_sync_write; + limits.v_start = cur_job_info->v_start; + limits.v_end = cur_job_info->v_end; + lr_unit_row = cur_job_info->lr_unit_row; + plane = cur_job_info->plane; + plane_w = ctxt[plane].plane_w; + + // sync_mode == 1 implies only sync read is required in LR Multi-threading + // sync_mode == 0 implies only sync write is required. + on_sync_read = + cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy; + on_sync_write = cur_job_info->sync_mode == 0 ? 
lr_sync_write + : av1_lr_sync_write_dummy; + + av1_foreach_rest_unit_in_row( + &limits, plane_w, lr_ctxt->on_rest_unit, lr_unit_row, + ctxt[plane].rsi->restoration_unit_size, ctxt[plane].rsi->horz_units, + ctxt[plane].rsi->vert_units, plane, &ctxt[plane], + lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read, + on_sync_write, lr_sync, error_info); + + copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, 0, plane_w, + cur_job_info->v_copy_start, cur_job_info->v_copy_end); + + if (lrworkerdata->do_extend_border) { + aom_extend_frame_borders_plane_row(lr_ctxt->frame, plane, + cur_job_info->v_copy_start, + cur_job_info->v_copy_end); + } + } else { + break; + } + } + error_info->setjmp = 0; + return 1; +} + +static AOM_INLINE void sync_lr_workers(AVxWorker *const workers, + AV1_COMMON *const cm, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int had_error = workers[0].had_error; + struct aom_internal_error_info error_info; + + // Read the error_info of main thread. + if (had_error) { + AVxWorker *const worker = &workers[0]; + error_info = ((LRWorkerData *)worker->data2)->error_info; + } + + // Wait till all rows are finished. 
+ for (int i = num_workers - 1; i > 0; --i) { + AVxWorker *const worker = &workers[i]; + if (!winterface->sync(worker)) { + had_error = 1; + error_info = ((LRWorkerData *)worker->data2)->error_info; + } + } + if (had_error) aom_internal_error_copy(cm->error, &error_info); +} + +static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, + AVxWorker *workers, int num_workers, + AV1LrSync *lr_sync, AV1_COMMON *cm, + int do_extend_border) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + const int num_planes = av1_num_planes(cm); + + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int num_rows_lr = 0; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + + const int plane_h = ctxt[plane].plane_h; + const int unit_size = cm->rst_info[plane].restoration_unit_size; + + num_rows_lr = AOMMAX(num_rows_lr, av1_lr_count_units(unit_size, plane_h)); + } + + int i; + assert(MAX_MB_PLANE == 3); + + if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows || + num_workers > lr_sync->num_workers || num_planes > lr_sync->num_planes) { + av1_loop_restoration_dealloc(lr_sync); + av1_loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, + num_planes, cm->width); + } + lr_sync->lr_mt_exit = false; + + // Initialize cur_sb_col to -1 for all SB rows. + for (i = 0; i < num_planes; i++) { + memset(lr_sync->cur_sb_col[i], -1, + sizeof(*(lr_sync->cur_sb_col[i])) * num_rows_lr); + } + + enqueue_lr_jobs(lr_sync, lr_ctxt, cm); + + // Set up looprestoration thread data. 
+ for (i = num_workers - 1; i >= 0; --i) { + AVxWorker *const worker = &workers[i]; + lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt; + lr_sync->lrworkerdata[i].do_extend_border = do_extend_border; + worker->hook = loop_restoration_row_worker; + worker->data1 = lr_sync; + worker->data2 = &lr_sync->lrworkerdata[i]; + + // Start loop restoration + worker->had_error = 0; + if (i == 0) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + sync_lr_workers(workers, cm, num_workers); +} + +void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + AVxWorker *workers, int num_workers, + AV1LrSync *lr_sync, void *lr_ctxt, + int do_extend_border) { + assert(!cm->features.all_lossless); + + const int num_planes = av1_num_planes(cm); + + AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; + + av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm, + optimized_lr, num_planes); + + foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync, + cm, do_extend_border); +} + +// Initializes cdef_sync parameters. +static AOM_INLINE void reset_cdef_job_info(AV1CdefSync *const cdef_sync) { + cdef_sync->end_of_frame = 0; + cdef_sync->fbr = 0; + cdef_sync->fbc = 0; + cdef_sync->cdef_mt_exit = false; +} + +static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &workers[i]; + worker->had_error = 0; + if (i == 0) + winterface->execute(worker); + else + winterface->launch(worker); + } +} + +static AOM_INLINE void sync_cdef_workers(AVxWorker *const workers, + AV1_COMMON *const cm, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int had_error = workers[0].had_error; + struct aom_internal_error_info error_info; + + // Read the error_info of main thread. 
+ if (had_error) { + AVxWorker *const worker = &workers[0]; + error_info = ((AV1CdefWorkerData *)worker->data2)->error_info; + } + + // Wait till all rows are finished. + for (int i = num_workers - 1; i > 0; --i) { + AVxWorker *const worker = &workers[i]; + if (!winterface->sync(worker)) { + had_error = 1; + error_info = ((AV1CdefWorkerData *)worker->data2)->error_info; + } + } + if (had_error) aom_internal_error_copy(cm->error, &error_info); +} + +// Updates the row index of the next job to be processed. +// Also updates end_of_frame flag when the processing of all rows is complete. +static void update_cdef_row_next_job_info(AV1CdefSync *const cdef_sync, + const int nvfb) { + cdef_sync->fbr++; + if (cdef_sync->fbr == nvfb) { + cdef_sync->end_of_frame = 1; + } +} + +// Checks if a job is available. If job is available, +// populates next job information and returns 1, else returns 0. +static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync, + volatile int *cur_fbr, + const int nvfb) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(cdef_sync->mutex_); +#endif // CONFIG_MULTITHREAD + int do_next_row = 0; + // Populates information needed for current job and update the row + // index of the next row to be processed. + if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) { + do_next_row = 1; + *cur_fbr = cdef_sync->fbr; + update_cdef_row_next_job_info(cdef_sync, nvfb); + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(cdef_sync->mutex_); +#endif // CONFIG_MULTITHREAD + return do_next_row; +} + +static void set_cdef_init_fb_row_done(AV1CdefSync *const cdef_sync, int nvfb) { + for (int fbr = 0; fbr < nvfb; fbr++) cdef_row_mt_sync_write(cdef_sync, fbr); +} + +// Hook function for each thread in CDEF multi-threading. 
+// arg1 is the shared AV1CdefSync and arg2 is this worker's AV1CdefWorkerData.
+// Returns 1 once no more filter-block rows are handed out, or 0 when the
+// setjmp() error path below is taken.
+static int cdef_sb_row_worker_hook(void *arg1, void *arg2) {
+  AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
+  AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2;
+  AV1_COMMON *cm = cdef_worker->cm;
+  // Number of filter-block rows: mi_rows / MI_SIZE_64X64, rounded up.
+  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *job_mutex_ = cdef_sync->mutex_;
+#endif
+  struct aom_internal_error_info *const error_info = &cdef_worker->error_info;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(error_info->jmp)) {
+    error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+    // Raise the exit flag under the job mutex; get_cdef_row_next_job() tests
+    // it under the same mutex and stops handing out rows once it is set.
+    pthread_mutex_lock(job_mutex_);
+    cdef_sync->cdef_mt_exit = true;
+    pthread_mutex_unlock(job_mutex_);
+#endif
+    // In case of cdef row-multithreading, the worker on a filter block row
+    // (fbr) waits for the line buffers (top and bottom) copy of the above row.
+    // Hence, in case a thread (main/worker) encounters an error before copying
+    // of the line buffers, update that line buffer copy is complete in order to
+    // avoid dependent workers waiting indefinitely.
+    set_cdef_init_fb_row_done(cdef_sync, nvfb);
+    return 0;
+  }
+  error_info->setjmp = 1;
+
+  // NOTE(review): cur_fbr is volatile, presumably because it is written after
+  // the setjmp() above - confirm before changing the qualifier.
+  volatile int cur_fbr;
+  const int num_planes = av1_num_planes(cm);
+  // Fetch one filter-block row index at a time until none is left (or the
+  // exit flag has been raised by another worker).
+  while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) {
+    MACROBLOCKD *xd = cdef_worker->xd;
+    av1_cdef_fb_row(cm, xd, cdef_worker->linebuf, cdef_worker->colbuf,
+                    cdef_worker->srcbuf, cur_fbr,
+                    cdef_worker->cdef_init_fb_row_fn, cdef_sync, error_info);
+    if (cdef_worker->do_extend_border) {
+      for (int plane = 0; plane < num_planes; ++plane) {
+        const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
+        const int is_uv = plane > 0;
+        // Row range [v_start, v_end) of this plane covered by cur_fbr; the
+        // shift accounts for the plane's vertical subsampling and the end is
+        // clamped to the plane's crop height.
+        const int mi_high = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
+        const int unit_height = MI_SIZE_64X64 << mi_high;
+        const int v_start = cur_fbr * unit_height;
+        const int v_end =
+            AOMMIN(v_start + unit_height, ybf->crop_heights[is_uv]);
+        aom_extend_frame_borders_plane_row(ybf, plane, v_start, v_end);
+      }
+    }
+  }
+  error_info->setjmp = 0;
+  return 1;
+}
+
+// Assigns CDEF hook function and thread data to each worker.
+// Every worker gets the same cm, xd, row-init callback, do_extend_border flag
+// and the shared cm->cdef_info.linebuf pointers; workers[i] is pointed at
+// 'hook' with data1 = cdef_sync and data2 = &cdef_worker[i].
+// Only cdef_worker[0] receives cm->cdef_info.srcbuf / colbuf here.
+// NOTE(review): srcbuf/colbuf of the other workers are not set in this
+// function - presumably they are allocated elsewhere; confirm.
+static void prepare_cdef_frame_workers(
+    AV1_COMMON *const cm, MACROBLOCKD *xd, AV1CdefWorkerData *const cdef_worker,
+    AVxWorkerHook hook, AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+    int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn,
+    int do_extend_border) {
+  const int num_planes = av1_num_planes(cm);
+
+  cdef_worker[0].srcbuf = cm->cdef_info.srcbuf;
+  for (int plane = 0; plane < num_planes; plane++)
+    cdef_worker[0].colbuf[plane] = cm->cdef_info.colbuf[plane];
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &workers[i];
+    cdef_worker[i].cm = cm;
+    cdef_worker[i].xd = xd;
+    cdef_worker[i].cdef_init_fb_row_fn = cdef_init_fb_row_fn;
+    cdef_worker[i].do_extend_border = do_extend_border;
+    for (int plane = 0; plane < num_planes; plane++)
+      cdef_worker[i].linebuf[plane] = cm->cdef_info.linebuf[plane];
+
+    worker->hook = hook;
+    worker->data1 = cdef_sync;
+    worker->data2 = &cdef_worker[i];
+  }
+}
+
+// Initializes row-level parameters for CDEF frame.
+// Fills fb_info for filter-block row 'fbr' (frame-boundary flags, source
+// buffer, damping, coefficient shift, per-plane top/bottom line buffer
+// pointers) and, for every row except the last, copies the CDEF_VBORDER rows
+// on either side of the boundary with row fbr + 1 into 'linebuf'.
+void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm,
+                             const MACROBLOCKD *const xd,
+                             CdefBlockInfo *const fb_info,
+                             uint16_t **const linebuf, uint16_t *const src,
+                             struct AV1CdefSyncData *const cdef_sync, int fbr) {
+  const int num_planes = av1_num_planes(cm);
+  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int luma_stride =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
+
+  // For the current filter block, its top-left corner mi structure (mi_tl)
+  // is first accessed to check whether the top and left boundaries are
+  // frame boundaries. Then bottom-left and top-right mi structures are
+  // accessed to check whether the bottom and right boundaries
+  // (respectively) are frame boundaries.
+  //
+  // Note that we can't just check the bottom-right mi structure - eg. if
+  // we're at the right-hand edge of the frame but not the bottom, then
+  // the bottom-right mi is NULL but the bottom-left is not.
+  fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0;
+  if (fbr != nvfb - 1)
+    fb_info->frame_boundary[BOTTOM] =
+        (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0;
+  else
+    fb_info->frame_boundary[BOTTOM] = 1;
+
+  fb_info->src = src;
+  fb_info->damping = cm->cdef_info.cdef_damping;
+  fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
+  av1_zero(fb_info->dir);
+  av1_zero(fb_info->var);
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    const int stride = luma_stride >> xd->plane[plane].subsampling_x;
+    // linebuf[plane] is laid out as nvfb top-line slots followed by nvfb
+    // bottom-line slots, each slot being CDEF_VBORDER * stride entries.
+    uint16_t *top_linebuf = &linebuf[plane][0];
+    uint16_t *bot_linebuf = &linebuf[plane][nvfb * CDEF_VBORDER * stride];
+    {
+      const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
+      // Both offsets are the first plane row of filter-block row fbr + 1.
+      const int top_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
+      const int bot_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
+
+      // Top line buffer of row fbr + 1: the last CDEF_VBORDER rows of this
+      // row.
+      if (fbr != nvfb - 1)
+        av1_cdef_copy_sb8_16(
+            cm, &top_linebuf[(fbr + 1) * CDEF_VBORDER * stride], stride,
+            xd->plane[plane].dst.buf, top_offset - CDEF_VBORDER, 0,
+            xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+      // Bottom line buffer of row fbr: the first CDEF_VBORDER rows of row
+      // fbr + 1.
+      if (fbr != nvfb - 1)
+        av1_cdef_copy_sb8_16(cm, &bot_linebuf[fbr * CDEF_VBORDER * stride],
+                             stride, xd->plane[plane].dst.buf, bot_offset, 0,
+                             xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+    }
+
+    fb_info->top_linebuf[plane] = &linebuf[plane][fbr * CDEF_VBORDER * stride];
+    fb_info->bot_linebuf[plane] =
+        &linebuf[plane]
+                [nvfb * CDEF_VBORDER * stride + (fbr * CDEF_VBORDER * stride)];
+  }
+
+  // NOTE(review): both helpers are defined outside this chunk; presumably the
+  // write marks this row's line-buffer copy as done and the read waits for
+  // the copy of the row above - confirm against their definitions.
+  cdef_row_mt_sync_write(cdef_sync, fbr);
+  cdef_row_mt_sync_read(cdef_sync, fbr);
+}
+
+// Implements multi-threading for CDEF.
+// Performs CDEF on the current frame (cm->cur_frame->buf).
+// Inputs:
+//   cm: Pointer to common structure.
+//   xd: Pointer to common current coding block structure.
+//   cdef_worker: Per-worker CDEF data, one entry per worker.
+//   workers: Worker threads to run the CDEF row hook on.
+//   cdef_sync: Shared CDEF job/row synchronization state.
+//   num_workers: Number of entries used in 'workers' and 'cdef_worker'.
+//   cdef_init_fb_row_fn: Row-init callback stored in every worker's data.
+//   do_extend_border: Flag stored in every worker's data.
+// Returns:
+//   Nothing will be returned.
+void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                       AV1CdefWorkerData *const cdef_worker,
+                       AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+                       int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn,
+                       int do_extend_border) {
+  YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf;
+  const int num_planes = av1_num_planes(cm);
+
+  // Point xd's plane destinations at the current frame before any worker
+  // reads them.
+  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
+                       num_planes);
+
+  // Reset the shared job cursor, wire up each worker's hook and data, start
+  // the workers, then wait for them; sync_cdef_workers() copies a worker's
+  // error into cm->error when one was reported.
+  reset_cdef_job_info(cdef_sync);
+  prepare_cdef_frame_workers(cm, xd, cdef_worker, cdef_sb_row_worker_hook,
+                             workers, cdef_sync, num_workers,
+                             cdef_init_fb_row_fn, do_extend_border);
+  launch_cdef_workers(workers, num_workers);
+  sync_cdef_workers(workers, cm, num_workers);
+}
+
+// Returns the extra top-right superblock delay needed for intraBC with row
+// multithreading: 0 when intraBC is not allowed, otherwise 2 for 128x128
+// superblocks and 4 for any other superblock size.
+int av1_get_intrabc_extra_top_right_sb_delay(const AV1_COMMON *cm) {
+  // No additional top-right delay when intraBC tool is not enabled.
+  if (!av1_allow_intrabc(cm)) return 0;
+  // Due to the hardware constraints on processing the intraBC tool with row
+  // multithreading, a top-right delay of 3 superblocks of size 128x128 or 5
+  // superblocks of size 64x64 is mandated. However, a minimum top-right delay
+  // of 1 superblock is assured with 'sync_range'. Hence return only the
+  // additional superblock delay when the intraBC tool is enabled.
+  return cm->seq_params->sb_size == BLOCK_128X128 ? 2 : 4;
+}
diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h
new file mode 100644
index 0000000000..675687dc98
--- /dev/null
+++ b/third_party/aom/av1/common/thread_common.h
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_
+#define AOM_AV1_COMMON_THREAD_COMMON_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/cdef.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+
+// One queued loop-filter job: filter one plane of the unit starting at
+// 'mi_row' in direction 'dir'. enqueue_lf_jobs() queues all dir == 0 jobs
+// (described there as the vertical ones) before the dir == 1 jobs.
+typedef struct AV1LfMTInfo {
+  int mi_row;
+  int plane;
+  int dir;
+  int lpf_opt_level;
+} AV1LfMTInfo;
+
+// Loopfilter row synchronization
+typedef struct AV1LfSyncData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_[MAX_MB_PLANE];
+  pthread_cond_t *cond_[MAX_MB_PLANE];
+#endif
+  // Allocate memory to store the loop-filtered superblock index in each row.
+  int *cur_sb_col[MAX_MB_PLANE];
+  // The optimal sync_range for different resolution and platform should be
+  // determined by testing. Currently, it is chosen to be a power-of-2 number.
+  int sync_range;
+  int rows;
+
+  // Row-based parallel loopfilter data
+  LFWorkerData *lfdata;
+  int num_workers;
+
+#if CONFIG_MULTITHREAD
+  // Guards job_queue dequeueing and lf_mt_exit (see get_lf_job_info()).
+  pthread_mutex_t *job_mutex;
+#endif
+  AV1LfMTInfo *job_queue;
+  int jobs_enqueued;
+  int jobs_dequeued;
+
+  // Initialized to false, set to true by the worker thread that encounters an
+  // error in order to abort the processing of other worker threads.
+  bool lf_mt_exit;
+} AV1LfSync;
+
+// One queued loop-restoration job for a single restoration-unit row of one
+// plane.
+typedef struct AV1LrMTInfo {
+  // Vertical limits handed to the row filter as RestorationTileLimits.
+  int v_start;
+  int v_end;
+  int lr_unit_row;
+  int plane;
+  // 1: only sync read is required; 0: only sync write is required.
+  int sync_mode;
+  // Row range copied back to the frame (and border-extended when requested)
+  // once the row has been filtered.
+  int v_copy_start;
+  int v_copy_end;
+} AV1LrMTInfo;
+
+// Per-worker loop-restoration data.
+typedef struct LoopRestorationWorkerData {
+  int32_t *rst_tmpbuf;
+  void *rlbs;
+  // Points at the frame-level AV1LrStruct; stored as void *.
+  void *lr_ctxt;
+  int do_extend_border;
+  // Per-worker error state; its jmp_buf is armed by the worker hook.
+  struct aom_internal_error_info error_info;
+} LRWorkerData;
+
+// Looprestoration row synchronization
+typedef struct AV1LrSyncData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_[MAX_MB_PLANE];
+  pthread_cond_t *cond_[MAX_MB_PLANE];
+#endif
+  // Allocate memory to store the loop-restoration block index in each row.
+  int *cur_sb_col[MAX_MB_PLANE];
+  // The optimal sync_range for different resolution and platform should be
+  // determined by testing. Currently, it is chosen to be a power-of-2 number.
+  int sync_range;
+  int rows;
+  int num_planes;
+
+  int num_workers;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *job_mutex;
+#endif
+  // Row-based parallel loop-restoration worker data (one entry per worker)
+  LRWorkerData *lrworkerdata;
+
+  AV1LrMTInfo *job_queue;
+  int jobs_enqueued;
+  int jobs_dequeued;
+  // Initialized to false, set to true by the worker thread that encounters
+  // an error in order to abort the processing of other worker threads.
+  bool lr_mt_exit;
+} AV1LrSync;
+
+// Per-worker CDEF data handed to the CDEF row hook as its second argument.
+typedef struct AV1CdefWorker {
+  AV1_COMMON *cm;
+  MACROBLOCKD *xd;
+  uint16_t *colbuf[MAX_MB_PLANE];
+  uint16_t *srcbuf;
+  uint16_t *linebuf[MAX_MB_PLANE];
+  // Row-init callback passed on to av1_cdef_fb_row().
+  cdef_init_fb_row_t cdef_init_fb_row_fn;
+  int do_extend_border;
+  // Per-worker error state; its jmp_buf is armed by the worker hook.
+  struct aom_internal_error_info error_info;
+} AV1CdefWorkerData;
+
+// Per-filter-block-row synchronization state.
+typedef struct AV1CdefRowSync {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *row_mutex_;
+  pthread_cond_t *row_cond_;
+#endif  // CONFIG_MULTITHREAD
+  int is_row_done;
+} AV1CdefRowSync;
+
+// Data related to CDEF search multi-thread synchronization.
+typedef struct AV1CdefSyncData {
+#if CONFIG_MULTITHREAD
+  // Mutex lock used while dispatching jobs.
+  pthread_mutex_t *mutex_;
+#endif  // CONFIG_MULTITHREAD
+  // Data related to CDEF row mt sync information
+  AV1CdefRowSync *cdef_row_mt;
+  // Flag to indicate all blocks are processed and end of frame is reached
+  int end_of_frame;
+  // Row index in units of 64x64 block
+  int fbr;
+  // Column index in units of 64x64 block
+  int fbc;
+  // Initialized to false, set to true by the worker thread that encounters
+  // an error in order to abort the processing of other worker threads.
+  bool cdef_mt_exit;
+} AV1CdefSync;
+
+void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                       AV1CdefWorkerData *const cdef_worker,
+                       AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+                       int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn,
+                       int do_extend_border);
+void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm,
+                             const MACROBLOCKD *const xd,
+                             CdefBlockInfo *const fb_info,
+                             uint16_t **const linebuf, uint16_t *const src,
+                             struct AV1CdefSyncData *const cdef_sync, int fbr);
+void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst,
+                          int dstride, const uint8_t *src, int src_voffset,
+                          int src_hoffset, int sstride, int vsize, int hsize);
+void av1_cdef_copy_sb8_16_lowbd(uint16_t *const dst, int dstride,
+                                const uint8_t *src, int src_voffset,
+                                int src_hoffset, int sstride, int vsize,
+                                int hsize);
+void av1_cdef_copy_sb8_16_highbd(uint16_t *const dst, int dstride,
+                                 const uint8_t *src, int src_voffset,
+                                 int src_hoffset, int sstride, int vsize,
+                                 int hsize);
+void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync,
+                         int num_workers);
+void av1_free_cdef_sync(AV1CdefSync *cdef_sync);
+
+// Deallocate loopfilter synchronization related mutex and data.
+void av1_loop_filter_dealloc(AV1LfSync *lf_sync);
+void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
+                           int width, int num_workers);
+
+void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync,
+                                   int num_mis_in_lpf_unit_height_log2);
+
+void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+                              struct macroblockd *xd, int plane_start,
+                              int plane_end, int partial_frame,
+                              AVxWorker *workers, int num_workers,
+                              AV1LfSync *lf_sync, int lpf_opt_level);
+
+void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                                          struct AV1Common *cm,
+                                          int optimized_lr, AVxWorker *workers,
+                                          int num_workers, AV1LrSync *lr_sync,
+                                          void *lr_ctxt, int do_extend_border);
+void av1_loop_restoration_dealloc(AV1LrSync *lr_sync);
+void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
+                                int num_workers, int num_rows_lr,
+                                int num_planes, int width);
+int av1_get_intrabc_extra_top_right_sb_delay(const AV1_COMMON *cm);
+
+void av1_thread_loop_filter_rows(
+    const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
+    struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane,
+    int dir, int lpf_opt_level, AV1LfSync *const lf_sync,
+    struct aom_internal_error_info *error_info,
+    AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int mib_size_log2);
+
+// Returns true when no loop-filter job should be queued for 'plane'.
+// With lpf_opt_level == 2 the U job stands for both chroma planes, so U is
+// skipped only when neither U nor V is to be filtered and V is always
+// skipped; otherwise a plane is skipped exactly when planes_to_lf[plane] is 0.
+static AOM_FORCE_INLINE bool skip_loop_filter_plane(
+    const int planes_to_lf[MAX_MB_PLANE], int plane, int lpf_opt_level) {
+  // If LPF_PICK_METHOD is LPF_PICK_FROM_Q, we have the option to filter both
+  // chroma planes together
+  if (lpf_opt_level == 2) {
+    if (plane == AOM_PLANE_Y) {
+      return !planes_to_lf[plane];
+    }
+    if (plane == AOM_PLANE_U) {
+      // U and V are handled together
+      return !planes_to_lf[1] && !planes_to_lf[2];
+    }
+    assert(plane == AOM_PLANE_V);
+    if (plane == AOM_PLANE_V) {
+      // V is handled when u is filtered
+      return true;
+    }
+  }
+
+  // Normal operation mode
+  return !planes_to_lf[plane];
+}
+
+// Rebuilds lf_sync->job_queue from scratch: one job per (dir, unit row,
+// plane) for mi rows in [start, stop) stepped by num_mis_in_lpf_unit_height,
+// leaving jobs_enqueued at the number of jobs written and jobs_dequeued at 0.
+// The queue itself is not bounds-checked here; the caller must have sized it.
+static AOM_INLINE void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
+                                       const int planes_to_lf[MAX_MB_PLANE],
+                                       int lpf_opt_level,
+                                       int num_mis_in_lpf_unit_height) {
+  int mi_row, plane, dir;
+  AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
+  lf_sync->jobs_enqueued = 0;
+  lf_sync->jobs_dequeued = 0;
+
+  // Launch all vertical jobs first, as they are blocking the horizontal ones.
+  // Launch top row jobs for all planes first, in case the output can be
+  // partially reconstructed row by row.
+  for (dir = 0; dir < 2; ++dir) {
+    for (mi_row = start; mi_row < stop; mi_row += num_mis_in_lpf_unit_height) {
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) {
+          continue;
+        }
+        // Not subsumed by the check above: with lpf_opt_level == 2 the U
+        // plane passes skip_loop_filter_plane() when only V is enabled, and
+        // this test then drops the job.
+        if (!planes_to_lf[plane]) continue;
+        lf_job_queue->mi_row = mi_row;
+        lf_job_queue->plane = plane;
+        lf_job_queue->dir = dir;
+        lf_job_queue->lpf_opt_level = lpf_opt_level;
+        lf_job_queue++;
+        lf_sync->jobs_enqueued++;
+      }
+    }
+  }
+}
+
+// Prepares lf_sync for one frame: (re)allocates it when it was never
+// allocated (sync_range == 0), the unit-row count changed, or more workers
+// are requested than it was sized for; clears the exit flag; resets every
+// row's progress marker to -1; and queues the jobs.
+static AOM_INLINE void loop_filter_frame_mt_init(
+    AV1_COMMON *cm, int start_mi_row, int end_mi_row,
+    const int planes_to_lf[MAX_MB_PLANE], int num_workers, AV1LfSync *lf_sync,
+    int lpf_opt_level, int num_mis_in_lpf_unit_height_log2) {
+  // Number of superblock rows
+  const int sb_rows =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2);
+
+  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+      num_workers > lf_sync->num_workers) {
+    av1_loop_filter_dealloc(lf_sync);
+    av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+  }
+  lf_sync->lf_mt_exit = false;
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  for (int i = 0; i < MAX_MB_PLANE; i++) {
+    memset(lf_sync->cur_sb_col[i], -1,
+           sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
+  }
+
+  enqueue_lf_jobs(lf_sync, start_mi_row, end_mi_row, planes_to_lf,
+                  lpf_opt_level, (1 << num_mis_in_lpf_unit_height_log2));
+}
+
+// Pops the next job from lf_sync->job_queue under job_mutex. Returns NULL
+// when the queue is drained or lf_mt_exit is set. Without CONFIG_MULTITHREAD
+// this always returns NULL.
+static AOM_INLINE AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
+  AV1LfMTInfo *cur_job_info = NULL;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(lf_sync->job_mutex);
+
+  if (!lf_sync->lf_mt_exit && lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
+    cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued;
+    lf_sync->jobs_dequeued++;
+  }
+
+  pthread_mutex_unlock(lf_sync->job_mutex);
+#else
+  (void)lf_sync;
+#endif
+
+  return cur_job_info;
+}
+
+// Points lf_data at the given frame buffer, cm and xd, and copies each
+// plane's destination buffer description and subsampling from xd->plane.
+static AOM_INLINE void loop_filter_data_reset(LFWorkerData *lf_data,
+                                              YV12_BUFFER_CONFIG *frame_buffer,
+                                              struct AV1Common *cm,
+                                              MACROBLOCKD *xd) {
+  struct macroblockd_plane *pd = xd->plane;
+  lf_data->frame_buffer = frame_buffer;
+  lf_data->cm = cm;
+  lf_data->xd = xd;
+  for (int i = 0; i < MAX_MB_PLANE; i++) {
+    memcpy(&lf_data->planes[i].dst, &pd[i].dst, sizeof(lf_data->planes[i].dst));
+    lf_data->planes[i].subsampling_x = pd[i].subsampling_x;
+    lf_data->planes[i].subsampling_y = pd[i].subsampling_y;
+  }
+}
+
+// Sets planes_to_lf[p] to non-zero when plane p lies in
+// [plane_start, plane_end) and its filter level is non-zero (for luma, when
+// either of the two luma levels is non-zero).
+static AOM_INLINE void set_planes_to_loop_filter(const struct loopfilter *lf,
+                                                 int planes_to_lf[MAX_MB_PLANE],
+                                                 int plane_start,
+                                                 int plane_end) {
+  // For each luma and chroma plane, whether to filter it or not.
+  planes_to_lf[0] = (lf->filter_level[0] || lf->filter_level[1]) &&
+                    plane_start <= 0 && 0 < plane_end;
+  planes_to_lf[1] = lf->filter_level_u && plane_start <= 1 && 1 < plane_end;
+  planes_to_lf[2] = lf->filter_level_v && plane_start <= 2 && 2 < plane_end;
+}
+
+// Fills planes_to_lf via set_planes_to_loop_filter() and returns 1 when
+// there is filtering to do, 0 otherwise.
+static AOM_INLINE int check_planes_to_loop_filter(
+    const struct loopfilter *lf, int planes_to_lf[MAX_MB_PLANE],
+    int plane_start, int plane_end) {
+  set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end);
+  // If the luma plane is purposely not filtered, neither are the chroma
+  // planes.
+  if (!planes_to_lf[0] && plane_start <= 0 && 0 < plane_end) return 0;
+  // Early exit.
+  if (!planes_to_lf[0] && !planes_to_lf[1] && !planes_to_lf[2]) return 0;
+  return 1;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_THREAD_COMMON_H_
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
new file mode 100644
index 0000000000..b964f259b8
--- /dev/null
+++ b/third_party/aom/av1/common/tile_common.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// NOTE(review): the header name on this include was missing in this copy of
+// the patch; <stdbool.h> is restored because this file uses bool/true/false.
+// Confirm against the upstream file.
+#include <stdbool.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Sets both the row and the column extents of 'tile' for tile (row, col).
+void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) {
+  av1_tile_set_row(tile, cm, row);
+  av1_tile_set_col(tile, cm, col);
+}
+
+// Find smallest k>=0 such that (blk_size << k) >= target
+// blk_size must be positive: with blk_size == 0 and target > 0 the loop
+// condition never becomes false.
+static int tile_log2(int blk_size, int target) {
+  int k;
+  for (k = 0; (blk_size << k) < target; k++) {
+  }
+  return k;
+}
+
+// Derives the tile-count limits for the current frame size and stores them
+// in cm->tiles: max_width_sb, min_log2_cols, max_log2_cols, max_log2_rows
+// and min_log2.
+void av1_get_tile_limits(AV1_COMMON *const cm) {
+  const SequenceHeader *const seq_params = cm->seq_params;
+  CommonTileParams *const tiles = &cm->tiles;
+  // Frame size in superblocks, rounded up.
+  const int sb_cols =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
+  const int sb_rows =
+      CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2);
+
+  const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2;
+  tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
+
+#if CONFIG_CWG_C013
+  bool use_level_7_above = false;
+  for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
+    if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_7_0 &&
+        seq_params->seq_level_idx[i] <= SEQ_LEVEL_8_3) {
+      // Currently it is assumed that levels 7.x and 8.x are either used for all
+      // operating points, or none of them.
+      // NOTE(review): this only rejects a 7.x/8.x operating point that
+      // follows a lower-level one; a lower-level point following a 7.x/8.x
+      // point is not detected by this loop.
+      if (i != 0 && !use_level_7_above) {
+        aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                           "Either all the operating points are levels 7.x or "
+                           "8.x, or none of them are.");
+      }
+      use_level_7_above = true;
+    }
+  }
+  const int max_tile_area_sb =
+      (use_level_7_above ? MAX_TILE_AREA_LEVEL_7_AND_ABOVE : MAX_TILE_AREA) >>
+      (2 * sb_size_log2);
+#else
+  const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2);
+#endif
+
+  tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols);
+  tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
+  tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS));
+  tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows);
+  tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols);
+}
+
+// Computes the column layout in 'tiles'.
+// Uniform spacing: derives col_start_sb[], cols, width, min_log2_rows and
+// max_height_sb from log2_cols.
+// Non-uniform spacing: col_start_sb[] and cols are inputs; derives log2_cols
+// and max_height_sb from them.
+// In both cases min_inner_width is set only when there is more than one tile
+// column and is -1 otherwise.
+void av1_calculate_tile_cols(const SequenceHeader *const seq_params,
+                             int cm_mi_rows, int cm_mi_cols,
+                             CommonTileParams *const tiles) {
+  int sb_cols = CEIL_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2);
+  int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
+  int i;
+
+  // This will be overridden if there is at least two columns of tiles
+  // (otherwise there is no inner tile width)
+  tiles->min_inner_width = -1;
+
+  if (tiles->uniform_spacing) {
+    int start_sb;
+    int size_sb = CEIL_POWER_OF_TWO(sb_cols, tiles->log2_cols);
+    assert(size_sb > 0);
+    for (i = 0, start_sb = 0; start_sb < sb_cols; i++) {
+      tiles->col_start_sb[i] = start_sb;
+      start_sb += size_sb;
+    }
+    tiles->cols = i;
+    // Sentinel entry: one past the last tile column.
+    tiles->col_start_sb[i] = sb_cols;
+    tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0);
+    tiles->max_height_sb = sb_rows >> tiles->min_log2_rows;
+
+    tiles->width = size_sb << seq_params->mib_size_log2;
+    tiles->width = AOMMIN(tiles->width, cm_mi_cols);
+    if (tiles->cols > 1) {
+      tiles->min_inner_width = tiles->width;
+    }
+  } else {
+    int max_tile_area_sb = (sb_rows * sb_cols);
+    int widest_tile_sb = 1;
+    int narrowest_inner_tile_sb = 65536;
+    tiles->log2_cols = tile_log2(1, tiles->cols);
+    for (i = 0; i < tiles->cols; i++) {
+      int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
+      widest_tile_sb = AOMMAX(widest_tile_sb, size_sb);
+      // ignore the rightmost tile in frame for determining the narrowest
+      if (i < tiles->cols - 1)
+        narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb);
+    }
+    if (tiles->min_log2) {
+      max_tile_area_sb >>= (tiles->min_log2 + 1);
+    }
+    tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
+    if (tiles->cols > 1) {
+      tiles->min_inner_width = narrowest_inner_tile_sb
+                               << seq_params->mib_size_log2;
+    }
+  }
+}
+
+// Computes the row layout in 'tiles'.
+// Uniform spacing: derives row_start_sb[], rows and height from log2_rows.
+// Non-uniform spacing: only log2_rows is derived, from the given rows count.
+void av1_calculate_tile_rows(const SequenceHeader *const seq_params,
+                             int cm_mi_rows, CommonTileParams *const tiles) {
+  int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2);
+  int start_sb, size_sb, i;
+
+  if (tiles->uniform_spacing) {
+    size_sb = CEIL_POWER_OF_TWO(sb_rows, tiles->log2_rows);
+    assert(size_sb > 0);
+    for (i = 0, start_sb = 0; start_sb < sb_rows; i++) {
+      tiles->row_start_sb[i] = start_sb;
+      start_sb += size_sb;
+    }
+    tiles->rows = i;
+    // Sentinel entry: one past the last tile row.
+    tiles->row_start_sb[i] = sb_rows;
+
+    tiles->height = size_sb << seq_params->mib_size_log2;
+    tiles->height = AOMMIN(tiles->height, cm_mi_rows);
+  } else {
+    tiles->log2_rows = tile_log2(1, tiles->rows);
+  }
+}
+
+// Sets tile->tile_row and the tile's mi row range [mi_row_start, mi_row_end)
+// from cm->tiles.row_start_sb; the end is clamped to the frame's mi_rows.
+void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
+  assert(row < cm->tiles.rows);
+  int mi_row_start = cm->tiles.row_start_sb[row]
+                     << cm->seq_params->mib_size_log2;
+  int mi_row_end = cm->tiles.row_start_sb[row + 1]
+                   << cm->seq_params->mib_size_log2;
+  tile->tile_row = row;
+  tile->mi_row_start = mi_row_start;
+  tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows);
+  assert(tile->mi_row_end > tile->mi_row_start);
+}
+
+// Sets tile->tile_col and the tile's mi column range
+// [mi_col_start, mi_col_end) from cm->tiles.col_start_sb; the end is clamped
+// to the frame's mi_cols.
+void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
+  assert(col < cm->tiles.cols);
+  int mi_col_start = cm->tiles.col_start_sb[col]
+                     << cm->seq_params->mib_size_log2;
+  int mi_col_end = cm->tiles.col_start_sb[col + 1]
+                   << cm->seq_params->mib_size_log2;
+  tile->tile_col = col;
+  tile->mi_col_start = mi_col_start;
+  tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols);
+  assert(tile->mi_col_end > tile->mi_col_start);
+}
+
+// Returns the tile's height in superblocks, rounded up.
+int av1_get_sb_rows_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
+  return CEIL_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start,
+                           cm->seq_params->mib_size_log2);
+}
+
+// Returns the tile's width in superblocks, rounded up.
+int av1_get_sb_cols_in_tile(const AV1_COMMON *cm, const TileInfo *tile) {
+  return CEIL_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start,
+                           cm->seq_params->mib_size_log2);
+}
+
+// Returns the tile's pixel rectangle in the luma plane (is_uv == 0) or in a
+// chroma plane (is_uv != 0), in the superres-upscaled frame and clamped to
+// its right and bottom edges.
+PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
+                            int is_uv) {
+  PixelRect r;
+
+  // Calculate position in the Y plane
+  r.left = tile_info->mi_col_start * MI_SIZE;
+  r.right = tile_info->mi_col_end * MI_SIZE;
+  r.top = tile_info->mi_row_start * MI_SIZE;
+  r.bottom = tile_info->mi_row_end * MI_SIZE;
+
+  // If upscaling is enabled, the tile limits need scaling to match the
+  // upscaled frame where the restoration units live. To do this, scale up the
+  // top-left and bottom-right of the tile.
+  if (av1_superres_scaled(cm)) {
+    av1_calculate_unscaled_superres_size(&r.left, &r.top,
+                                         cm->superres_scale_denominator);
+    av1_calculate_unscaled_superres_size(&r.right, &r.bottom,
+                                         cm->superres_scale_denominator);
+  }
+
+  const int frame_w = cm->superres_upscaled_width;
+  const int frame_h = cm->superres_upscaled_height;
+
+  // Make sure we don't fall off the bottom-right of the frame.
+  r.right = AOMMIN(r.right, frame_w);
+  r.bottom = AOMMIN(r.bottom, frame_h);
+
+  // Convert to coordinates in the appropriate plane
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
+
+  r.left = ROUND_POWER_OF_TWO(r.left, ss_x);
+  r.right = ROUND_POWER_OF_TWO(r.right, ss_x);
+  r.top = ROUND_POWER_OF_TWO(r.top, ss_y);
+  r.bottom = ROUND_POWER_OF_TWO(r.bottom, ss_y);
+
+  return r;
+}
+
+// Writes the common tile width and height to *w and *h. With uniform spacing
+// these are tiles->width / tiles->height; otherwise they are taken from the
+// explicit tile boundaries, which are asserted to be equally sized.
+// *w / *h are left untouched when the corresponding tile count is zero.
+void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) {
+  const CommonTileParams *const tiles = &cm->tiles;
+  if (tiles->uniform_spacing) {
+    *w = tiles->width;
+    *h = tiles->height;
+  } else {
+    for (int i = 0; i < tiles->cols; ++i) {
+      const int tile_width_sb =
+          tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
+      const int tile_w = tile_width_sb * cm->seq_params->mib_size;
+      assert(i == 0 || tile_w == *w);  // ensure all tiles have same dimension
+      *w = tile_w;
+    }
+
+    for (int i = 0; i < tiles->rows; ++i) {
+      const int tile_height_sb =
+          tiles->row_start_sb[i + 1] - tiles->row_start_sb[i];
+      const int tile_h = tile_height_sb * cm->seq_params->mib_size;
+      assert(i == 0 || tile_h == *h);  // ensure all tiles have same dimension
+      *h = tile_h;
+    }
+  }
+}
+
+// Returns 1 when the minimum inner tile width, converted with MI_SIZE_LOG2,
+// is at least 64, or at least 128 when superres scaling is active; always 1
+// for a single tile column.
+int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) {
+  // Disable check if there is a single tile col in the frame
+  if (cm->tiles.cols == 1) return 1;
+
+  return ((cm->tiles.min_inner_width << MI_SIZE_LOG2) >=
+          (64 << av1_superres_scaled(cm)));
+}
diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h
new file mode 100644
index 0000000000..5383ae940b
--- /dev/null
+++ b/third_party/aom/av1/common/tile_common.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_TILE_COMMON_H_ +#define AOM_AV1_COMMON_TILE_COMMON_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "config/aom_config.h" +#include "aom_dsp/rect.h" + +struct AV1Common; +struct SequenceHeader; +struct CommonTileParams; + +#define DEFAULT_MAX_NUM_TG 1 + +typedef struct TileInfo { + int mi_row_start, mi_row_end; + int mi_col_start, mi_col_end; + int tile_row; + int tile_col; +} TileInfo; + +// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on +// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)' +void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row, + int col); + +void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row); +void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col); + +int av1_get_sb_rows_in_tile(const struct AV1Common *cm, const TileInfo *tile); +int av1_get_sb_cols_in_tile(const struct AV1Common *cm, const TileInfo *tile); + +// Return the pixel extents of the given tile +PixelRect av1_get_tile_rect(const TileInfo *tile_info, + const struct AV1Common *cm, int is_uv); + +// Define tile maximum width and area +// There is no maximum height since height is limited by area and width limits +// The minimum tile width or height is fixed at one superblock +#define MAX_TILE_WIDTH (4096) // Max Tile width in pixels +#define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels +#if CONFIG_CWG_C013 +#define MAX_TILE_AREA_LEVEL_7_AND_ABOVE (4096 * 4608) +#endif + +void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h); +void av1_get_tile_limits(struct AV1Common *const cm); +void 
av1_calculate_tile_cols(const struct SequenceHeader *const seq_params, + int cm_mi_rows, int cm_mi_cols, + struct CommonTileParams *const tiles); +void av1_calculate_tile_rows(const struct SequenceHeader *const seq_params, + int cm_mi_rows, + struct CommonTileParams *const tiles); + +// Checks if the minimum tile_width requirement is satisfied +int av1_is_min_tile_width_satisfied(const struct AV1Common *cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_TILE_COMMON_H_ diff --git a/third_party/aom/av1/common/timing.c b/third_party/aom/av1/common/timing.c new file mode 100644 index 0000000000..a959cdf768 --- /dev/null +++ b/third_party/aom/av1/common/timing.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/timing.h" + +/* Tables for AV1 max bitrates for different levels of main and high tier. + * The tables are in Kbps instead of Mbps in the specification. + * Note that depending on the profile, a multiplier is needed. + */ +#define UNDEFINED_RATE \ + (1 << 21) // Placeholder rate for levels with undefined rate +#define INVALID_RATE \ + (0) // For invalid profile-level configuration, set rate to 0 + +/* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps [31] */ +/* is a dummy value. The decoder model is not applicable for level 31. 
*/ +static int32_t main_kbps[1 << LEVEL_BITS] = { + 1500, 3000, UNDEFINED_RATE, UNDEFINED_RATE, + 6000, 10000, UNDEFINED_RATE, UNDEFINED_RATE, + 12000, 20000, UNDEFINED_RATE, UNDEFINED_RATE, + 30000, 40000, 60000, 60000, + 60000, 100000, 160000, 160000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE +}; + +/* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */ +/* is a dummy value. The decoder model is not applicable for level 31. */ +static int32_t high_kbps[1 << LEVEL_BITS] = { + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + 30000, 50000, UNDEFINED_RATE, UNDEFINED_RATE, + 100000, 160000, 240000, 240000, + 240000, 480000, 800000, 800000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE +}; + +/* BitrateProfileFactor */ +static int bitrate_profile_factor[1 << PROFILE_BITS] = { + 1, 2, 3, 0, 0, 0, 0, 0 +}; + +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier) { + int64_t bitrate; + + if (seq_tier) { + bitrate = high_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; + } else { + bitrate = main_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; + } + + return bitrate * 1000; +} + +void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) { + decoder_model->encoder_decoder_buffer_delay_length = 16; + decoder_model->buffer_removal_time_length = 10; + decoder_model->frame_presentation_time_length = 10; +} + +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { + op_params->decoder_model_param_present_flag = 1; + op_params->decoder_buffer_delay = 90000 >> 1; // 0.5 s + 
op_params->encoder_buffer_delay = 90000 >> 1; // 0.5 s + op_params->low_delay_mode_flag = 0; + op_params->display_model_param_present_flag = 1; + op_params->initial_display_delay = 8; // 8 frames delay +} + +void av1_set_resource_availability_parameters( + aom_dec_model_op_parameters_t *op_params) { + op_params->decoder_model_param_present_flag = 0; + op_params->decoder_buffer_delay = + 70000; // Resource availability mode default + op_params->encoder_buffer_delay = + 20000; // Resource availability mode default + op_params->low_delay_mode_flag = 0; // Resource availability mode default + op_params->display_model_param_present_flag = 1; + op_params->initial_display_delay = 8; // 8 frames delay +} diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h new file mode 100644 index 0000000000..9192124f72 --- /dev/null +++ b/third_party/aom/av1/common/timing.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_TIMING_H_ +#define AOM_AV1_COMMON_TIMING_H_ + +#include "aom/aom_integer.h" +#include "av1/common/enums.h" + +#define MAX_NUM_OP_POINTS 32 + +typedef struct aom_timing { + uint32_t num_units_in_display_tick; + uint32_t time_scale; + int equal_picture_interval; + uint32_t num_ticks_per_picture; +} aom_timing_info_t; + +typedef struct aom_dec_model_info { + uint32_t num_units_in_decoding_tick; + int encoder_decoder_buffer_delay_length; + int buffer_removal_time_length; + int frame_presentation_time_length; +} aom_dec_model_info_t; + +typedef struct aom_dec_model_op_parameters { + int decoder_model_param_present_flag; + int64_t bitrate; + int64_t buffer_size; + uint32_t decoder_buffer_delay; + uint32_t encoder_buffer_delay; + int low_delay_mode_flag; + int display_model_param_present_flag; + int initial_display_delay; +} aom_dec_model_op_parameters_t; + +void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model); + +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params); + +void av1_set_resource_availability_parameters( + aom_dec_model_op_parameters_t *op_params); + +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier); + +#endif // AOM_AV1_COMMON_TIMING_H_ diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h new file mode 100644 index 0000000000..f1edda58d7 --- /dev/null +++ b/third_party/aom/av1/common/token_cdfs.h @@ -0,0 +1,3555 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_ +#define AOM_AV1_COMMON_TOKEN_CDFS_H_ + +#include "config/aom_config.h" + +#include "av1/common/entropy.h" + +static const aom_cdf_prob + av1_default_dc_sign_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][DC_SIGN_CONTEXTS] + [CDF_SIZE(2)] = { + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + }; + +static const aom_cdf_prob + av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS] + [CDF_SIZE(2)] = { { { { AOM_CDF2(31849) }, + { AOM_CDF2(5892) }, + { AOM_CDF2(12112) }, + { AOM_CDF2(21935) }, + { AOM_CDF2(20289) }, + { AOM_CDF2(27473) }, + { AOM_CDF2(32487) }, + { AOM_CDF2(7654) }, + { AOM_CDF2(19473) }, + { AOM_CDF2(29984) }, + { AOM_CDF2(9961) }, + { AOM_CDF2(30242) }, + { AOM_CDF2(32117) } }, + { { AOM_CDF2(31548) }, + { AOM_CDF2(1549) }, + { AOM_CDF2(10130) }, + { AOM_CDF2(16656) }, + { AOM_CDF2(18591) }, + { AOM_CDF2(26308) }, + { AOM_CDF2(32537) }, + { AOM_CDF2(5403) }, + { AOM_CDF2(18096) }, + { AOM_CDF2(30003) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(29957) }, + { AOM_CDF2(5391) }, + { AOM_CDF2(18039) }, + { 
AOM_CDF2(23566) }, + { AOM_CDF2(22431) }, + { AOM_CDF2(25822) }, + { AOM_CDF2(32197) }, + { AOM_CDF2(3778) }, + { AOM_CDF2(15336) }, + { AOM_CDF2(28981) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(17920) }, + { AOM_CDF2(1818) }, + { AOM_CDF2(7282) }, + { AOM_CDF2(25273) }, + { AOM_CDF2(10923) }, + { AOM_CDF2(31554) }, + { AOM_CDF2(32624) }, + { AOM_CDF2(1366) }, + { AOM_CDF2(15628) }, + { AOM_CDF2(30462) }, + { AOM_CDF2(146) }, + { AOM_CDF2(5132) }, + { AOM_CDF2(31657) } }, + { { AOM_CDF2(6308) }, + { AOM_CDF2(117) }, + { AOM_CDF2(1638) }, + { AOM_CDF2(2161) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(10923) }, + { AOM_CDF2(30247) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(30371) }, + { AOM_CDF2(7570) }, + { AOM_CDF2(13155) }, + { AOM_CDF2(20751) }, + { AOM_CDF2(20969) }, + { AOM_CDF2(27067) }, + { AOM_CDF2(32013) }, + { AOM_CDF2(5495) }, + { AOM_CDF2(17942) }, + { AOM_CDF2(28280) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31782) }, + { AOM_CDF2(1836) }, + { AOM_CDF2(10689) }, + { AOM_CDF2(17604) }, + { AOM_CDF2(21622) }, + { AOM_CDF2(27518) }, + { AOM_CDF2(32399) }, + { AOM_CDF2(4419) }, + { AOM_CDF2(16294) }, + { AOM_CDF2(28345) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31901) }, + { AOM_CDF2(10311) }, + { AOM_CDF2(18047) }, + { AOM_CDF2(24806) }, + { AOM_CDF2(23288) }, + { AOM_CDF2(27914) }, + { AOM_CDF2(32296) }, + { AOM_CDF2(4215) }, + { AOM_CDF2(15756) }, + { AOM_CDF2(28341) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(26726) }, + { AOM_CDF2(1045) }, + { AOM_CDF2(11703) }, + { AOM_CDF2(20590) }, + { AOM_CDF2(18554) }, + { AOM_CDF2(25970) }, + { AOM_CDF2(31938) }, + { AOM_CDF2(5583) }, + { AOM_CDF2(21313) }, + { AOM_CDF2(29390) }, + { AOM_CDF2(641) }, + { 
AOM_CDF2(22265) }, + { AOM_CDF2(31452) } }, + { { AOM_CDF2(26584) }, + { AOM_CDF2(188) }, + { AOM_CDF2(8847) }, + { AOM_CDF2(24519) }, + { AOM_CDF2(22938) }, + { AOM_CDF2(30583) }, + { AOM_CDF2(32608) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(29614) }, + { AOM_CDF2(9068) }, + { AOM_CDF2(12924) }, + { AOM_CDF2(19538) }, + { AOM_CDF2(17737) }, + { AOM_CDF2(24619) }, + { AOM_CDF2(30642) }, + { AOM_CDF2(4119) }, + { AOM_CDF2(16026) }, + { AOM_CDF2(25657) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31957) }, + { AOM_CDF2(3230) }, + { AOM_CDF2(11153) }, + { AOM_CDF2(18123) }, + { AOM_CDF2(20143) }, + { AOM_CDF2(26536) }, + { AOM_CDF2(31986) }, + { AOM_CDF2(3050) }, + { AOM_CDF2(14603) }, + { AOM_CDF2(25155) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(32363) }, + { AOM_CDF2(10692) }, + { AOM_CDF2(19090) }, + { AOM_CDF2(24357) }, + { AOM_CDF2(24442) }, + { AOM_CDF2(28312) }, + { AOM_CDF2(32169) }, + { AOM_CDF2(3648) }, + { AOM_CDF2(15690) }, + { AOM_CDF2(26815) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(30669) }, + { AOM_CDF2(3832) }, + { AOM_CDF2(11663) }, + { AOM_CDF2(18889) }, + { AOM_CDF2(19782) }, + { AOM_CDF2(23313) }, + { AOM_CDF2(31330) }, + { AOM_CDF2(5124) }, + { AOM_CDF2(18719) }, + { AOM_CDF2(28468) }, + { AOM_CDF2(3082) }, + { AOM_CDF2(20982) }, + { AOM_CDF2(29443) } }, + { { AOM_CDF2(28573) }, + { AOM_CDF2(3183) }, + { AOM_CDF2(17802) }, + { AOM_CDF2(25977) }, + { AOM_CDF2(26677) }, + { AOM_CDF2(27832) }, + { AOM_CDF2(32387) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(26887) }, + { AOM_CDF2(6729) }, + { AOM_CDF2(10361) }, + { AOM_CDF2(17442) }, + { AOM_CDF2(15045) }, + { AOM_CDF2(22478) }, + 
{ AOM_CDF2(29072) }, + { AOM_CDF2(2713) }, + { AOM_CDF2(11861) }, + { AOM_CDF2(20773) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31903) }, + { AOM_CDF2(2044) }, + { AOM_CDF2(7528) }, + { AOM_CDF2(14618) }, + { AOM_CDF2(16182) }, + { AOM_CDF2(24168) }, + { AOM_CDF2(31037) }, + { AOM_CDF2(2786) }, + { AOM_CDF2(11194) }, + { AOM_CDF2(20155) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(32510) }, + { AOM_CDF2(8430) }, + { AOM_CDF2(17318) }, + { AOM_CDF2(24154) }, + { AOM_CDF2(23674) }, + { AOM_CDF2(28789) }, + { AOM_CDF2(32139) }, + { AOM_CDF2(3440) }, + { AOM_CDF2(13117) }, + { AOM_CDF2(22702) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31671) }, + { AOM_CDF2(2056) }, + { AOM_CDF2(11746) }, + { AOM_CDF2(16852) }, + { AOM_CDF2(18635) }, + { AOM_CDF2(24715) }, + { AOM_CDF2(31484) }, + { AOM_CDF2(4656) }, + { AOM_CDF2(16074) }, + { AOM_CDF2(24704) }, + { AOM_CDF2(1806) }, + { AOM_CDF2(14645) }, + { AOM_CDF2(25336) } }, + { { AOM_CDF2(31539) }, + { AOM_CDF2(8433) }, + { AOM_CDF2(20576) }, + { AOM_CDF2(27904) }, + { AOM_CDF2(27852) }, + { AOM_CDF2(30026) }, + { AOM_CDF2(32441) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } } }; + +static const aom_cdf_prob + av1_default_eob_extra_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [EOB_COEF_CONTEXTS][CDF_SIZE(2)] = { + { { { + { AOM_CDF2(16961) }, + { AOM_CDF2(17223) }, + { AOM_CDF2(7621) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(19069) }, + { AOM_CDF2(22525) }, + { AOM_CDF2(13377) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20401) }, + { AOM_CDF2(17025) }, + { 
AOM_CDF2(12845) }, + { AOM_CDF2(12873) }, + { AOM_CDF2(14094) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20681) }, + { AOM_CDF2(20701) }, + { AOM_CDF2(15250) }, + { AOM_CDF2(15017) }, + { AOM_CDF2(14928) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(23905) }, + { AOM_CDF2(17194) }, + { AOM_CDF2(16170) }, + { AOM_CDF2(17695) }, + { AOM_CDF2(13826) }, + { AOM_CDF2(15810) }, + { AOM_CDF2(12036) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(23959) }, + { AOM_CDF2(20799) }, + { AOM_CDF2(19021) }, + { AOM_CDF2(16203) }, + { AOM_CDF2(17886) }, + { AOM_CDF2(14144) }, + { AOM_CDF2(12010) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(27399) }, + { AOM_CDF2(16327) }, + { AOM_CDF2(18071) }, + { AOM_CDF2(19584) }, + { AOM_CDF2(20721) }, + { AOM_CDF2(18432) }, + { AOM_CDF2(19560) }, + { AOM_CDF2(10150) }, + { AOM_CDF2(8805) }, + }, + { + { AOM_CDF2(24932) }, + { AOM_CDF2(20833) }, + { AOM_CDF2(12027) }, + { AOM_CDF2(16670) }, + { AOM_CDF2(19914) }, + { AOM_CDF2(15106) }, + { AOM_CDF2(17662) }, + { AOM_CDF2(13783) }, + { AOM_CDF2(28756) }, + } }, + { { + { AOM_CDF2(23406) }, + { AOM_CDF2(21845) }, + { AOM_CDF2(18432) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(17096) }, + { AOM_CDF2(12561) }, + { AOM_CDF2(17320) }, + { AOM_CDF2(22395) }, + { AOM_CDF2(21370) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(17471) }, + { AOM_CDF2(20223) }, + { AOM_CDF2(11357) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20335) }, + { AOM_CDF2(21667) }, + { AOM_CDF2(14818) }, + { 
AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20430) }, + { AOM_CDF2(20662) }, + { AOM_CDF2(15367) }, + { AOM_CDF2(16970) }, + { AOM_CDF2(14657) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22117) }, + { AOM_CDF2(22028) }, + { AOM_CDF2(18650) }, + { AOM_CDF2(16042) }, + { AOM_CDF2(15885) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(22409) }, + { AOM_CDF2(21012) }, + { AOM_CDF2(15650) }, + { AOM_CDF2(17395) }, + { AOM_CDF2(15469) }, + { AOM_CDF2(20205) }, + { AOM_CDF2(19511) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(24220) }, + { AOM_CDF2(22480) }, + { AOM_CDF2(17737) }, + { AOM_CDF2(18916) }, + { AOM_CDF2(19268) }, + { AOM_CDF2(18412) }, + { AOM_CDF2(18844) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(25991) }, + { AOM_CDF2(20314) }, + { AOM_CDF2(17731) }, + { AOM_CDF2(19678) }, + { AOM_CDF2(18649) }, + { AOM_CDF2(17307) }, + { AOM_CDF2(21798) }, + { AOM_CDF2(17549) }, + { AOM_CDF2(15630) }, + }, + { + { AOM_CDF2(26585) }, + { AOM_CDF2(21469) }, + { AOM_CDF2(20432) }, + { AOM_CDF2(17735) }, + { AOM_CDF2(19280) }, + { AOM_CDF2(15235) }, + { AOM_CDF2(20297) }, + { AOM_CDF2(22471) }, + { AOM_CDF2(28997) }, + } }, + { { + { AOM_CDF2(26605) }, + { AOM_CDF2(11304) }, + { AOM_CDF2(16726) }, + { AOM_CDF2(16560) }, + { AOM_CDF2(20866) }, + { AOM_CDF2(23524) }, + { AOM_CDF2(19878) }, + { AOM_CDF2(13469) }, + { AOM_CDF2(23084) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(18983) }, + { AOM_CDF2(20512) }, + { AOM_CDF2(14885) }, + { AOM_CDF2(16384) }, + { 
AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20090) }, + { AOM_CDF2(19444) }, + { AOM_CDF2(17286) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19139) }, + { AOM_CDF2(21487) }, + { AOM_CDF2(18959) }, + { AOM_CDF2(20910) }, + { AOM_CDF2(19089) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20536) }, + { AOM_CDF2(20664) }, + { AOM_CDF2(20625) }, + { AOM_CDF2(19123) }, + { AOM_CDF2(14862) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19833) }, + { AOM_CDF2(21502) }, + { AOM_CDF2(17485) }, + { AOM_CDF2(20267) }, + { AOM_CDF2(18353) }, + { AOM_CDF2(23329) }, + { AOM_CDF2(21478) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22041) }, + { AOM_CDF2(23434) }, + { AOM_CDF2(20001) }, + { AOM_CDF2(20554) }, + { AOM_CDF2(20951) }, + { AOM_CDF2(20145) }, + { AOM_CDF2(15562) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(23312) }, + { AOM_CDF2(21607) }, + { AOM_CDF2(16526) }, + { AOM_CDF2(18957) }, + { AOM_CDF2(18034) }, + { AOM_CDF2(18934) }, + { AOM_CDF2(24247) }, + { AOM_CDF2(16921) }, + { AOM_CDF2(17080) }, + }, + { + { AOM_CDF2(26579) }, + { AOM_CDF2(24910) }, + { AOM_CDF2(18637) }, + { AOM_CDF2(19800) }, + { AOM_CDF2(20388) }, + { AOM_CDF2(9887) }, + { AOM_CDF2(15642) }, + { AOM_CDF2(30198) }, + { AOM_CDF2(24721) }, + } }, + { { + { AOM_CDF2(26998) }, + { AOM_CDF2(16737) }, + { AOM_CDF2(17838) }, + { AOM_CDF2(18922) }, + { AOM_CDF2(19515) }, + { AOM_CDF2(18636) }, + { AOM_CDF2(17333) }, + { AOM_CDF2(15776) }, + { AOM_CDF2(22658) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) 
}, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(20177) }, + { AOM_CDF2(20789) }, + { AOM_CDF2(20262) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(21416) }, + { AOM_CDF2(20855) }, + { AOM_CDF2(23410) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20238) }, + { AOM_CDF2(21057) }, + { AOM_CDF2(19159) }, + { AOM_CDF2(22337) }, + { AOM_CDF2(20159) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20125) }, + { AOM_CDF2(20559) }, + { AOM_CDF2(21707) }, + { AOM_CDF2(22296) }, + { AOM_CDF2(17333) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19941) }, + { AOM_CDF2(20527) }, + { AOM_CDF2(21470) }, + { AOM_CDF2(22487) }, + { AOM_CDF2(19558) }, + { AOM_CDF2(22354) }, + { AOM_CDF2(20331) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22752) }, + { AOM_CDF2(25006) }, + { AOM_CDF2(22075) }, + { AOM_CDF2(21576) }, + { AOM_CDF2(17740) }, + { AOM_CDF2(21690) }, + { AOM_CDF2(19211) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(21442) }, + { AOM_CDF2(22358) }, + { AOM_CDF2(18503) }, + { AOM_CDF2(20291) }, + { AOM_CDF2(19945) }, + { AOM_CDF2(21294) }, + { AOM_CDF2(21178) }, + { AOM_CDF2(19400) }, + { AOM_CDF2(10556) }, + }, + { + { AOM_CDF2(24648) }, + { AOM_CDF2(24949) }, + { AOM_CDF2(20708) }, + { AOM_CDF2(23905) }, + { AOM_CDF2(20501) }, + { AOM_CDF2(9558) }, + { AOM_CDF2(9423) }, + { AOM_CDF2(30365) }, + { AOM_CDF2(19253) }, + } }, + { { + { AOM_CDF2(26064) }, + { AOM_CDF2(22098) }, + { AOM_CDF2(19613) }, + { AOM_CDF2(20525) }, + { AOM_CDF2(17595) }, + { AOM_CDF2(16618) }, + { AOM_CDF2(20497) }, + { 
AOM_CDF2(18989) }, + { AOM_CDF2(15513) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } } + }; + +static const aom_cdf_prob + av1_default_eob_multi16_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 5)] = { { { { AOM_CDF5(840, 1039, 1980, 4895) }, + { AOM_CDF5(370, 671, 1883, 4471) } }, + { { AOM_CDF5(3247, 4950, 9688, 14563) }, + { AOM_CDF5(1904, 3354, 7763, 14647) } } }, + { { { AOM_CDF5(2125, 2551, 5165, 8946) }, + { AOM_CDF5(513, 765, 1859, 6339) } }, + { { AOM_CDF5(7637, 9498, 14259, 19108) }, + { AOM_CDF5(2497, 4096, 8866, 16993) } } }, + { { { AOM_CDF5(4016, 4897, 8881, 14968) }, + { AOM_CDF5(716, 1105, 2646, 10056) } }, + { { AOM_CDF5(11139, 13270, 18241, 23566) }, + { AOM_CDF5(3192, 5032, 10297, 19755) } } }, + { { { AOM_CDF5(6708, 8958, 14746, 22133) }, + { AOM_CDF5(1222, 2074, 4783, 15410) } }, + { { AOM_CDF5(19575, 21766, 26044, 29709) }, + { AOM_CDF5(7297, 10767, 19273, 28194) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi32_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 6)] = { { { { AOM_CDF6(400, 520, 977, 2102, 6542) }, + { AOM_CDF6(210, 405, 1315, 3326, 7537) } }, + { { AOM_CDF6(2636, 4273, 7588, 11794, 20401) }, + { AOM_CDF6(1786, 3179, 6902, 11357, 19054) } } }, + { { { AOM_CDF6(989, 1249, 2019, 4151, 10785) }, + { AOM_CDF6(313, 441, 1099, 2917, 8562) } }, + { { AOM_CDF6(8394, 10352, 13932, 18855, 26014) }, + { AOM_CDF6(2578, 4124, 8181, 13670, 24234) } } }, + { { { AOM_CDF6(2515, 3003, 4452, 8162, 16041) }, + { AOM_CDF6(574, 821, 1836, 5089, 13128) } }, + { { AOM_CDF6(13468, 16303, 20361, 25105, 29281) }, + { AOM_CDF6(3542, 5502, 10415, 16760, 25644) } } }, + { { { AOM_CDF6(4617, 5709, 8446, 13584, 23135) }, + { AOM_CDF6(1156, 1702, 3675, 9274, 20539) } }, + { { AOM_CDF6(22086, 24282, 27010, 29770, 31743) }, + { AOM_CDF6(7699, 10897, 20891, 26926, 
31628) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi64_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 7)] = { { { { AOM_CDF7(329, 498, 1101, 1784, 3265, 7758) }, + { AOM_CDF7(335, 730, 1459, 5494, 8755, 12997) } }, + { { AOM_CDF7(3505, 5304, 10086, 13814, 17684, 23370) }, + { AOM_CDF7(1563, 2700, 4876, 10911, 14706, 22480) } } }, + { { { AOM_CDF7(1260, 1446, 2253, 3712, 6652, 13369) }, + { AOM_CDF7(401, 605, 1029, 2563, 5845, 12626) } }, + { { AOM_CDF7(8609, 10612, 14624, 18714, 22614, 29024) }, + { AOM_CDF7(1923, 3127, 5867, 9703, 14277, 27100) } } }, + { { { AOM_CDF7(2374, 2772, 4583, 7276, 12288, 19706) }, + { AOM_CDF7(497, 810, 1315, 3000, 7004, 15641) } }, + { { AOM_CDF7(15050, 17126, 21410, 24886, 28156, 30726) }, + { AOM_CDF7(4034, 6290, 10235, 14982, 21214, 28491) } } }, + { { { AOM_CDF7(6307, 7541, 12060, 16358, 22553, 27865) }, + { AOM_CDF7(1289, 2320, 3971, 7926, 14153, 24291) } }, + { { AOM_CDF7(24212, 25708, 28268, 30035, 31307, 32049) }, + { AOM_CDF7(8726, 12378, 19409, 26450, 30038, 32462) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi128_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 8)] = { + { { { AOM_CDF8(219, 482, 1140, 2091, 3680, 6028, 12586) }, + { AOM_CDF8(371, 699, 1254, 4830, 9479, 12562, 17497) } }, + { { AOM_CDF8(5245, 7456, 12880, 15852, 20033, 23932, 27608) }, + { AOM_CDF8(2054, 3472, 5869, 14232, 18242, 20590, 26752) } } }, + { { { AOM_CDF8(685, 933, 1488, 2714, 4766, 8562, 19254) }, + { AOM_CDF8(217, 352, 618, 2303, 5261, 9969, 17472) } }, + { { AOM_CDF8(8045, 11200, 15497, 19595, 23948, 27408, 30938) }, + { AOM_CDF8(2310, 4160, 7471, 14997, 17931, 20768, 30240) } } }, + { { { AOM_CDF8(1366, 1738, 2527, 5016, 9355, 15797, 24643) }, + { AOM_CDF8(354, 558, 944, 2760, 7287, 14037, 21779) } }, + { { AOM_CDF8(13627, 16246, 20173, 24429, 27948, 30415, 31863) }, + { AOM_CDF8(6275, 9889, 14769, 23164, 27988, 30493, 32272) } } }, + { { { AOM_CDF8(3472, 4885, 7489, 12481, 18517, 24536, 29635) }, + { 
AOM_CDF8(886, 1731, 3271, 8469, 15569, 22126, 28383) } }, + { { AOM_CDF8(24313, 26062, 28385, 30107, 31217, 31898, 32345) }, + { AOM_CDF8(9165, 13282, 21150, 30286, 31894, 32571, 32712) } } } + }; + +static const aom_cdf_prob + av1_default_eob_multi256_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 9)] = { + { { { AOM_CDF9(310, 584, 1887, 3589, 6168, 8611, 11352, 15652) }, + { AOM_CDF9(998, 1850, 2998, 5604, 17341, 19888, 22899, 25583) } }, + { { AOM_CDF9(2520, 3240, 5952, 8870, 12577, 17558, 19954, 24168) }, + { AOM_CDF9(2203, 4130, 7435, 10739, 20652, 23681, 25609, 27261) } } }, + { { { AOM_CDF9(1448, 2109, 4151, 6263, 9329, 13260, 17944, 23300) }, + { AOM_CDF9(399, 1019, 1749, 3038, 10444, 15546, 22739, 27294) } }, + { { AOM_CDF9(6402, 8148, 12623, 15072, 18728, 22847, 26447, 29377) }, + { AOM_CDF9(1674, 3252, 5734, 10159, 22397, 23802, 24821, 30940) } } }, + { { { AOM_CDF9(3089, 3920, 6038, 9460, 14266, 19881, 25766, 29176) }, + { AOM_CDF9(1084, 2358, 3488, 5122, 11483, 18103, 26023, 29799) } }, + { { AOM_CDF9(11514, 13794, 17480, 20754, 24361, 27378, 29492, 31277) }, + { AOM_CDF9(6571, 9610, 15516, 21826, 29092, 30829, 31842, + 32708) } } }, + { { { AOM_CDF9(5348, 7113, 11820, 15924, 22106, 26777, 30334, 31757) }, + { AOM_CDF9(2453, 4474, 6307, 8777, 16474, 22975, 29000, 31547) } }, + { { AOM_CDF9(23110, 24597, 27140, 28894, 30167, 30927, 31392, 32094) }, + { AOM_CDF9(9998, 17661, 25178, 28097, 31308, 32038, 32403, + 32695) } } } + }; + +static const aom_cdf_prob + av1_default_eob_multi512_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 10)] = { { { { AOM_CDF10(641, 983, 3707, 5430, 10234, 14958, 18788, + 23412, 26061) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(5095, 6446, 9996, 13354, 16017, 17986, 20919, + 26129, 29140) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(1230, 2278, 5035, 7776, 11871, 15346, 19590, + 24584, 28749) }, + { 
AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(7265, 9979, 15819, 19250, 21780, 23846, 26478, + 28396, 31811) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(2624, 3936, 6480, 9686, 13979, 17726, 23267, + 28410, 31078) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(12015, 14769, 19588, 22052, 24222, 25812, + 27300, 29219, 32114) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(5927, 7809, 10923, 14597, 19439, 24135, 28456, + 31142, 32060) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(21093, 23043, 25742, 27658, 29097, 29716, + 30073, 30820, 31956) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi1024_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 11)] = { { { { AOM_CDF11(393, 421, 751, 1623, 3160, 6352, 13345, 18047, + 22571, 25830) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(1865, 1988, 2930, 4242, 10533, 16538, 21354, + 27255, 28546, 31784) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(696, 948, 3145, 5702, 9706, 13217, 17851, + 21856, 25692, 28034) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(2672, 3591, 9330, 17084, 22725, 24284, 26527, + 28027, 28377, 30876) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(2784, 3831, 7041, 10521, 14847, 18844, 23155, + 26682, 29229, 31045) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(9577, 12466, 17739, 20750, 22061, 23215, 24601, + 25483, 25843, 32056) }, + { 
AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(6698, 8334, 11961, 15762, 20186, 23862, 27434, + 29326, 31082, 32050) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(20569, 22426, 25569, 26859, 28053, 28913, + 29486, 29724, 29807, 32570) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } } }; + +static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] + [CDF_SIZE(BR_CDF_SIZE)] = { + { { { { AOM_CDF4(14298, 20718, 24174) }, + { AOM_CDF4(12536, 19601, 23789) }, + { AOM_CDF4(8712, 15051, 19503) }, + { AOM_CDF4(6170, 11327, 15434) }, + { AOM_CDF4(4742, 8926, 12538) }, + { AOM_CDF4(3803, 7317, 10546) }, + { AOM_CDF4(1696, 3317, 4871) }, + { AOM_CDF4(14392, 19951, 22756) }, + { AOM_CDF4(15978, 23218, 26818) }, + { AOM_CDF4(12187, 19474, 23889) }, + { AOM_CDF4(9176, 15640, 20259) }, + { AOM_CDF4(7068, 12655, 17028) }, + { AOM_CDF4(5656, 10442, 14472) }, + { AOM_CDF4(2580, 4992, 7244) }, + { AOM_CDF4(12136, 18049, 21426) }, + { AOM_CDF4(13784, 20721, 24481) }, + { AOM_CDF4(10836, 17621, 21900) }, + { AOM_CDF4(8372, 14444, 18847) }, + { AOM_CDF4(6523, 11779, 16000) }, + { AOM_CDF4(5337, 9898, 13760) }, + { AOM_CDF4(3034, 5860, 8462) } }, + { { AOM_CDF4(15967, 22905, 26286) }, + { AOM_CDF4(13534, 20654, 24579) }, + { AOM_CDF4(9504, 16092, 20535) }, + { AOM_CDF4(6975, 12568, 16903) }, + { AOM_CDF4(5364, 10091, 14020) }, + { AOM_CDF4(4357, 8370, 11857) }, + { AOM_CDF4(2506, 4934, 7218) }, + { AOM_CDF4(23032, 28815, 30936) }, + { AOM_CDF4(19540, 26704, 29719) }, + { AOM_CDF4(15158, 22969, 27097) }, + { AOM_CDF4(11408, 18865, 23650) }, + { AOM_CDF4(8885, 15448, 20250) }, + { AOM_CDF4(7108, 12853, 17416) }, + { AOM_CDF4(4231, 8041, 11480) }, + { AOM_CDF4(19823, 26490, 29156) }, + { AOM_CDF4(18890, 25929, 28932) }, + { AOM_CDF4(15660, 23491, 27433) }, + { 
AOM_CDF4(12147, 19776, 24488) }, + { AOM_CDF4(9728, 16774, 21649) }, + { AOM_CDF4(7919, 14277, 19066) }, + { AOM_CDF4(5440, 10170, 14185) } } }, + { { { AOM_CDF4(14406, 20862, 24414) }, + { AOM_CDF4(11824, 18907, 23109) }, + { AOM_CDF4(8257, 14393, 18803) }, + { AOM_CDF4(5860, 10747, 14778) }, + { AOM_CDF4(4475, 8486, 11984) }, + { AOM_CDF4(3606, 6954, 10043) }, + { AOM_CDF4(1736, 3410, 5048) }, + { AOM_CDF4(14430, 20046, 22882) }, + { AOM_CDF4(15593, 22899, 26709) }, + { AOM_CDF4(12102, 19368, 23811) }, + { AOM_CDF4(9059, 15584, 20262) }, + { AOM_CDF4(6999, 12603, 17048) }, + { AOM_CDF4(5684, 10497, 14553) }, + { AOM_CDF4(2822, 5438, 7862) }, + { AOM_CDF4(15785, 21585, 24359) }, + { AOM_CDF4(18347, 25229, 28266) }, + { AOM_CDF4(14974, 22487, 26389) }, + { AOM_CDF4(11423, 18681, 23271) }, + { AOM_CDF4(8863, 15350, 20008) }, + { AOM_CDF4(7153, 12852, 17278) }, + { AOM_CDF4(3707, 7036, 9982) } }, + { { AOM_CDF4(15460, 21696, 25469) }, + { AOM_CDF4(12170, 19249, 23191) }, + { AOM_CDF4(8723, 15027, 19332) }, + { AOM_CDF4(6428, 11704, 15874) }, + { AOM_CDF4(4922, 9292, 13052) }, + { AOM_CDF4(4139, 7695, 11010) }, + { AOM_CDF4(2291, 4508, 6598) }, + { AOM_CDF4(19856, 26920, 29828) }, + { AOM_CDF4(17923, 25289, 28792) }, + { AOM_CDF4(14278, 21968, 26297) }, + { AOM_CDF4(10910, 18136, 22950) }, + { AOM_CDF4(8423, 14815, 19627) }, + { AOM_CDF4(6771, 12283, 16774) }, + { AOM_CDF4(4074, 7750, 11081) }, + { AOM_CDF4(19852, 26074, 28672) }, + { AOM_CDF4(19371, 26110, 28989) }, + { AOM_CDF4(16265, 23873, 27663) }, + { AOM_CDF4(12758, 20378, 24952) }, + { AOM_CDF4(10095, 17098, 21961) }, + { AOM_CDF4(8250, 14628, 19451) }, + { AOM_CDF4(5205, 9745, 13622) } } }, + { { { AOM_CDF4(10563, 16233, 19763) }, + { AOM_CDF4(9794, 16022, 19804) }, + { AOM_CDF4(6750, 11945, 15759) }, + { AOM_CDF4(4963, 9186, 12752) }, + { AOM_CDF4(3845, 7435, 10627) }, + { AOM_CDF4(3051, 6085, 8834) }, + { AOM_CDF4(1311, 2596, 3830) }, + { AOM_CDF4(11246, 16404, 19689) }, + { AOM_CDF4(12315, 18911, 22731) }, 
+ { AOM_CDF4(10557, 17095, 21289) }, + { AOM_CDF4(8136, 14006, 18249) }, + { AOM_CDF4(6348, 11474, 15565) }, + { AOM_CDF4(5196, 9655, 13400) }, + { AOM_CDF4(2349, 4526, 6587) }, + { AOM_CDF4(13337, 18730, 21569) }, + { AOM_CDF4(19306, 26071, 28882) }, + { AOM_CDF4(15952, 23540, 27254) }, + { AOM_CDF4(12409, 19934, 24430) }, + { AOM_CDF4(9760, 16706, 21389) }, + { AOM_CDF4(8004, 14220, 18818) }, + { AOM_CDF4(4138, 7794, 10961) } }, + { { AOM_CDF4(10870, 16684, 20949) }, + { AOM_CDF4(9664, 15230, 18680) }, + { AOM_CDF4(6886, 12109, 15408) }, + { AOM_CDF4(4825, 8900, 12305) }, + { AOM_CDF4(3630, 7162, 10314) }, + { AOM_CDF4(3036, 6429, 9387) }, + { AOM_CDF4(1671, 3296, 4940) }, + { AOM_CDF4(13819, 19159, 23026) }, + { AOM_CDF4(11984, 19108, 23120) }, + { AOM_CDF4(10690, 17210, 21663) }, + { AOM_CDF4(7984, 14154, 18333) }, + { AOM_CDF4(6868, 12294, 16124) }, + { AOM_CDF4(5274, 8994, 12868) }, + { AOM_CDF4(2988, 5771, 8424) }, + { AOM_CDF4(19736, 26647, 29141) }, + { AOM_CDF4(18933, 26070, 28984) }, + { AOM_CDF4(15779, 23048, 27200) }, + { AOM_CDF4(12638, 20061, 24532) }, + { AOM_CDF4(10692, 17545, 22220) }, + { AOM_CDF4(9217, 15251, 20054) }, + { AOM_CDF4(5078, 9284, 12594) } } }, + { { { AOM_CDF4(2331, 3662, 5244) }, + { AOM_CDF4(2891, 4771, 6145) }, + { AOM_CDF4(4598, 7623, 9729) }, + { AOM_CDF4(3520, 6845, 9199) }, + { AOM_CDF4(3417, 6119, 9324) }, + { AOM_CDF4(2601, 5412, 7385) }, + { AOM_CDF4(600, 1173, 1744) }, + { AOM_CDF4(7672, 13286, 17469) }, + { AOM_CDF4(4232, 7792, 10793) }, + { AOM_CDF4(2915, 5317, 7397) }, + { AOM_CDF4(2318, 4356, 6152) }, + { AOM_CDF4(2127, 4000, 5554) }, + { AOM_CDF4(1850, 3478, 5275) }, + { AOM_CDF4(977, 1933, 2843) }, + { AOM_CDF4(18280, 24387, 27989) }, + { AOM_CDF4(15852, 22671, 26185) }, + { AOM_CDF4(13845, 20951, 24789) }, + { AOM_CDF4(11055, 17966, 22129) }, + { AOM_CDF4(9138, 15422, 19801) }, + { AOM_CDF4(7454, 13145, 17456) }, + { AOM_CDF4(3370, 6393, 9013) } }, + { { AOM_CDF4(5842, 9229, 10838) }, + { AOM_CDF4(2313, 3491, 
4276) }, + { AOM_CDF4(2998, 6104, 7496) }, + { AOM_CDF4(2420, 7447, 9868) }, + { AOM_CDF4(3034, 8495, 10923) }, + { AOM_CDF4(4076, 8937, 10975) }, + { AOM_CDF4(1086, 2370, 3299) }, + { AOM_CDF4(9714, 17254, 20444) }, + { AOM_CDF4(8543, 13698, 17123) }, + { AOM_CDF4(4918, 9007, 11910) }, + { AOM_CDF4(4129, 7532, 10553) }, + { AOM_CDF4(2364, 5533, 8058) }, + { AOM_CDF4(1834, 3546, 5563) }, + { AOM_CDF4(1473, 2908, 4133) }, + { AOM_CDF4(15405, 21193, 25619) }, + { AOM_CDF4(15691, 21952, 26561) }, + { AOM_CDF4(12962, 19194, 24165) }, + { AOM_CDF4(10272, 17855, 22129) }, + { AOM_CDF4(8588, 15270, 20718) }, + { AOM_CDF4(8682, 14669, 19500) }, + { AOM_CDF4(4870, 9636, 13205) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(14995, 21341, 24749) }, + { AOM_CDF4(13158, 20289, 24601) }, + { AOM_CDF4(8941, 15326, 19876) }, + { AOM_CDF4(6297, 11541, 15807) }, + { AOM_CDF4(4817, 9029, 12776) }, + { AOM_CDF4(3731, 7273, 10627) }, + { AOM_CDF4(1847, 3617, 5354) }, + { AOM_CDF4(14472, 19659, 22343) }, + { AOM_CDF4(16806, 24162, 27533) }, + { AOM_CDF4(12900, 20404, 24713) }, + { AOM_CDF4(9411, 16112, 20797) }, + { AOM_CDF4(7056, 12697, 17148) }, + { AOM_CDF4(5544, 10339, 14460) }, + { AOM_CDF4(2954, 5704, 8319) }, + { AOM_CDF4(12464, 18071, 21354) }, + { AOM_CDF4(15482, 22528, 26034) }, + { AOM_CDF4(12070, 19269, 23624) }, + { AOM_CDF4(8953, 15406, 20106) }, + { AOM_CDF4(7027, 12730, 17220) }, + { AOM_CDF4(5887, 10913, 15140) }, + { AOM_CDF4(3793, 7278, 10447) } }, + { { AOM_CDF4(15571, 22232, 25749) }, + { AOM_CDF4(14506, 21575, 25374) }, + { AOM_CDF4(10189, 17089, 21569) }, + { AOM_CDF4(7316, 13301, 17915) }, + { AOM_CDF4(5783, 10912, 15190) }, + { AOM_CDF4(4760, 9155, 13088) }, + { AOM_CDF4(2993, 5966, 8774) }, + { AOM_CDF4(23424, 28903, 30778) }, + { AOM_CDF4(20775, 27666, 30290) }, + { AOM_CDF4(16474, 24410, 28299) }, + { AOM_CDF4(12471, 20180, 24987) }, + { AOM_CDF4(9410, 16487, 21439) }, + { AOM_CDF4(7536, 13614, 18529) }, + { AOM_CDF4(5048, 9586, 13549) }, + { AOM_CDF4(21090, 27290, 29756) }, + { AOM_CDF4(20796, 27402, 30026) }, + { AOM_CDF4(17819, 25485, 28969) }, + { AOM_CDF4(13860, 21909, 26462) }, + { AOM_CDF4(11002, 18494, 23529) }, + { AOM_CDF4(8953, 15929, 20897) }, + { AOM_CDF4(6448, 11918, 16454) } } }, + { { { AOM_CDF4(15999, 22208, 25449) }, + { AOM_CDF4(13050, 19988, 24122) }, + { AOM_CDF4(8594, 14864, 19378) }, + { AOM_CDF4(6033, 11079, 15238) }, + { AOM_CDF4(4554, 8683, 12347) }, + { AOM_CDF4(3672, 7139, 10337) }, + { AOM_CDF4(1900, 3771, 5576) }, + { AOM_CDF4(15788, 21340, 
23949) }, + { AOM_CDF4(16825, 24235, 27758) }, + { AOM_CDF4(12873, 20402, 24810) }, + { AOM_CDF4(9590, 16363, 21094) }, + { AOM_CDF4(7352, 13209, 17733) }, + { AOM_CDF4(5960, 10989, 15184) }, + { AOM_CDF4(3232, 6234, 9007) }, + { AOM_CDF4(15761, 20716, 23224) }, + { AOM_CDF4(19318, 25989, 28759) }, + { AOM_CDF4(15529, 23094, 26929) }, + { AOM_CDF4(11662, 18989, 23641) }, + { AOM_CDF4(8955, 15568, 20366) }, + { AOM_CDF4(7281, 13106, 17708) }, + { AOM_CDF4(4248, 8059, 11440) } }, + { { AOM_CDF4(14899, 21217, 24503) }, + { AOM_CDF4(13519, 20283, 24047) }, + { AOM_CDF4(9429, 15966, 20365) }, + { AOM_CDF4(6700, 12355, 16652) }, + { AOM_CDF4(5088, 9704, 13716) }, + { AOM_CDF4(4243, 8154, 11731) }, + { AOM_CDF4(2702, 5364, 7861) }, + { AOM_CDF4(22745, 28388, 30454) }, + { AOM_CDF4(20235, 27146, 29922) }, + { AOM_CDF4(15896, 23715, 27637) }, + { AOM_CDF4(11840, 19350, 24131) }, + { AOM_CDF4(9122, 15932, 20880) }, + { AOM_CDF4(7488, 13581, 18362) }, + { AOM_CDF4(5114, 9568, 13370) }, + { AOM_CDF4(20845, 26553, 28932) }, + { AOM_CDF4(20981, 27372, 29884) }, + { AOM_CDF4(17781, 25335, 28785) }, + { AOM_CDF4(13760, 21708, 26297) }, + { AOM_CDF4(10975, 18415, 23365) }, + { AOM_CDF4(9045, 15789, 20686) }, + { AOM_CDF4(6130, 11199, 15423) } } }, + { { { AOM_CDF4(13549, 19724, 23158) }, + { AOM_CDF4(11844, 18382, 22246) }, + { AOM_CDF4(7919, 13619, 17773) }, + { AOM_CDF4(5486, 10143, 13946) }, + { AOM_CDF4(4166, 7983, 11324) }, + { AOM_CDF4(3364, 6506, 9427) }, + { AOM_CDF4(1598, 3160, 4674) }, + { AOM_CDF4(15281, 20979, 23781) }, + { AOM_CDF4(14939, 22119, 25952) }, + { AOM_CDF4(11363, 18407, 22812) }, + { AOM_CDF4(8609, 14857, 19370) }, + { AOM_CDF4(6737, 12184, 16480) }, + { AOM_CDF4(5506, 10263, 14262) }, + { AOM_CDF4(2990, 5786, 8380) }, + { AOM_CDF4(20249, 25253, 27417) }, + { AOM_CDF4(21070, 27518, 30001) }, + { AOM_CDF4(16854, 24469, 28074) }, + { AOM_CDF4(12864, 20486, 25000) }, + { AOM_CDF4(9962, 16978, 21778) }, + { AOM_CDF4(8074, 14338, 19048) }, + { AOM_CDF4(4494, 
8479, 11906) } }, + { { AOM_CDF4(13960, 19617, 22829) }, + { AOM_CDF4(11150, 17341, 21228) }, + { AOM_CDF4(7150, 12964, 17190) }, + { AOM_CDF4(5331, 10002, 13867) }, + { AOM_CDF4(4167, 7744, 11057) }, + { AOM_CDF4(3480, 6629, 9646) }, + { AOM_CDF4(1883, 3784, 5686) }, + { AOM_CDF4(18752, 25660, 28912) }, + { AOM_CDF4(16968, 24586, 28030) }, + { AOM_CDF4(13520, 21055, 25313) }, + { AOM_CDF4(10453, 17626, 22280) }, + { AOM_CDF4(8386, 14505, 19116) }, + { AOM_CDF4(6742, 12595, 17008) }, + { AOM_CDF4(4273, 8140, 11499) }, + { AOM_CDF4(22120, 27827, 30233) }, + { AOM_CDF4(20563, 27358, 29895) }, + { AOM_CDF4(17076, 24644, 28153) }, + { AOM_CDF4(13362, 20942, 25309) }, + { AOM_CDF4(10794, 17965, 22695) }, + { AOM_CDF4(9014, 15652, 20319) }, + { AOM_CDF4(5708, 10512, 14497) } } }, + { { { AOM_CDF4(5705, 10930, 15725) }, + { AOM_CDF4(7946, 12765, 16115) }, + { AOM_CDF4(6801, 12123, 16226) }, + { AOM_CDF4(5462, 10135, 14200) }, + { AOM_CDF4(4189, 8011, 11507) }, + { AOM_CDF4(3191, 6229, 9408) }, + { AOM_CDF4(1057, 2137, 3212) }, + { AOM_CDF4(10018, 17067, 21491) }, + { AOM_CDF4(7380, 12582, 16453) }, + { AOM_CDF4(6068, 10845, 14339) }, + { AOM_CDF4(5098, 9198, 12555) }, + { AOM_CDF4(4312, 8010, 11119) }, + { AOM_CDF4(3700, 6966, 9781) }, + { AOM_CDF4(1693, 3326, 4887) }, + { AOM_CDF4(18757, 24930, 27774) }, + { AOM_CDF4(17648, 24596, 27817) }, + { AOM_CDF4(14707, 22052, 26026) }, + { AOM_CDF4(11720, 18852, 23292) }, + { AOM_CDF4(9357, 15952, 20525) }, + { AOM_CDF4(7810, 13753, 18210) }, + { AOM_CDF4(3879, 7333, 10328) } }, + { { AOM_CDF4(8278, 13242, 15922) }, + { AOM_CDF4(10547, 15867, 18919) }, + { AOM_CDF4(9106, 15842, 20609) }, + { AOM_CDF4(6833, 13007, 17218) }, + { AOM_CDF4(4811, 9712, 13923) }, + { AOM_CDF4(3985, 7352, 11128) }, + { AOM_CDF4(1688, 3458, 5262) }, + { AOM_CDF4(12951, 21861, 26510) }, + { AOM_CDF4(9788, 16044, 20276) }, + { AOM_CDF4(6309, 11244, 14870) }, + { AOM_CDF4(5183, 9349, 12566) }, + { AOM_CDF4(4389, 8229, 11492) }, + { AOM_CDF4(3633, 6945, 
10620) }, + { AOM_CDF4(3600, 6847, 9907) }, + { AOM_CDF4(21748, 28137, 30255) }, + { AOM_CDF4(19436, 26581, 29560) }, + { AOM_CDF4(16359, 24201, 27953) }, + { AOM_CDF4(13961, 21693, 25871) }, + { AOM_CDF4(11544, 18686, 23322) }, + { AOM_CDF4(9372, 16462, 20952) }, + { AOM_CDF4(6138, 11210, 15390) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(16138, 22223, 25509) }, + { AOM_CDF4(15347, 22430, 26332) }, + { AOM_CDF4(9614, 16736, 21332) }, + { AOM_CDF4(6600, 12275, 16907) }, + { AOM_CDF4(4811, 9424, 13547) }, + { AOM_CDF4(3748, 
7809, 11420) }, + { AOM_CDF4(2254, 4587, 6890) }, + { AOM_CDF4(15196, 20284, 23177) }, + { AOM_CDF4(18317, 25469, 28451) }, + { AOM_CDF4(13918, 21651, 25842) }, + { AOM_CDF4(10052, 17150, 21995) }, + { AOM_CDF4(7499, 13630, 18587) }, + { AOM_CDF4(6158, 11417, 16003) }, + { AOM_CDF4(4014, 7785, 11252) }, + { AOM_CDF4(15048, 21067, 24384) }, + { AOM_CDF4(18202, 25346, 28553) }, + { AOM_CDF4(14302, 22019, 26356) }, + { AOM_CDF4(10839, 18139, 23166) }, + { AOM_CDF4(8715, 15744, 20806) }, + { AOM_CDF4(7536, 13576, 18544) }, + { AOM_CDF4(5413, 10335, 14498) } }, + { { AOM_CDF4(17394, 24501, 27895) }, + { AOM_CDF4(15889, 23420, 27185) }, + { AOM_CDF4(11561, 19133, 23870) }, + { AOM_CDF4(8285, 14812, 19844) }, + { AOM_CDF4(6496, 12043, 16550) }, + { AOM_CDF4(4771, 9574, 13677) }, + { AOM_CDF4(3603, 6830, 10144) }, + { AOM_CDF4(21656, 27704, 30200) }, + { AOM_CDF4(21324, 27915, 30511) }, + { AOM_CDF4(17327, 25336, 28997) }, + { AOM_CDF4(13417, 21381, 26033) }, + { AOM_CDF4(10132, 17425, 22338) }, + { AOM_CDF4(8580, 15016, 19633) }, + { AOM_CDF4(5694, 11477, 16411) }, + { AOM_CDF4(24116, 29780, 31450) }, + { AOM_CDF4(23853, 29695, 31591) }, + { AOM_CDF4(20085, 27614, 30428) }, + { AOM_CDF4(15326, 24335, 28575) }, + { AOM_CDF4(11814, 19472, 24810) }, + { AOM_CDF4(10221, 18611, 24767) }, + { AOM_CDF4(7689, 14558, 20321) } } }, + { { { AOM_CDF4(16214, 22380, 25770) }, + { AOM_CDF4(14213, 21304, 25295) }, + { AOM_CDF4(9213, 15823, 20455) }, + { AOM_CDF4(6395, 11758, 16139) }, + { AOM_CDF4(4779, 9187, 13066) }, + { AOM_CDF4(3821, 7501, 10953) }, + { AOM_CDF4(2293, 4567, 6795) }, + { AOM_CDF4(15859, 21283, 23820) }, + { AOM_CDF4(18404, 25602, 28726) }, + { AOM_CDF4(14325, 21980, 26206) }, + { AOM_CDF4(10669, 17937, 22720) }, + { AOM_CDF4(8297, 14642, 19447) }, + { AOM_CDF4(6746, 12389, 16893) }, + { AOM_CDF4(4324, 8251, 11770) }, + { AOM_CDF4(16532, 21631, 24475) }, + { AOM_CDF4(20667, 27150, 29668) }, + { AOM_CDF4(16728, 24510, 28175) }, + { AOM_CDF4(12861, 20645, 25332) }, + { 
AOM_CDF4(10076, 17361, 22417) }, + { AOM_CDF4(8395, 14940, 19963) }, + { AOM_CDF4(5731, 10683, 14912) } }, + { { AOM_CDF4(14433, 21155, 24938) }, + { AOM_CDF4(14658, 21716, 25545) }, + { AOM_CDF4(9923, 16824, 21557) }, + { AOM_CDF4(6982, 13052, 17721) }, + { AOM_CDF4(5419, 10503, 15050) }, + { AOM_CDF4(4852, 9162, 13014) }, + { AOM_CDF4(3271, 6395, 9630) }, + { AOM_CDF4(22210, 27833, 30109) }, + { AOM_CDF4(20750, 27368, 29821) }, + { AOM_CDF4(16894, 24828, 28573) }, + { AOM_CDF4(13247, 21276, 25757) }, + { AOM_CDF4(10038, 17265, 22563) }, + { AOM_CDF4(8587, 14947, 20327) }, + { AOM_CDF4(5645, 11371, 15252) }, + { AOM_CDF4(22027, 27526, 29714) }, + { AOM_CDF4(23098, 29146, 31221) }, + { AOM_CDF4(19886, 27341, 30272) }, + { AOM_CDF4(15609, 23747, 28046) }, + { AOM_CDF4(11993, 20065, 24939) }, + { AOM_CDF4(9637, 18267, 23671) }, + { AOM_CDF4(7625, 13801, 19144) } } }, + { { { AOM_CDF4(14438, 20798, 24089) }, + { AOM_CDF4(12621, 19203, 23097) }, + { AOM_CDF4(8177, 14125, 18402) }, + { AOM_CDF4(5674, 10501, 14456) }, + { AOM_CDF4(4236, 8239, 11733) }, + { AOM_CDF4(3447, 6750, 9806) }, + { AOM_CDF4(1986, 3950, 5864) }, + { AOM_CDF4(16208, 22099, 24930) }, + { AOM_CDF4(16537, 24025, 27585) }, + { AOM_CDF4(12780, 20381, 24867) }, + { AOM_CDF4(9767, 16612, 21416) }, + { AOM_CDF4(7686, 13738, 18398) }, + { AOM_CDF4(6333, 11614, 15964) }, + { AOM_CDF4(3941, 7571, 10836) }, + { AOM_CDF4(22819, 27422, 29202) }, + { AOM_CDF4(22224, 28514, 30721) }, + { AOM_CDF4(17660, 25433, 28913) }, + { AOM_CDF4(13574, 21482, 26002) }, + { AOM_CDF4(10629, 17977, 22938) }, + { AOM_CDF4(8612, 15298, 20265) }, + { AOM_CDF4(5607, 10491, 14596) } }, + { { AOM_CDF4(13569, 19800, 23206) }, + { AOM_CDF4(13128, 19924, 23869) }, + { AOM_CDF4(8329, 14841, 19403) }, + { AOM_CDF4(6130, 10976, 15057) }, + { AOM_CDF4(4682, 8839, 12518) }, + { AOM_CDF4(3656, 7409, 10588) }, + { AOM_CDF4(2577, 5099, 7412) }, + { AOM_CDF4(22427, 28684, 30585) }, + { AOM_CDF4(20913, 27750, 30139) }, + { AOM_CDF4(15840, 24109, 
27834) }, + { AOM_CDF4(12308, 20029, 24569) }, + { AOM_CDF4(10216, 16785, 21458) }, + { AOM_CDF4(8309, 14203, 19113) }, + { AOM_CDF4(6043, 11168, 15307) }, + { AOM_CDF4(23166, 28901, 30998) }, + { AOM_CDF4(21899, 28405, 30751) }, + { AOM_CDF4(18413, 26091, 29443) }, + { AOM_CDF4(15233, 23114, 27352) }, + { AOM_CDF4(12683, 20472, 25288) }, + { AOM_CDF4(10702, 18259, 23409) }, + { AOM_CDF4(8125, 14464, 19226) } } }, + { { { AOM_CDF4(9040, 14786, 18360) }, + { AOM_CDF4(9979, 15718, 19415) }, + { AOM_CDF4(7913, 13918, 18311) }, + { AOM_CDF4(5859, 10889, 15184) }, + { AOM_CDF4(4593, 8677, 12510) }, + { AOM_CDF4(3820, 7396, 10791) }, + { AOM_CDF4(1730, 3471, 5192) }, + { AOM_CDF4(11803, 18365, 22709) }, + { AOM_CDF4(11419, 18058, 22225) }, + { AOM_CDF4(9418, 15774, 20243) }, + { AOM_CDF4(7539, 13325, 17657) }, + { AOM_CDF4(6233, 11317, 15384) }, + { AOM_CDF4(5137, 9656, 13545) }, + { AOM_CDF4(2977, 5774, 8349) }, + { AOM_CDF4(21207, 27246, 29640) }, + { AOM_CDF4(19547, 26578, 29497) }, + { AOM_CDF4(16169, 23871, 27690) }, + { AOM_CDF4(12820, 20458, 25018) }, + { AOM_CDF4(10224, 17332, 22214) }, + { AOM_CDF4(8526, 15048, 19884) }, + { AOM_CDF4(5037, 9410, 13118) } }, + { { AOM_CDF4(12339, 17329, 20140) }, + { AOM_CDF4(13505, 19895, 23225) }, + { AOM_CDF4(9847, 16944, 21564) }, + { AOM_CDF4(7280, 13256, 18348) }, + { AOM_CDF4(4712, 10009, 14454) }, + { AOM_CDF4(4361, 7914, 12477) }, + { AOM_CDF4(2870, 5628, 7995) }, + { AOM_CDF4(20061, 25504, 28526) }, + { AOM_CDF4(15235, 22878, 26145) }, + { AOM_CDF4(12985, 19958, 24155) }, + { AOM_CDF4(9782, 16641, 21403) }, + { AOM_CDF4(9456, 16360, 20760) }, + { AOM_CDF4(6855, 12940, 18557) }, + { AOM_CDF4(5661, 10564, 15002) }, + { AOM_CDF4(25656, 30602, 31894) }, + { AOM_CDF4(22570, 29107, 31092) }, + { AOM_CDF4(18917, 26423, 29541) }, + { AOM_CDF4(15940, 23649, 27754) }, + { AOM_CDF4(12803, 20581, 25219) }, + { AOM_CDF4(11082, 18695, 23376) }, + { AOM_CDF4(7939, 14373, 19005) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(18315, 24289, 27551) }, + { AOM_CDF4(16854, 24068, 27835) }, + { AOM_CDF4(10140, 17927, 23173) }, + { AOM_CDF4(6722, 12982, 18267) }, + { AOM_CDF4(4661, 9826, 14706) }, + { AOM_CDF4(3832, 8165, 12294) }, + { AOM_CDF4(2795, 6098, 9245) }, + { AOM_CDF4(17145, 23326, 26672) }, + { AOM_CDF4(20733, 27680, 30308) }, + { AOM_CDF4(16032, 24461, 28546) }, + { AOM_CDF4(11653, 20093, 25081) }, + { AOM_CDF4(9290, 16429, 22086) }, + { AOM_CDF4(7796, 14598, 19982) }, + { AOM_CDF4(6502, 12378, 17441) }, + { AOM_CDF4(21681, 27732, 30320) }, + { 
AOM_CDF4(22389, 29044, 31261) }, + { AOM_CDF4(19027, 26731, 30087) }, + { AOM_CDF4(14739, 23755, 28624) }, + { AOM_CDF4(11358, 20778, 25511) }, + { AOM_CDF4(10995, 18073, 24190) }, + { AOM_CDF4(9162, 14990, 20617) } }, + { { AOM_CDF4(21425, 27952, 30388) }, + { AOM_CDF4(18062, 25838, 29034) }, + { AOM_CDF4(11956, 19881, 24808) }, + { AOM_CDF4(7718, 15000, 20980) }, + { AOM_CDF4(5702, 11254, 16143) }, + { AOM_CDF4(4898, 9088, 16864) }, + { AOM_CDF4(3679, 6776, 11907) }, + { AOM_CDF4(23294, 30160, 31663) }, + { AOM_CDF4(24397, 29896, 31836) }, + { AOM_CDF4(19245, 27128, 30593) }, + { AOM_CDF4(13202, 19825, 26404) }, + { AOM_CDF4(11578, 19297, 23957) }, + { AOM_CDF4(8073, 13297, 21370) }, + { AOM_CDF4(5461, 10923, 19745) }, + { AOM_CDF4(27367, 30521, 31934) }, + { AOM_CDF4(24904, 30671, 31940) }, + { AOM_CDF4(23075, 28460, 31299) }, + { AOM_CDF4(14400, 23658, 30417) }, + { AOM_CDF4(13885, 23882, 28325) }, + { AOM_CDF4(14746, 22938, 27853) }, + { AOM_CDF4(5461, 16384, 27307) } } }, + { { { AOM_CDF4(18274, 24813, 27890) }, + { AOM_CDF4(15537, 23149, 27003) }, + { AOM_CDF4(9449, 16740, 21827) }, + { AOM_CDF4(6700, 12498, 17261) }, + { AOM_CDF4(4988, 9866, 14198) }, + { AOM_CDF4(4236, 8147, 11902) }, + { AOM_CDF4(2867, 5860, 8654) }, + { AOM_CDF4(17124, 23171, 26101) }, + { AOM_CDF4(20396, 27477, 30148) }, + { AOM_CDF4(16573, 24629, 28492) }, + { AOM_CDF4(12749, 20846, 25674) }, + { AOM_CDF4(10233, 17878, 22818) }, + { AOM_CDF4(8525, 15332, 20363) }, + { AOM_CDF4(6283, 11632, 16255) }, + { AOM_CDF4(20466, 26511, 29286) }, + { AOM_CDF4(23059, 29174, 31191) }, + { AOM_CDF4(19481, 27263, 30241) }, + { AOM_CDF4(15458, 23631, 28137) }, + { AOM_CDF4(12416, 20608, 25693) }, + { AOM_CDF4(10261, 18011, 23261) }, + { AOM_CDF4(8016, 14655, 19666) } }, + { { AOM_CDF4(17616, 24586, 28112) }, + { AOM_CDF4(15809, 23299, 27155) }, + { AOM_CDF4(10767, 18890, 23793) }, + { AOM_CDF4(7727, 14255, 18865) }, + { AOM_CDF4(6129, 11926, 16882) }, + { AOM_CDF4(4482, 9704, 14861) }, + { 
AOM_CDF4(3277, 7452, 11522) }, + { AOM_CDF4(22956, 28551, 30730) }, + { AOM_CDF4(22724, 28937, 30961) }, + { AOM_CDF4(18467, 26324, 29580) }, + { AOM_CDF4(13234, 20713, 25649) }, + { AOM_CDF4(11181, 17592, 22481) }, + { AOM_CDF4(8291, 18358, 24576) }, + { AOM_CDF4(7568, 11881, 14984) }, + { AOM_CDF4(24948, 29001, 31147) }, + { AOM_CDF4(25674, 30619, 32151) }, + { AOM_CDF4(20841, 26793, 29603) }, + { AOM_CDF4(14669, 24356, 28666) }, + { AOM_CDF4(11334, 23593, 28219) }, + { AOM_CDF4(8922, 14762, 22873) }, + { AOM_CDF4(8301, 13544, 20535) } } }, + { { { AOM_CDF4(17113, 23733, 27081) }, + { AOM_CDF4(14139, 21406, 25452) }, + { AOM_CDF4(8552, 15002, 19776) }, + { AOM_CDF4(5871, 11120, 15378) }, + { AOM_CDF4(4455, 8616, 12253) }, + { AOM_CDF4(3469, 6910, 10386) }, + { AOM_CDF4(2255, 4553, 6782) }, + { AOM_CDF4(18224, 24376, 27053) }, + { AOM_CDF4(19290, 26710, 29614) }, + { AOM_CDF4(14936, 22991, 27184) }, + { AOM_CDF4(11238, 18951, 23762) }, + { AOM_CDF4(8786, 15617, 20588) }, + { AOM_CDF4(7317, 13228, 18003) }, + { AOM_CDF4(5101, 9512, 13493) }, + { AOM_CDF4(22639, 28222, 30210) }, + { AOM_CDF4(23216, 29331, 31307) }, + { AOM_CDF4(19075, 26762, 29895) }, + { AOM_CDF4(15014, 23113, 27457) }, + { AOM_CDF4(11938, 19857, 24752) }, + { AOM_CDF4(9942, 17280, 22282) }, + { AOM_CDF4(7167, 13144, 17752) } }, + { { AOM_CDF4(15820, 22738, 26488) }, + { AOM_CDF4(13530, 20885, 25216) }, + { AOM_CDF4(8395, 15530, 20452) }, + { AOM_CDF4(6574, 12321, 16380) }, + { AOM_CDF4(5353, 10419, 14568) }, + { AOM_CDF4(4613, 8446, 12381) }, + { AOM_CDF4(3440, 7158, 9903) }, + { AOM_CDF4(24247, 29051, 31224) }, + { AOM_CDF4(22118, 28058, 30369) }, + { AOM_CDF4(16498, 24768, 28389) }, + { AOM_CDF4(12920, 21175, 26137) }, + { AOM_CDF4(10730, 18619, 25352) }, + { AOM_CDF4(10187, 16279, 22791) }, + { AOM_CDF4(9310, 14631, 22127) }, + { AOM_CDF4(24970, 30558, 32057) }, + { AOM_CDF4(24801, 29942, 31698) }, + { AOM_CDF4(22432, 28453, 30855) }, + { AOM_CDF4(19054, 25680, 29580) }, + { AOM_CDF4(14392, 
23036, 28109) }, + { AOM_CDF4(12495, 20947, 26650) }, + { AOM_CDF4(12442, 20326, 26214) } } }, + { { { AOM_CDF4(12162, 18785, 22648) }, + { AOM_CDF4(12749, 19697, 23806) }, + { AOM_CDF4(8580, 15297, 20346) }, + { AOM_CDF4(6169, 11749, 16543) }, + { AOM_CDF4(4836, 9391, 13448) }, + { AOM_CDF4(3821, 7711, 11613) }, + { AOM_CDF4(2228, 4601, 7070) }, + { AOM_CDF4(16319, 24725, 28280) }, + { AOM_CDF4(15698, 23277, 27168) }, + { AOM_CDF4(12726, 20368, 25047) }, + { AOM_CDF4(9912, 17015, 21976) }, + { AOM_CDF4(7888, 14220, 19179) }, + { AOM_CDF4(6777, 12284, 17018) }, + { AOM_CDF4(4492, 8590, 12252) }, + { AOM_CDF4(23249, 28904, 30947) }, + { AOM_CDF4(21050, 27908, 30512) }, + { AOM_CDF4(17440, 25340, 28949) }, + { AOM_CDF4(14059, 22018, 26541) }, + { AOM_CDF4(11288, 18903, 23898) }, + { AOM_CDF4(9411, 16342, 21428) }, + { AOM_CDF4(6278, 11588, 15944) } }, + { { AOM_CDF4(13981, 20067, 23226) }, + { AOM_CDF4(16922, 23580, 26783) }, + { AOM_CDF4(11005, 19039, 24487) }, + { AOM_CDF4(7389, 14218, 19798) }, + { AOM_CDF4(5598, 11505, 17206) }, + { AOM_CDF4(6090, 11213, 15659) }, + { AOM_CDF4(3820, 7371, 10119) }, + { AOM_CDF4(21082, 26925, 29675) }, + { AOM_CDF4(21262, 28627, 31128) }, + { AOM_CDF4(18392, 26454, 30437) }, + { AOM_CDF4(14870, 22910, 27096) }, + { AOM_CDF4(12620, 19484, 24908) }, + { AOM_CDF4(9290, 16553, 22802) }, + { AOM_CDF4(6668, 14288, 20004) }, + { AOM_CDF4(27704, 31055, 31949) }, + { AOM_CDF4(24709, 29978, 31788) }, + { AOM_CDF4(21668, 29264, 31657) }, + { AOM_CDF4(18295, 26968, 30074) }, + { AOM_CDF4(16399, 24422, 29313) }, + { AOM_CDF4(14347, 23026, 28104) }, + { AOM_CDF4(12370, 19806, 24477) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } + }; + +static const aom_cdf_prob av1_default_coeff_base_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] + [CDF_SIZE(NUM_BASE_LEVELS + + 2)] = { { { { { AOM_CDF4(4034, 8930, 12727) }, + { AOM_CDF4(18082, 29741, 31877) }, + { AOM_CDF4(12596, 26124, 30493) }, + { AOM_CDF4(9446, 21118, 27005) }, + { AOM_CDF4(6308, 15141, 21279) }, + { AOM_CDF4(2463, 6357, 9783) }, + { AOM_CDF4(20667, 30546, 31929) }, + { AOM_CDF4(13043, 26123, 30134) }, + { AOM_CDF4(8151, 18757, 24778) }, + { AOM_CDF4(5255, 12839, 18632) }, + { AOM_CDF4(2820, 7206, 11161) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(15736, 27553, 30604) }, + { AOM_CDF4(11210, 23794, 28787) }, + { AOM_CDF4(5947, 13874, 19701) }, + { AOM_CDF4(4215, 9323, 13891) }, + { AOM_CDF4(2833, 6462, 10059) }, + { AOM_CDF4(19605, 30393, 31582) }, + { AOM_CDF4(13523, 26252, 30248) }, + { AOM_CDF4(8446, 18622, 24512) }, + { AOM_CDF4(3818, 10343, 15974) }, + { AOM_CDF4(1481, 4117, 6796) }, + { AOM_CDF4(22649, 31302, 32190) }, + { AOM_CDF4(14829, 27127, 30449) }, + { AOM_CDF4(8313, 17702, 23304) }, + { AOM_CDF4(3022, 8301, 12786) }, + { AOM_CDF4(1536, 4412, 7184) }, + { AOM_CDF4(22354, 29774, 31372) }, + { AOM_CDF4(14723, 25472, 29214) }, + { AOM_CDF4(6673, 13745, 18662) }, + { AOM_CDF4(2068, 5766, 9322) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6302, 16444, 21761) }, + { AOM_CDF4(23040, 31538, 32475) }, + { AOM_CDF4(15196, 28452, 31496) }, + { AOM_CDF4(10020, 22946, 28514) }, + { AOM_CDF4(6533, 16862, 23501) }, + { AOM_CDF4(3538, 9816, 15076) }, + { AOM_CDF4(24444, 31875, 32525) }, + { AOM_CDF4(15881, 28924, 31635) }, + { AOM_CDF4(9922, 22873, 28466) }, + { AOM_CDF4(6527, 16966, 23691) }, + { AOM_CDF4(4114, 11303, 17220) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20201, 30770, 32209) }, + { AOM_CDF4(14754, 28071, 31258) }, + { AOM_CDF4(8378, 20186, 26517) }, + { AOM_CDF4(5916, 15299, 21978) }, + { AOM_CDF4(4268, 11583, 17901) }, + { AOM_CDF4(24361, 32025, 32581) }, + { AOM_CDF4(18673, 30105, 31943) }, + { AOM_CDF4(10196, 22244, 27576) }, + { AOM_CDF4(5495, 14349, 20417) }, + { AOM_CDF4(2676, 7415, 11498) }, + { AOM_CDF4(24678, 31958, 32585) }, + { 
AOM_CDF4(18629, 29906, 31831) }, + { AOM_CDF4(9364, 20724, 26315) }, + { AOM_CDF4(4641, 12318, 18094) }, + { AOM_CDF4(2758, 7387, 11579) }, + { AOM_CDF4(25433, 31842, 32469) }, + { AOM_CDF4(18795, 29289, 31411) }, + { AOM_CDF4(7644, 17584, 23592) }, + { AOM_CDF4(3408, 9014, 15047) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4536, 10072, 14001) }, + { AOM_CDF4(25459, 31416, 32206) }, + { AOM_CDF4(16605, 28048, 30818) }, + { AOM_CDF4(11008, 22857, 27719) }, + { AOM_CDF4(6915, 16268, 22315) }, + { AOM_CDF4(2625, 6812, 10537) }, + { AOM_CDF4(24257, 31788, 32499) }, + { AOM_CDF4(16880, 29454, 31879) }, + { AOM_CDF4(11958, 25054, 29778) }, + { AOM_CDF4(7916, 18718, 25084) }, + { AOM_CDF4(3383, 8777, 13446) }, + { AOM_CDF4(22720, 31603, 32393) }, + { AOM_CDF4(14960, 28125, 31335) }, + { AOM_CDF4(9731, 22210, 27928) }, + { AOM_CDF4(6304, 15832, 22277) }, + { AOM_CDF4(2910, 7818, 12166) }, + { AOM_CDF4(20375, 30627, 32131) }, + { AOM_CDF4(13904, 27284, 30887) }, + { AOM_CDF4(9368, 21558, 27144) }, + { AOM_CDF4(5937, 14966, 21119) }, + { AOM_CDF4(2667, 7225, 11319) }, + { AOM_CDF4(23970, 31470, 32378) }, + { AOM_CDF4(17173, 29734, 32018) }, + { AOM_CDF4(12795, 25441, 29965) }, + { AOM_CDF4(8981, 19680, 25893) }, + { AOM_CDF4(4728, 11372, 16902) }, + { AOM_CDF4(24287, 31797, 32439) }, + { AOM_CDF4(16703, 29145, 31696) }, + { AOM_CDF4(10833, 23554, 28725) }, + { AOM_CDF4(6468, 16566, 23057) }, + { AOM_CDF4(2415, 6562, 10278) }, + { AOM_CDF4(26610, 32395, 32659) }, + { AOM_CDF4(18590, 30498, 32117) }, + { AOM_CDF4(12420, 25756, 29950) }, + { AOM_CDF4(7639, 18746, 24710) }, + { AOM_CDF4(3001, 8086, 12347) }, + { AOM_CDF4(25076, 32064, 32580) }, + { AOM_CDF4(17946, 30128, 32028) }, + { AOM_CDF4(12024, 24985, 29378) }, + { AOM_CDF4(7517, 18390, 24304) }, + { AOM_CDF4(3243, 8781, 13331) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6037, 16771, 21957) }, + { AOM_CDF4(24774, 31704, 32426) }, + { AOM_CDF4(16830, 28589, 
31056) }, + { AOM_CDF4(10602, 22828, 27760) }, + { AOM_CDF4(6733, 16829, 23071) }, + { AOM_CDF4(3250, 8914, 13556) }, + { AOM_CDF4(25582, 32220, 32668) }, + { AOM_CDF4(18659, 30342, 32223) }, + { AOM_CDF4(12546, 26149, 30515) }, + { AOM_CDF4(8420, 20451, 26801) }, + { AOM_CDF4(4636, 12420, 18344) }, + { AOM_CDF4(27581, 32362, 32639) }, + { AOM_CDF4(18987, 30083, 31978) }, + { AOM_CDF4(11327, 24248, 29084) }, + { AOM_CDF4(7264, 17719, 24120) }, + { AOM_CDF4(3995, 10768, 16169) }, + { AOM_CDF4(25893, 31831, 32487) }, + { AOM_CDF4(16577, 28587, 31379) }, + { AOM_CDF4(10189, 22748, 28182) }, + { AOM_CDF4(6832, 17094, 23556) }, + { AOM_CDF4(3708, 10110, 15334) }, + { AOM_CDF4(25904, 32282, 32656) }, + { AOM_CDF4(19721, 30792, 32276) }, + { AOM_CDF4(12819, 26243, 30411) }, + { AOM_CDF4(8572, 20614, 26891) }, + { AOM_CDF4(5364, 14059, 20467) }, + { AOM_CDF4(26580, 32438, 32677) }, + { AOM_CDF4(20852, 31225, 32340) }, + { AOM_CDF4(12435, 25700, 29967) }, + { AOM_CDF4(8691, 20825, 26976) }, + { AOM_CDF4(4446, 12209, 17269) }, + { AOM_CDF4(27350, 32429, 32696) }, + { AOM_CDF4(21372, 30977, 32272) }, + { AOM_CDF4(12673, 25270, 29853) }, + { AOM_CDF4(9208, 20925, 26640) }, + { AOM_CDF4(5018, 13351, 18732) }, + { AOM_CDF4(27351, 32479, 32713) }, + { AOM_CDF4(21398, 31209, 32387) }, + { AOM_CDF4(12162, 25047, 29842) }, + { AOM_CDF4(7896, 18691, 25319) }, + { AOM_CDF4(4670, 12882, 18881) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5487, 10460, 13708) }, + { AOM_CDF4(21597, 28303, 30674) }, + { AOM_CDF4(11037, 21953, 26476) }, + { AOM_CDF4(8147, 17962, 22952) }, + { AOM_CDF4(5242, 13061, 18532) }, + { AOM_CDF4(1889, 5208, 8182) }, + { AOM_CDF4(26774, 32133, 32590) }, + { AOM_CDF4(17844, 29564, 31767) }, + { AOM_CDF4(11690, 24438, 29171) }, + { AOM_CDF4(7542, 18215, 24459) }, + { AOM_CDF4(2993, 8050, 12319) }, + { AOM_CDF4(28023, 32328, 32591) }, + { AOM_CDF4(18651, 30126, 31954) }, + { AOM_CDF4(12164, 25146, 29589) }, + { AOM_CDF4(7762, 18530, 24771) }, + { 
AOM_CDF4(3492, 9183, 13920) }, + { AOM_CDF4(27591, 32008, 32491) }, + { AOM_CDF4(17149, 28853, 31510) }, + { AOM_CDF4(11485, 24003, 28860) }, + { AOM_CDF4(7697, 18086, 24210) }, + { AOM_CDF4(3075, 7999, 12218) }, + { AOM_CDF4(28268, 32482, 32654) }, + { AOM_CDF4(19631, 31051, 32404) }, + { AOM_CDF4(13860, 27260, 31020) }, + { AOM_CDF4(9605, 21613, 27594) }, + { AOM_CDF4(4876, 12162, 17908) }, + { AOM_CDF4(27248, 32316, 32576) }, + { AOM_CDF4(18955, 30457, 32075) }, + { AOM_CDF4(11824, 23997, 28795) }, + { AOM_CDF4(7346, 18196, 24647) }, + { AOM_CDF4(3403, 9247, 14111) }, + { AOM_CDF4(29711, 32655, 32735) }, + { AOM_CDF4(21169, 31394, 32417) }, + { AOM_CDF4(13487, 27198, 30957) }, + { AOM_CDF4(8828, 21683, 27614) }, + { AOM_CDF4(4270, 11451, 17038) }, + { AOM_CDF4(28708, 32578, 32731) }, + { AOM_CDF4(20120, 31241, 32482) }, + { AOM_CDF4(13692, 27550, 31321) }, + { AOM_CDF4(9418, 22514, 28439) }, + { AOM_CDF4(4999, 13283, 19462) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5673, 14302, 19711) }, + { AOM_CDF4(26251, 30701, 31834) }, + { AOM_CDF4(12782, 23783, 27803) }, + { AOM_CDF4(9127, 20657, 25808) }, + { AOM_CDF4(6368, 16208, 21462) }, + { AOM_CDF4(2465, 7177, 10822) }, + { AOM_CDF4(29961, 32563, 32719) }, + { AOM_CDF4(18318, 29891, 31949) }, + { AOM_CDF4(11361, 24514, 29357) }, + { AOM_CDF4(7900, 19603, 25607) }, + { AOM_CDF4(4002, 10590, 15546) }, + { AOM_CDF4(29637, 32310, 32595) }, + { AOM_CDF4(18296, 29913, 31809) }, + { AOM_CDF4(10144, 21515, 26871) }, + { AOM_CDF4(5358, 14322, 20394) }, + { AOM_CDF4(3067, 8362, 13346) }, + { AOM_CDF4(28652, 32470, 32676) }, + { AOM_CDF4(17538, 30771, 32209) }, + { AOM_CDF4(13924, 26882, 30494) }, + { AOM_CDF4(10496, 22837, 27869) }, + { AOM_CDF4(7236, 16396, 21621) }, + { AOM_CDF4(30743, 32687, 32746) }, + { AOM_CDF4(23006, 31676, 32489) }, + { AOM_CDF4(14494, 27828, 31120) }, + { AOM_CDF4(10174, 22801, 28352) }, + { AOM_CDF4(6242, 15281, 21043) }, + { AOM_CDF4(25817, 32243, 32720) }, + { AOM_CDF4(18618, 31367, 
32325) }, + { AOM_CDF4(13997, 28318, 31878) }, + { AOM_CDF4(12255, 26534, 31383) }, + { AOM_CDF4(9561, 21588, 28450) }, + { AOM_CDF4(28188, 32635, 32724) }, + { AOM_CDF4(22060, 32365, 32728) }, + { AOM_CDF4(18102, 30690, 32528) }, + { AOM_CDF4(14196, 28864, 31999) }, + { AOM_CDF4(12262, 25792, 30865) }, + { AOM_CDF4(24176, 32109, 32628) }, + { AOM_CDF4(18280, 29681, 31963) }, + { AOM_CDF4(10205, 23703, 29664) }, + { AOM_CDF4(7889, 20025, 27676) }, + { AOM_CDF4(6060, 16743, 23970) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5141, 7096, 8260) }, + { AOM_CDF4(27186, 29022, 29789) }, + { AOM_CDF4(6668, 12568, 15682) }, + { AOM_CDF4(2172, 6181, 8638) }, + { AOM_CDF4(1126, 3379, 4531) }, + { AOM_CDF4(443, 1361, 2254) }, + { AOM_CDF4(26083, 31153, 32436) }, + { AOM_CDF4(13486, 24603, 28483) }, + { AOM_CDF4(6508, 14840, 19910) }, + { AOM_CDF4(3386, 8800, 13286) }, + { AOM_CDF4(1530, 4322, 7054) }, + { AOM_CDF4(29639, 32080, 32548) }, + { AOM_CDF4(15897, 27552, 30290) }, + { AOM_CDF4(8588, 20047, 25383) }, + { AOM_CDF4(4889, 13339, 19269) }, + { AOM_CDF4(2240, 6871, 10498) }, + { AOM_CDF4(28165, 32197, 32517) }, + { AOM_CDF4(20735, 30427, 31568) }, + { AOM_CDF4(14325, 24671, 27692) }, + { AOM_CDF4(5119, 12554, 17805) }, + { AOM_CDF4(1810, 5441, 8261) }, + { AOM_CDF4(31212, 32724, 32748) }, + { AOM_CDF4(23352, 31766, 32545) }, + { AOM_CDF4(14669, 27570, 31059) }, + { AOM_CDF4(8492, 20894, 27272) }, + { AOM_CDF4(3644, 10194, 15204) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 
24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(2461, 7013, 9371) }, + { AOM_CDF4(24749, 29600, 30986) }, + { AOM_CDF4(9466, 19037, 22417) }, + { AOM_CDF4(3584, 9280, 14400) }, + { AOM_CDF4(1505, 3929, 5433) }, + { AOM_CDF4(677, 1500, 2736) }, + { AOM_CDF4(23987, 30702, 32117) }, + { AOM_CDF4(13554, 24571, 29263) }, + { AOM_CDF4(6211, 14556, 21155) }, + { AOM_CDF4(3135, 10972, 15625) }, + { AOM_CDF4(2435, 7127, 11427) }, + { AOM_CDF4(31300, 32532, 32550) }, + { AOM_CDF4(14757, 30365, 31954) }, + { AOM_CDF4(4405, 11612, 18553) }, + { AOM_CDF4(580, 4132, 7322) }, + { AOM_CDF4(1695, 10169, 14124) }, + { AOM_CDF4(30008, 32282, 32591) }, + { AOM_CDF4(19244, 30108, 31748) }, + { AOM_CDF4(11180, 24158, 29555) }, + { AOM_CDF4(5650, 14972, 19209) }, + { AOM_CDF4(2114, 5109, 8456) }, + { AOM_CDF4(31856, 32716, 32748) }, + { AOM_CDF4(23012, 31664, 32572) }, + { AOM_CDF4(13694, 26656, 30636) }, + { AOM_CDF4(8142, 19508, 26093) }, + { AOM_CDF4(4253, 10955, 16724) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(601, 983, 1311) }, + { AOM_CDF4(18725, 23406, 28087) }, + { AOM_CDF4(5461, 8192, 10923) }, + { AOM_CDF4(3781, 15124, 21425) }, + { AOM_CDF4(2587, 7761, 12072) }, + { AOM_CDF4(106, 458, 810) }, + { AOM_CDF4(22282, 29710, 31894) }, + { AOM_CDF4(8508, 20926, 25984) }, + { AOM_CDF4(3726, 12713, 18083) }, + { AOM_CDF4(1620, 7112, 10893) }, + { AOM_CDF4(729, 2236, 3495) }, + { AOM_CDF4(30163, 32474, 32684) }, + { 
AOM_CDF4(18304, 30464, 32000) }, + { AOM_CDF4(11443, 26526, 29647) }, + { AOM_CDF4(6007, 15292, 21299) }, + { AOM_CDF4(2234, 6703, 8937) }, + { AOM_CDF4(30954, 32177, 32571) }, + { AOM_CDF4(17363, 29562, 31076) }, + { AOM_CDF4(9686, 22464, 27410) }, + { AOM_CDF4(8192, 16384, 21390) }, + { AOM_CDF4(1755, 8046, 11264) }, + { AOM_CDF4(31168, 32734, 32748) }, + { AOM_CDF4(22486, 31441, 32471) }, + { AOM_CDF4(12833, 25627, 29738) }, + { AOM_CDF4(6980, 17379, 23122) }, + { AOM_CDF4(3111, 8887, 13479) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(6041, 11854, 15927) }, + { AOM_CDF4(20326, 30905, 32251) }, + { AOM_CDF4(14164, 26831, 30725) }, + { AOM_CDF4(9760, 20647, 26585) }, + { AOM_CDF4(6416, 14953, 21219) }, + { AOM_CDF4(2966, 7151, 10891) }, + { AOM_CDF4(23567, 31374, 32254) }, + { AOM_CDF4(14978, 27416, 30946) }, + { AOM_CDF4(9434, 20225, 26254) }, + { AOM_CDF4(6658, 14558, 20535) }, + { AOM_CDF4(3916, 8677, 12989) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(18088, 29545, 31587) }, + { AOM_CDF4(13062, 25843, 30073) }, + { AOM_CDF4(8940, 16827, 22251) }, + { AOM_CDF4(7654, 13220, 17973) }, + { AOM_CDF4(5733, 10316, 14456) }, + { AOM_CDF4(22879, 31388, 32114) }, + { AOM_CDF4(15215, 27993, 30955) }, + { AOM_CDF4(9397, 19445, 24978) }, + { AOM_CDF4(3442, 9813, 15344) }, + { AOM_CDF4(1368, 3936, 6532) }, + { AOM_CDF4(25494, 32033, 32406) }, + { AOM_CDF4(16772, 27963, 30718) }, + { AOM_CDF4(9419, 18165, 23260) }, + { AOM_CDF4(2677, 7501, 11797) }, + { AOM_CDF4(1516, 4344, 7170) }, + { AOM_CDF4(26556, 31454, 32101) }, + { AOM_CDF4(17128, 27035, 30108) }, + { 
AOM_CDF4(8324, 15344, 20249) }, + { AOM_CDF4(1903, 5696, 9469) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8455, 19003, 24368) }, + { AOM_CDF4(23563, 32021, 32604) }, + { AOM_CDF4(16237, 29446, 31935) }, + { AOM_CDF4(10724, 23999, 29358) }, + { AOM_CDF4(6725, 17528, 24416) }, + { AOM_CDF4(3927, 10927, 16825) }, + { AOM_CDF4(26313, 32288, 32634) }, + { AOM_CDF4(17430, 30095, 32095) }, + { AOM_CDF4(11116, 24606, 29679) }, + { AOM_CDF4(7195, 18384, 25269) }, + { AOM_CDF4(4726, 12852, 19315) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(22822, 31648, 32483) }, + { AOM_CDF4(16724, 29633, 31929) }, + { AOM_CDF4(10261, 23033, 28725) }, + { AOM_CDF4(7029, 17840, 24528) }, + { AOM_CDF4(4867, 13886, 21502) }, + { AOM_CDF4(25298, 31892, 32491) }, + { AOM_CDF4(17809, 29330, 31512) }, + { AOM_CDF4(9668, 21329, 26579) }, + { AOM_CDF4(4774, 12956, 18976) }, + { AOM_CDF4(2322, 7030, 11540) }, + { AOM_CDF4(25472, 31920, 32543) }, + { AOM_CDF4(17957, 29387, 31632) }, + { AOM_CDF4(9196, 20593, 26400) }, + { AOM_CDF4(4680, 12705, 19202) }, + { AOM_CDF4(2917, 8456, 13436) }, + { AOM_CDF4(26471, 32059, 32574) }, + { AOM_CDF4(18458, 29783, 31909) }, + { AOM_CDF4(8400, 19464, 25956) }, + { AOM_CDF4(3812, 10973, 17206) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(6779, 13743, 17678) }, + { AOM_CDF4(24806, 31797, 32457) }, + { AOM_CDF4(17616, 29047, 31372) }, + { AOM_CDF4(11063, 23175, 28003) }, + { AOM_CDF4(6521, 16110, 22324) }, + { AOM_CDF4(2764, 7504, 11654) }, + { AOM_CDF4(25266, 32367, 32637) }, + { AOM_CDF4(19054, 30553, 32175) }, + { AOM_CDF4(12139, 25212, 
29807) }, + { AOM_CDF4(7311, 18162, 24704) }, + { AOM_CDF4(3397, 9164, 14074) }, + { AOM_CDF4(25988, 32208, 32522) }, + { AOM_CDF4(16253, 28912, 31526) }, + { AOM_CDF4(9151, 21387, 27372) }, + { AOM_CDF4(5688, 14915, 21496) }, + { AOM_CDF4(2717, 7627, 12004) }, + { AOM_CDF4(23144, 31855, 32443) }, + { AOM_CDF4(16070, 28491, 31325) }, + { AOM_CDF4(8702, 20467, 26517) }, + { AOM_CDF4(5243, 13956, 20367) }, + { AOM_CDF4(2621, 7335, 11567) }, + { AOM_CDF4(26636, 32340, 32630) }, + { AOM_CDF4(19990, 31050, 32341) }, + { AOM_CDF4(13243, 26105, 30315) }, + { AOM_CDF4(8588, 19521, 25918) }, + { AOM_CDF4(4717, 11585, 17304) }, + { AOM_CDF4(25844, 32292, 32582) }, + { AOM_CDF4(19090, 30635, 32097) }, + { AOM_CDF4(11963, 24546, 28939) }, + { AOM_CDF4(6218, 16087, 22354) }, + { AOM_CDF4(2340, 6608, 10426) }, + { AOM_CDF4(28046, 32576, 32694) }, + { AOM_CDF4(21178, 31313, 32296) }, + { AOM_CDF4(13486, 26184, 29870) }, + { AOM_CDF4(7149, 17871, 23723) }, + { AOM_CDF4(2833, 7958, 12259) }, + { AOM_CDF4(27710, 32528, 32686) }, + { AOM_CDF4(20674, 31076, 32268) }, + { AOM_CDF4(12413, 24955, 29243) }, + { AOM_CDF4(6676, 16927, 23097) }, + { AOM_CDF4(2966, 8333, 12919) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8639, 19339, 24429) }, + { AOM_CDF4(24404, 31837, 32525) }, + { AOM_CDF4(16997, 29425, 31784) }, + { AOM_CDF4(11253, 24234, 29149) }, + { AOM_CDF4(6751, 17394, 24028) }, + { AOM_CDF4(3490, 9830, 15191) }, + { AOM_CDF4(26283, 32471, 32714) }, + { AOM_CDF4(19599, 31168, 32442) }, + { AOM_CDF4(13146, 26954, 30893) }, + { AOM_CDF4(8214, 20588, 26890) }, + { AOM_CDF4(4699, 13081, 19300) }, + { AOM_CDF4(28212, 32458, 32669) }, + { AOM_CDF4(18594, 30316, 32100) }, + { AOM_CDF4(11219, 24408, 29234) }, + { AOM_CDF4(6865, 17656, 24149) }, + { AOM_CDF4(3678, 10362, 16006) }, + { AOM_CDF4(25825, 32136, 32616) }, + { AOM_CDF4(17313, 29853, 32021) }, + { AOM_CDF4(11197, 24471, 29472) }, + { AOM_CDF4(6947, 17781, 24405) }, + { AOM_CDF4(3768, 10660, 16261) }, + { AOM_CDF4(27352, 
32500, 32706) }, + { AOM_CDF4(20850, 31468, 32469) }, + { AOM_CDF4(14021, 27707, 31133) }, + { AOM_CDF4(8964, 21748, 27838) }, + { AOM_CDF4(5437, 14665, 21187) }, + { AOM_CDF4(26304, 32492, 32698) }, + { AOM_CDF4(20409, 31380, 32385) }, + { AOM_CDF4(13682, 27222, 30632) }, + { AOM_CDF4(8974, 21236, 26685) }, + { AOM_CDF4(4234, 11665, 16934) }, + { AOM_CDF4(26273, 32357, 32711) }, + { AOM_CDF4(20672, 31242, 32441) }, + { AOM_CDF4(14172, 27254, 30902) }, + { AOM_CDF4(9870, 21898, 27275) }, + { AOM_CDF4(5164, 13506, 19270) }, + { AOM_CDF4(26725, 32459, 32728) }, + { AOM_CDF4(20991, 31442, 32527) }, + { AOM_CDF4(13071, 26434, 30811) }, + { AOM_CDF4(8184, 20090, 26742) }, + { AOM_CDF4(4803, 13255, 19895) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7555, 14942, 18501) }, + { AOM_CDF4(24410, 31178, 32287) }, + { AOM_CDF4(14394, 26738, 30253) }, + { AOM_CDF4(8413, 19554, 25195) }, + { AOM_CDF4(4766, 12924, 18785) }, + { AOM_CDF4(2029, 5806, 9207) }, + { AOM_CDF4(26776, 32364, 32663) }, + { AOM_CDF4(18732, 29967, 31931) }, + { AOM_CDF4(11005, 23786, 28852) }, + { AOM_CDF4(6466, 16909, 23510) }, + { AOM_CDF4(3044, 8638, 13419) }, + { AOM_CDF4(29208, 32582, 32704) }, + { AOM_CDF4(20068, 30857, 32208) }, + { AOM_CDF4(12003, 25085, 29595) }, + { AOM_CDF4(6947, 17750, 24189) }, + { AOM_CDF4(3245, 9103, 14007) }, + { AOM_CDF4(27359, 32465, 32669) }, + { AOM_CDF4(19421, 30614, 32174) }, + { AOM_CDF4(11915, 25010, 29579) }, + { AOM_CDF4(6950, 17676, 24074) }, + { AOM_CDF4(3007, 8473, 13096) }, + { AOM_CDF4(29002, 32676, 32735) }, + { AOM_CDF4(22102, 31849, 32576) }, + { AOM_CDF4(14408, 28009, 31405) }, + { AOM_CDF4(9027, 21679, 27931) }, + { AOM_CDF4(4694, 12678, 18748) }, + { AOM_CDF4(28216, 32528, 32682) }, + { AOM_CDF4(20849, 31264, 32318) }, + { AOM_CDF4(12756, 25815, 29751) }, + { AOM_CDF4(7565, 18801, 24923) }, + { AOM_CDF4(3509, 9533, 14477) }, + { AOM_CDF4(30133, 32687, 32739) }, + { AOM_CDF4(23063, 31910, 32515) }, + { AOM_CDF4(14588, 28051, 31132) }, + { 
AOM_CDF4(9085, 21649, 27457) }, + { AOM_CDF4(4261, 11654, 17264) }, + { AOM_CDF4(29518, 32691, 32748) }, + { AOM_CDF4(22451, 31959, 32613) }, + { AOM_CDF4(14864, 28722, 31700) }, + { AOM_CDF4(9695, 22964, 28716) }, + { AOM_CDF4(4932, 13358, 19502) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6465, 16958, 21688) }, + { AOM_CDF4(25199, 31514, 32360) }, + { AOM_CDF4(14774, 27149, 30607) }, + { AOM_CDF4(9257, 21438, 26972) }, + { AOM_CDF4(5723, 15183, 21882) }, + { AOM_CDF4(3150, 8879, 13731) }, + { AOM_CDF4(26989, 32262, 32682) }, + { AOM_CDF4(17396, 29937, 32085) }, + { AOM_CDF4(11387, 24901, 29784) }, + { AOM_CDF4(7289, 18821, 25548) }, + { AOM_CDF4(3734, 10577, 16086) }, + { AOM_CDF4(29728, 32501, 32695) }, + { AOM_CDF4(17431, 29701, 31903) }, + { AOM_CDF4(9921, 22826, 28300) }, + { AOM_CDF4(5896, 15434, 22068) }, + { AOM_CDF4(3430, 9646, 14757) }, + { AOM_CDF4(28614, 32511, 32705) }, + { AOM_CDF4(19364, 30638, 32263) }, + { AOM_CDF4(13129, 26254, 30402) }, + { AOM_CDF4(8754, 20484, 26440) }, + { AOM_CDF4(4378, 11607, 17110) }, + { AOM_CDF4(30292, 32671, 32744) }, + { AOM_CDF4(21780, 31603, 32501) }, + { AOM_CDF4(14314, 27829, 31291) }, + { AOM_CDF4(9611, 22327, 28263) }, + { AOM_CDF4(4890, 13087, 19065) }, + { AOM_CDF4(25862, 32567, 32733) }, + { AOM_CDF4(20794, 32050, 32567) }, + { AOM_CDF4(17243, 30625, 32254) }, + { AOM_CDF4(13283, 27628, 31474) }, + { AOM_CDF4(9669, 22532, 28918) }, + { AOM_CDF4(27435, 32697, 32748) }, + { AOM_CDF4(24922, 32390, 32714) }, + { AOM_CDF4(21449, 31504, 32536) }, + { AOM_CDF4(16392, 29729, 31832) }, + { AOM_CDF4(11692, 24884, 29076) }, + { AOM_CDF4(24193, 32290, 32735) }, + { AOM_CDF4(18909, 31104, 32563) }, + { AOM_CDF4(12236, 26841, 31403) }, + { AOM_CDF4(8171, 21840, 29082) }, + { AOM_CDF4(7224, 17280, 25275) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(3078, 6839, 9890) }, + { AOM_CDF4(13837, 20450, 24479) }, + { AOM_CDF4(5914, 14222, 19328) }, + { AOM_CDF4(3866, 10267, 14762) }, + { AOM_CDF4(2612, 
7208, 11042) }, + { AOM_CDF4(1067, 2991, 4776) }, + { AOM_CDF4(25817, 31646, 32529) }, + { AOM_CDF4(13708, 26338, 30385) }, + { AOM_CDF4(7328, 18585, 24870) }, + { AOM_CDF4(4691, 13080, 19276) }, + { AOM_CDF4(1825, 5253, 8352) }, + { AOM_CDF4(29386, 32315, 32624) }, + { AOM_CDF4(17160, 29001, 31360) }, + { AOM_CDF4(9602, 21862, 27396) }, + { AOM_CDF4(5915, 15772, 22148) }, + { AOM_CDF4(2786, 7779, 12047) }, + { AOM_CDF4(29246, 32450, 32663) }, + { AOM_CDF4(18696, 29929, 31818) }, + { AOM_CDF4(10510, 23369, 28560) }, + { AOM_CDF4(6229, 16499, 23125) }, + { AOM_CDF4(2608, 7448, 11705) }, + { AOM_CDF4(30753, 32710, 32748) }, + { AOM_CDF4(21638, 31487, 32503) }, + { AOM_CDF4(12937, 26854, 30870) }, + { AOM_CDF4(8182, 20596, 26970) }, + { AOM_CDF4(3637, 10269, 15497) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5244, 12150, 16906) }, + { AOM_CDF4(20486, 26858, 29701) }, + { AOM_CDF4(7756, 18317, 23735) }, + { AOM_CDF4(3452, 9256, 13146) }, + { AOM_CDF4(2020, 5206, 8229) }, + { AOM_CDF4(1801, 4993, 7903) }, + { AOM_CDF4(27051, 31858, 32531) }, + { AOM_CDF4(15988, 27531, 30619) }, + { AOM_CDF4(9188, 21484, 26719) }, + { AOM_CDF4(6273, 17186, 23800) }, + { AOM_CDF4(3108, 9355, 14764) }, + { AOM_CDF4(31076, 32520, 32680) }, + { AOM_CDF4(18119, 30037, 31850) }, + { AOM_CDF4(10244, 22969, 27472) }, + { AOM_CDF4(4692, 14077, 19273) }, + { AOM_CDF4(3694, 11677, 17556) }, + { AOM_CDF4(30060, 32581, 32720) }, + { AOM_CDF4(21011, 30775, 
32120) }, + { AOM_CDF4(11931, 24820, 29289) }, + { AOM_CDF4(7119, 17662, 24356) }, + { AOM_CDF4(3833, 10706, 16304) }, + { AOM_CDF4(31954, 32731, 32748) }, + { AOM_CDF4(23913, 31724, 32489) }, + { AOM_CDF4(15520, 28060, 31286) }, + { AOM_CDF4(11517, 23008, 28571) }, + { AOM_CDF4(6193, 14508, 20629) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(1035, 2807, 4156) }, + { AOM_CDF4(13162, 18138, 20939) }, + { AOM_CDF4(2696, 6633, 8755) }, + { AOM_CDF4(1373, 4161, 6853) }, + { AOM_CDF4(1099, 2746, 4716) }, + { AOM_CDF4(340, 1021, 1599) }, + { AOM_CDF4(22826, 30419, 32135) }, + { AOM_CDF4(10395, 21762, 26942) }, + { AOM_CDF4(4726, 12407, 17361) }, + { AOM_CDF4(2447, 7080, 10593) }, + { AOM_CDF4(1227, 3717, 6011) }, + { AOM_CDF4(28156, 31424, 31934) }, + { AOM_CDF4(16915, 27754, 30373) }, + { AOM_CDF4(9148, 20990, 26431) }, + { AOM_CDF4(5950, 15515, 21148) }, + { AOM_CDF4(2492, 7327, 11526) }, + { AOM_CDF4(30602, 32477, 32670) }, + { AOM_CDF4(20026, 29955, 31568) }, + { AOM_CDF4(11220, 23628, 28105) }, + { AOM_CDF4(6652, 17019, 22973) }, + { AOM_CDF4(3064, 8536, 13043) }, + { AOM_CDF4(31769, 32724, 32748) }, + { AOM_CDF4(22230, 30887, 32373) }, + { AOM_CDF4(12234, 25079, 29731) }, + { AOM_CDF4(7326, 18816, 25353) }, + { AOM_CDF4(3933, 10907, 16616) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + 
{ AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(8896, 16227, 20630) }, + { AOM_CDF4(23629, 31782, 32527) }, + { 
AOM_CDF4(15173, 27755, 31321) }, + { AOM_CDF4(10158, 21233, 27382) }, + { AOM_CDF4(6420, 14857, 21558) }, + { AOM_CDF4(3269, 8155, 12646) }, + { AOM_CDF4(24835, 32009, 32496) }, + { AOM_CDF4(16509, 28421, 31579) }, + { AOM_CDF4(10957, 21514, 27418) }, + { AOM_CDF4(7881, 15930, 22096) }, + { AOM_CDF4(5388, 10960, 15918) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20745, 30773, 32093) }, + { AOM_CDF4(15200, 27221, 30861) }, + { AOM_CDF4(13032, 20873, 25667) }, + { AOM_CDF4(12285, 18663, 23494) }, + { AOM_CDF4(11563, 17481, 21489) }, + { AOM_CDF4(26260, 31982, 32320) }, + { AOM_CDF4(15397, 28083, 31100) }, + { AOM_CDF4(9742, 19217, 24824) }, + { AOM_CDF4(3261, 9629, 15362) }, + { AOM_CDF4(1480, 4322, 7499) }, + { AOM_CDF4(27599, 32256, 32460) }, + { AOM_CDF4(16857, 27659, 30774) }, + { AOM_CDF4(9551, 18290, 23748) }, + { AOM_CDF4(3052, 8933, 14103) }, + { AOM_CDF4(2021, 5910, 9787) }, + { AOM_CDF4(29005, 32015, 32392) }, + { AOM_CDF4(17677, 27694, 30863) }, + { AOM_CDF4(9204, 17356, 23219) }, + { AOM_CDF4(2403, 7516, 12814) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10808, 22056, 26896) }, + { AOM_CDF4(25739, 32313, 32676) }, + { AOM_CDF4(17288, 30203, 32221) }, + { AOM_CDF4(11359, 24878, 29896) }, + { AOM_CDF4(6949, 17767, 24893) }, + { AOM_CDF4(4287, 11796, 18071) }, + { AOM_CDF4(27880, 32521, 32705) }, + { AOM_CDF4(19038, 31004, 32414) }, + { AOM_CDF4(12564, 26345, 30768) }, + { AOM_CDF4(8269, 19947, 26779) }, + { AOM_CDF4(5674, 14657, 21674) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(25742, 32319, 32671) }, + { AOM_CDF4(19557, 31164, 32454) }, + { AOM_CDF4(13381, 26381, 30755) }, + { AOM_CDF4(10101, 21466, 26722) }, + { AOM_CDF4(9209, 19650, 26825) }, + { AOM_CDF4(27107, 31917, 32432) }, + { AOM_CDF4(18056, 28893, 31203) }, + { AOM_CDF4(10200, 21434, 26764) }, + { AOM_CDF4(4660, 12913, 19502) }, + { AOM_CDF4(2368, 6930, 12504) }, + { AOM_CDF4(26960, 32158, 32613) }, + { AOM_CDF4(18628, 30005, 32031) }, + { AOM_CDF4(10233, 22442, 28232) }, + { AOM_CDF4(5471, 14630, 21516) }, + { AOM_CDF4(3235, 10767, 17109) }, + { AOM_CDF4(27696, 32440, 32692) }, + { AOM_CDF4(20032, 31167, 32438) }, + { AOM_CDF4(8700, 21341, 28442) }, + { AOM_CDF4(5662, 14831, 21795) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9704, 17294, 21132) }, + { AOM_CDF4(26762, 32278, 32633) }, + { AOM_CDF4(18382, 29620, 31819) }, + { AOM_CDF4(10891, 23475, 28723) }, + { AOM_CDF4(6358, 16583, 23309) }, + { AOM_CDF4(3248, 9118, 14141) }, + { AOM_CDF4(27204, 32573, 32699) }, + { AOM_CDF4(19818, 30824, 32329) }, + { AOM_CDF4(11772, 25120, 30041) }, + { AOM_CDF4(6995, 18033, 25039) }, + { AOM_CDF4(3752, 10442, 16098) }, + { AOM_CDF4(27222, 32256, 32559) }, + { AOM_CDF4(15356, 28399, 31475) }, + { AOM_CDF4(8821, 20635, 27057) }, + { AOM_CDF4(5511, 14404, 21239) }, + { AOM_CDF4(2935, 8222, 13051) }, + { AOM_CDF4(24875, 32120, 32529) }, + { AOM_CDF4(15233, 28265, 31445) }, + { AOM_CDF4(8605, 20570, 26932) }, + { AOM_CDF4(5431, 14413, 21196) }, + { AOM_CDF4(2994, 8341, 13223) }, + { AOM_CDF4(28201, 32604, 32700) }, + { AOM_CDF4(21041, 31446, 32456) }, + { AOM_CDF4(13221, 26213, 30475) }, + { AOM_CDF4(8255, 19385, 26037) }, + { AOM_CDF4(4930, 12585, 18830) }, + { AOM_CDF4(28768, 32448, 32627) }, + { AOM_CDF4(19705, 30561, 
32021) }, + { AOM_CDF4(11572, 23589, 28220) }, + { AOM_CDF4(5532, 15034, 21446) }, + { AOM_CDF4(2460, 7150, 11456) }, + { AOM_CDF4(29874, 32619, 32699) }, + { AOM_CDF4(21621, 31071, 32201) }, + { AOM_CDF4(12511, 24747, 28992) }, + { AOM_CDF4(6281, 16395, 22748) }, + { AOM_CDF4(3246, 9278, 14497) }, + { AOM_CDF4(29715, 32625, 32712) }, + { AOM_CDF4(20958, 31011, 32283) }, + { AOM_CDF4(11233, 23671, 28806) }, + { AOM_CDF4(6012, 16128, 22868) }, + { AOM_CDF4(3427, 9851, 15414) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11016, 22111, 26794) }, + { AOM_CDF4(25946, 32357, 32677) }, + { AOM_CDF4(17890, 30452, 32252) }, + { AOM_CDF4(11678, 25142, 29816) }, + { AOM_CDF4(6720, 17534, 24584) }, + { AOM_CDF4(4230, 11665, 17820) }, + { AOM_CDF4(28400, 32623, 32747) }, + { AOM_CDF4(21164, 31668, 32575) }, + { AOM_CDF4(13572, 27388, 31182) }, + { AOM_CDF4(8234, 20750, 27358) }, + { AOM_CDF4(5065, 14055, 20897) }, + { AOM_CDF4(28981, 32547, 32705) }, + { AOM_CDF4(18681, 30543, 32239) }, + { AOM_CDF4(10919, 24075, 29286) }, + { AOM_CDF4(6431, 17199, 24077) }, + { AOM_CDF4(3819, 10464, 16618) }, + { AOM_CDF4(26870, 32467, 32693) }, + { AOM_CDF4(19041, 30831, 32347) }, + { AOM_CDF4(11794, 25211, 30016) }, + { AOM_CDF4(6888, 18019, 24970) }, + { AOM_CDF4(4370, 12363, 18992) }, + { AOM_CDF4(29578, 32670, 32744) }, + { AOM_CDF4(23159, 32007, 32613) }, + { AOM_CDF4(15315, 28669, 31676) }, + { AOM_CDF4(9298, 22607, 28782) }, + { AOM_CDF4(6144, 15913, 22968) }, + { AOM_CDF4(28110, 32499, 32669) }, + { AOM_CDF4(21574, 30937, 32015) }, + { AOM_CDF4(12759, 24818, 28727) }, + { AOM_CDF4(6545, 16761, 23042) }, + { AOM_CDF4(3649, 10597, 16833) }, + { AOM_CDF4(28163, 32552, 32728) }, + { AOM_CDF4(22101, 31469, 32464) }, + { AOM_CDF4(13160, 25472, 30143) }, + { AOM_CDF4(7303, 18684, 25468) }, + { AOM_CDF4(5241, 13975, 20955) }, + { AOM_CDF4(28400, 32631, 32744) }, + { AOM_CDF4(22104, 31793, 32603) }, + { AOM_CDF4(13557, 26571, 30846) }, + { AOM_CDF4(7749, 19861, 26675) }, + { 
AOM_CDF4(4873, 14030, 21234) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9800, 17635, 21073) }, + { AOM_CDF4(26153, 31885, 32527) }, + { AOM_CDF4(15038, 27852, 31006) }, + { AOM_CDF4(8718, 20564, 26486) }, + { AOM_CDF4(5128, 14076, 20514) }, + { AOM_CDF4(2636, 7566, 11925) }, + { AOM_CDF4(27551, 32504, 32701) }, + { AOM_CDF4(18310, 30054, 32100) }, + { AOM_CDF4(10211, 23420, 29082) }, + { AOM_CDF4(6222, 16876, 23916) }, + { AOM_CDF4(3462, 9954, 15498) }, + { AOM_CDF4(29991, 32633, 32721) }, + { AOM_CDF4(19883, 30751, 32201) }, + { AOM_CDF4(11141, 24184, 29285) }, + { AOM_CDF4(6420, 16940, 23774) }, + { AOM_CDF4(3392, 9753, 15118) }, + { AOM_CDF4(28465, 32616, 32712) }, + { AOM_CDF4(19850, 30702, 32244) }, + { AOM_CDF4(10983, 24024, 29223) }, + { AOM_CDF4(6294, 16770, 23582) }, + { AOM_CDF4(3244, 9283, 14509) }, + { AOM_CDF4(30023, 32717, 32748) }, + { AOM_CDF4(22940, 32032, 32626) }, + { AOM_CDF4(14282, 27928, 31473) }, + { AOM_CDF4(8562, 21327, 27914) }, + { AOM_CDF4(4846, 13393, 19919) }, + { AOM_CDF4(29981, 32590, 32695) }, + { AOM_CDF4(20465, 30963, 32166) }, + { AOM_CDF4(11479, 23579, 28195) }, + { AOM_CDF4(5916, 15648, 22073) }, + { AOM_CDF4(3031, 8605, 13398) }, + { AOM_CDF4(31146, 32691, 32739) }, + { AOM_CDF4(23106, 31724, 32444) }, + { AOM_CDF4(13783, 26738, 30439) }, + { AOM_CDF4(7852, 19468, 25807) }, + { AOM_CDF4(3860, 11124, 16853) }, + { AOM_CDF4(31014, 32724, 32748) }, + { AOM_CDF4(23629, 32109, 32628) }, + { AOM_CDF4(14747, 28115, 31403) }, + { AOM_CDF4(8545, 21242, 27478) }, + { AOM_CDF4(4574, 12781, 19067) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9185, 19694, 24688) }, + { AOM_CDF4(26081, 31985, 32621) }, + { AOM_CDF4(16015, 29000, 31787) }, + { AOM_CDF4(10542, 23690, 29206) }, + { AOM_CDF4(6732, 17945, 24677) }, + { AOM_CDF4(3916, 11039, 16722) }, + { AOM_CDF4(28224, 32566, 32744) }, + { AOM_CDF4(19100, 31138, 32485) }, + { AOM_CDF4(12528, 26620, 30879) }, + { AOM_CDF4(7741, 20277, 26885) }, + { AOM_CDF4(4566, 
12845, 18990) }, + { AOM_CDF4(29933, 32593, 32718) }, + { AOM_CDF4(17670, 30333, 32155) }, + { AOM_CDF4(10385, 23600, 28909) }, + { AOM_CDF4(6243, 16236, 22407) }, + { AOM_CDF4(3976, 10389, 16017) }, + { AOM_CDF4(28377, 32561, 32738) }, + { AOM_CDF4(19366, 31175, 32482) }, + { AOM_CDF4(13327, 27175, 31094) }, + { AOM_CDF4(8258, 20769, 27143) }, + { AOM_CDF4(4703, 13198, 19527) }, + { AOM_CDF4(31086, 32706, 32748) }, + { AOM_CDF4(22853, 31902, 32583) }, + { AOM_CDF4(14759, 28186, 31419) }, + { AOM_CDF4(9284, 22382, 28348) }, + { AOM_CDF4(5585, 15192, 21868) }, + { AOM_CDF4(28291, 32652, 32746) }, + { AOM_CDF4(19849, 32107, 32571) }, + { AOM_CDF4(14834, 26818, 29214) }, + { AOM_CDF4(10306, 22594, 28672) }, + { AOM_CDF4(6615, 17384, 23384) }, + { AOM_CDF4(28947, 32604, 32745) }, + { AOM_CDF4(25625, 32289, 32646) }, + { AOM_CDF4(18758, 28672, 31403) }, + { AOM_CDF4(10017, 23430, 28523) }, + { AOM_CDF4(6862, 15269, 22131) }, + { AOM_CDF4(23933, 32509, 32739) }, + { AOM_CDF4(19927, 31495, 32631) }, + { AOM_CDF4(11903, 26023, 30621) }, + { AOM_CDF4(7026, 20094, 27252) }, + { AOM_CDF4(5998, 18106, 24437) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4456, 11274, 15533) }, + { AOM_CDF4(21219, 29079, 31616) }, + { AOM_CDF4(11173, 23774, 28567) }, + { AOM_CDF4(7282, 18293, 24263) }, + { AOM_CDF4(4890, 13286, 19115) }, + { AOM_CDF4(1890, 5508, 8659) }, + { AOM_CDF4(26651, 32136, 32647) }, + { AOM_CDF4(14630, 28254, 31455) }, + { AOM_CDF4(8716, 21287, 27395) }, + { AOM_CDF4(5615, 15331, 22008) }, + { AOM_CDF4(2675, 7700, 12150) }, + { AOM_CDF4(29954, 32526, 32690) }, + { AOM_CDF4(16126, 28982, 31633) }, + { AOM_CDF4(9030, 21361, 27352) }, + { AOM_CDF4(5411, 14793, 21271) }, + { AOM_CDF4(2943, 8422, 13163) }, + { AOM_CDF4(29539, 32601, 32730) }, + { AOM_CDF4(18125, 30385, 32201) }, + { AOM_CDF4(10422, 24090, 29468) }, + { AOM_CDF4(6468, 17487, 24438) }, + { AOM_CDF4(2970, 8653, 13531) }, + { AOM_CDF4(30912, 32715, 32748) }, + { AOM_CDF4(20666, 31373, 32497) }, + { 
AOM_CDF4(12509, 26640, 30917) }, + { AOM_CDF4(8058, 20629, 27290) }, + { AOM_CDF4(4231, 12006, 18052) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10202, 20633, 25484) }, + { AOM_CDF4(27336, 31445, 32352) }, + { AOM_CDF4(12420, 24384, 28552) }, + { AOM_CDF4(7648, 18115, 23856) }, + { AOM_CDF4(5662, 14341, 19902) }, + { AOM_CDF4(3611, 10328, 15390) }, + { AOM_CDF4(30945, 32616, 32736) }, + { AOM_CDF4(18682, 30505, 32253) }, + { AOM_CDF4(11513, 25336, 30203) }, + { AOM_CDF4(7449, 19452, 26148) }, + { AOM_CDF4(4482, 13051, 18886) }, + { AOM_CDF4(32022, 32690, 32747) }, + { AOM_CDF4(18578, 30501, 32146) }, + { AOM_CDF4(11249, 23368, 28631) }, + { AOM_CDF4(5645, 16958, 22158) }, + { AOM_CDF4(5009, 11444, 16637) }, + { AOM_CDF4(31357, 32710, 32748) }, + { AOM_CDF4(21552, 31494, 32504) }, + { AOM_CDF4(13891, 27677, 31340) }, + { AOM_CDF4(9051, 22098, 28172) }, + { AOM_CDF4(5190, 13377, 19486) }, + { AOM_CDF4(32364, 32740, 32748) }, + { AOM_CDF4(24839, 31907, 32551) }, + { AOM_CDF4(17160, 28779, 31696) }, + { AOM_CDF4(12452, 24137, 29602) }, + { AOM_CDF4(6165, 15389, 22477) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + 
{ AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(2575, 7281, 11077) }, + { AOM_CDF4(14002, 20866, 25402) }, + { AOM_CDF4(6343, 15056, 19658) }, + { AOM_CDF4(4474, 11858, 17041) }, + { AOM_CDF4(2865, 8299, 12534) }, + { AOM_CDF4(1344, 3949, 6391) }, + { AOM_CDF4(24720, 31239, 32459) }, + { AOM_CDF4(12585, 25356, 29968) }, + { AOM_CDF4(7181, 18246, 24444) }, + { AOM_CDF4(5025, 13667, 19885) }, + { AOM_CDF4(2521, 7304, 11605) }, + { AOM_CDF4(29908, 32252, 32584) }, + { AOM_CDF4(17421, 29156, 31575) }, + { AOM_CDF4(9889, 22188, 27782) }, + { AOM_CDF4(5878, 15647, 22123) }, + { AOM_CDF4(2814, 8665, 13323) }, + { AOM_CDF4(30183, 32568, 32713) }, + { AOM_CDF4(18528, 30195, 32049) }, + { AOM_CDF4(10982, 24606, 29657) }, + { AOM_CDF4(6957, 18165, 25231) }, + { AOM_CDF4(3508, 10118, 15468) }, + { AOM_CDF4(31761, 32736, 32748) }, + { AOM_CDF4(21041, 31328, 32546) }, + { AOM_CDF4(12568, 26732, 31166) }, + { AOM_CDF4(8052, 20720, 27733) }, + { AOM_CDF4(4336, 12192, 18396) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(7062, 16472, 22319) }, + { AOM_CDF4(24538, 32261, 32674) }, + { AOM_CDF4(13675, 28041, 31779) }, + { AOM_CDF4(8590, 20674, 27631) }, + { AOM_CDF4(5685, 14675, 22013) }, + { AOM_CDF4(3655, 9898, 15731) }, + { AOM_CDF4(26493, 32418, 32658) }, + { AOM_CDF4(16376, 29342, 32090) }, + { AOM_CDF4(10594, 22649, 28970) }, + { AOM_CDF4(8176, 17170, 24303) }, + { AOM_CDF4(5605, 12694, 19139) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(23888, 31902, 32542) }, + { AOM_CDF4(18612, 29687, 31987) }, + { AOM_CDF4(16245, 24852, 29249) }, + { AOM_CDF4(15765, 22608, 27559) }, + { AOM_CDF4(19895, 24699, 27510) }, + { AOM_CDF4(28401, 32212, 32457) }, + { AOM_CDF4(15274, 27825, 30980) }, + { AOM_CDF4(9364, 18128, 24332) }, + { AOM_CDF4(2283, 8193, 15082) }, + { AOM_CDF4(1228, 3972, 7881) }, + { AOM_CDF4(29455, 32469, 32620) }, + { AOM_CDF4(17981, 28245, 31388) }, + { AOM_CDF4(10921, 20098, 26240) }, + { AOM_CDF4(3743, 11829, 18657) }, + { AOM_CDF4(2374, 9593, 15715) }, + { AOM_CDF4(31068, 32466, 32635) }, + { AOM_CDF4(20321, 29572, 31971) }, + { AOM_CDF4(10771, 20255, 27119) }, + { AOM_CDF4(2795, 10410, 17361) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9320, 22102, 27840) }, + { AOM_CDF4(27057, 32464, 32724) }, + { AOM_CDF4(16331, 30268, 32309) }, + { AOM_CDF4(10319, 23935, 29720) }, + { AOM_CDF4(6189, 16448, 24106) }, + { AOM_CDF4(3589, 10884, 18808) }, + { AOM_CDF4(29026, 32624, 32748) }, + { AOM_CDF4(19226, 31507, 32587) }, + { AOM_CDF4(12692, 26921, 31203) }, + { AOM_CDF4(7049, 19532, 27635) }, + { AOM_CDF4(7727, 15669, 23252) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(28056, 32625, 32748) }, + { AOM_CDF4(22383, 32075, 32669) }, + { AOM_CDF4(15417, 27098, 31749) }, + { AOM_CDF4(18127, 26493, 27190) }, + { AOM_CDF4(5461, 16384, 21845) }, + { AOM_CDF4(27982, 32091, 32584) }, + { AOM_CDF4(19045, 29868, 31972) }, + { AOM_CDF4(10397, 22266, 27932) }, + { AOM_CDF4(5990, 13697, 21500) }, + { AOM_CDF4(1792, 6912, 15104) }, + { AOM_CDF4(28198, 32501, 32718) }, + { AOM_CDF4(21534, 31521, 32569) 
}, + { AOM_CDF4(11109, 25217, 30017) }, + { AOM_CDF4(5671, 15124, 26151) }, + { AOM_CDF4(4681, 14043, 18725) }, + { AOM_CDF4(28688, 32580, 32741) }, + { AOM_CDF4(22576, 32079, 32661) }, + { AOM_CDF4(10627, 22141, 28340) }, + { AOM_CDF4(9362, 14043, 28087) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7754, 16948, 22142) }, + { AOM_CDF4(25670, 32330, 32691) }, + { AOM_CDF4(15663, 29225, 31994) }, + { AOM_CDF4(9878, 23288, 29158) }, + { AOM_CDF4(6419, 17088, 24336) }, + { AOM_CDF4(3859, 11003, 17039) }, + { AOM_CDF4(27562, 32595, 32725) }, + { AOM_CDF4(17575, 30588, 32399) }, + { AOM_CDF4(10819, 24838, 30309) }, + { AOM_CDF4(7124, 18686, 25916) }, + { AOM_CDF4(4479, 12688, 19340) }, + { AOM_CDF4(28385, 32476, 32673) }, + { AOM_CDF4(15306, 29005, 31938) }, + { AOM_CDF4(8937, 21615, 28322) }, + { AOM_CDF4(5982, 15603, 22786) }, + { AOM_CDF4(3620, 10267, 16136) }, + { AOM_CDF4(27280, 32464, 32667) }, + { AOM_CDF4(15607, 29160, 32004) }, + { AOM_CDF4(9091, 22135, 28740) }, + { AOM_CDF4(6232, 16632, 24020) }, + { AOM_CDF4(4047, 11377, 17672) }, + { AOM_CDF4(29220, 32630, 32718) }, + { AOM_CDF4(19650, 31220, 32462) }, + { AOM_CDF4(13050, 26312, 30827) }, + { AOM_CDF4(9228, 20870, 27468) }, + { AOM_CDF4(6146, 15149, 21971) }, + { AOM_CDF4(30169, 32481, 32623) }, + { AOM_CDF4(17212, 29311, 31554) }, + { AOM_CDF4(9911, 21311, 26882) }, + { AOM_CDF4(4487, 13314, 20372) }, + { AOM_CDF4(2570, 7772, 12889) }, + { AOM_CDF4(30924, 32613, 32708) }, + { AOM_CDF4(19490, 30206, 32107) }, + { AOM_CDF4(11232, 23998, 29276) }, + { AOM_CDF4(6769, 17955, 25035) }, + { AOM_CDF4(4398, 12623, 19214) }, + { AOM_CDF4(30609, 32627, 32722) }, + { AOM_CDF4(19370, 30582, 32287) }, + { AOM_CDF4(10457, 23619, 29409) }, + { AOM_CDF4(6443, 17637, 24834) }, + { AOM_CDF4(4645, 13236, 20106) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8626, 20271, 26216) }, + { AOM_CDF4(26707, 32406, 32711) }, + { AOM_CDF4(16999, 30329, 32286) }, + { 
AOM_CDF4(11445, 25123, 30286) }, + { AOM_CDF4(6411, 18828, 25601) }, + { AOM_CDF4(6801, 12458, 20248) }, + { AOM_CDF4(29918, 32682, 32748) }, + { AOM_CDF4(20649, 31739, 32618) }, + { AOM_CDF4(12879, 27773, 31581) }, + { AOM_CDF4(7896, 21751, 28244) }, + { AOM_CDF4(5260, 14870, 23698) }, + { AOM_CDF4(29252, 32593, 32731) }, + { AOM_CDF4(17072, 30460, 32294) }, + { AOM_CDF4(10653, 24143, 29365) }, + { AOM_CDF4(6536, 17490, 23983) }, + { AOM_CDF4(4929, 13170, 20085) }, + { AOM_CDF4(28137, 32518, 32715) }, + { AOM_CDF4(18171, 30784, 32407) }, + { AOM_CDF4(11437, 25436, 30459) }, + { AOM_CDF4(7252, 18534, 26176) }, + { AOM_CDF4(4126, 13353, 20978) }, + { AOM_CDF4(31162, 32726, 32748) }, + { AOM_CDF4(23017, 32222, 32701) }, + { AOM_CDF4(15629, 29233, 32046) }, + { AOM_CDF4(9387, 22621, 29480) }, + { AOM_CDF4(6922, 17616, 25010) }, + { AOM_CDF4(28838, 32265, 32614) }, + { AOM_CDF4(19701, 30206, 31920) }, + { AOM_CDF4(11214, 22410, 27933) }, + { AOM_CDF4(5320, 14177, 23034) }, + { AOM_CDF4(5049, 12881, 17827) }, + { AOM_CDF4(27484, 32471, 32734) }, + { AOM_CDF4(21076, 31526, 32561) }, + { AOM_CDF4(12707, 26303, 31211) }, + { AOM_CDF4(8169, 21722, 28219) }, + { AOM_CDF4(6045, 19406, 27042) }, + { AOM_CDF4(27753, 32572, 32745) }, + { AOM_CDF4(20832, 31878, 32653) }, + { AOM_CDF4(13250, 27356, 31674) }, + { AOM_CDF4(7718, 21508, 29858) }, + { AOM_CDF4(7209, 18350, 25559) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7876, 16901, 21741) }, + { AOM_CDF4(24001, 31898, 32625) }, + { AOM_CDF4(14529, 27959, 31451) }, + { AOM_CDF4(8273, 20818, 27258) }, + { AOM_CDF4(5278, 14673, 21510) }, + { AOM_CDF4(2983, 8843, 14039) }, + { AOM_CDF4(28016, 32574, 32732) }, + { AOM_CDF4(17471, 30306, 32301) }, + { AOM_CDF4(10224, 24063, 29728) }, + { AOM_CDF4(6602, 17954, 25052) }, + { AOM_CDF4(4002, 11585, 17759) }, + { AOM_CDF4(30190, 32634, 32739) }, + { AOM_CDF4(17497, 30282, 32270) }, + { AOM_CDF4(10229, 23729, 29538) }, + { AOM_CDF4(6344, 17211, 24440) }, + { AOM_CDF4(3849, 
11189, 17108) }, + { AOM_CDF4(28570, 32583, 32726) }, + { AOM_CDF4(17521, 30161, 32238) }, + { AOM_CDF4(10153, 23565, 29378) }, + { AOM_CDF4(6455, 17341, 24443) }, + { AOM_CDF4(3907, 11042, 17024) }, + { AOM_CDF4(30689, 32715, 32748) }, + { AOM_CDF4(21546, 31840, 32610) }, + { AOM_CDF4(13547, 27581, 31459) }, + { AOM_CDF4(8912, 21757, 28309) }, + { AOM_CDF4(5548, 15080, 22046) }, + { AOM_CDF4(30783, 32540, 32685) }, + { AOM_CDF4(17540, 29528, 31668) }, + { AOM_CDF4(10160, 21468, 26783) }, + { AOM_CDF4(4724, 13393, 20054) }, + { AOM_CDF4(2702, 8174, 13102) }, + { AOM_CDF4(31648, 32686, 32742) }, + { AOM_CDF4(20954, 31094, 32337) }, + { AOM_CDF4(12420, 25698, 30179) }, + { AOM_CDF4(7304, 19320, 26248) }, + { AOM_CDF4(4366, 12261, 18864) }, + { AOM_CDF4(31581, 32723, 32748) }, + { AOM_CDF4(21373, 31586, 32525) }, + { AOM_CDF4(12744, 26625, 30885) }, + { AOM_CDF4(7431, 20322, 26950) }, + { AOM_CDF4(4692, 13323, 20111) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(7833, 18369, 24095) }, + { AOM_CDF4(26650, 32273, 32702) }, + { AOM_CDF4(16371, 29961, 32191) }, + { AOM_CDF4(11055, 24082, 29629) }, + { AOM_CDF4(6892, 18644, 25400) }, + { AOM_CDF4(5006, 13057, 19240) }, + { AOM_CDF4(29834, 32666, 32748) }, + { AOM_CDF4(19577, 31335, 32570) }, + { AOM_CDF4(12253, 26509, 31122) }, + { AOM_CDF4(7991, 20772, 27711) }, + { AOM_CDF4(5677, 15910, 23059) }, + { AOM_CDF4(30109, 32532, 32720) }, + { AOM_CDF4(16747, 30166, 32252) }, + { AOM_CDF4(10134, 23542, 29184) }, + { AOM_CDF4(5791, 16176, 23556) }, + { AOM_CDF4(4362, 10414, 17284) }, + { AOM_CDF4(29492, 32626, 32748) }, + { AOM_CDF4(19894, 31402, 32525) }, + { AOM_CDF4(12942, 27071, 30869) }, + { AOM_CDF4(8346, 21216, 27405) }, + { AOM_CDF4(6572, 17087, 23859) }, + { AOM_CDF4(32035, 32735, 32748) }, + { AOM_CDF4(22957, 31838, 32618) }, + { AOM_CDF4(14724, 28572, 31772) }, + { AOM_CDF4(10364, 23999, 29553) }, + { AOM_CDF4(7004, 18433, 25655) }, + { AOM_CDF4(27528, 32277, 32681) }, + { AOM_CDF4(16959, 31171, 32096) }, + 
{ AOM_CDF4(10486, 23593, 27962) }, + { AOM_CDF4(8192, 16384, 23211) }, + { AOM_CDF4(8937, 17873, 20852) }, + { AOM_CDF4(27715, 32002, 32615) }, + { AOM_CDF4(15073, 29491, 31676) }, + { AOM_CDF4(11264, 24576, 28672) }, + { AOM_CDF4(2341, 18725, 23406) }, + { AOM_CDF4(7282, 18204, 25486) }, + { AOM_CDF4(28547, 32213, 32657) }, + { AOM_CDF4(20788, 29773, 32239) }, + { AOM_CDF4(6780, 21469, 30508) }, + { AOM_CDF4(5958, 14895, 23831) }, + { AOM_CDF4(16384, 21845, 27307) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5992, 14304, 19765) }, + { AOM_CDF4(22612, 31238, 32456) }, + { AOM_CDF4(13456, 27162, 31087) }, + { AOM_CDF4(8001, 20062, 26504) }, + { AOM_CDF4(5168, 14105, 20764) }, + { AOM_CDF4(2632, 7771, 12385) }, + { AOM_CDF4(27034, 32344, 32709) }, + { AOM_CDF4(15850, 29415, 31997) }, + { AOM_CDF4(9494, 22776, 28841) }, + { AOM_CDF4(6151, 16830, 23969) }, + { AOM_CDF4(3461, 10039, 15722) }, + { AOM_CDF4(30134, 32569, 32731) }, + { AOM_CDF4(15638, 29422, 31945) }, + { AOM_CDF4(9150, 21865, 28218) }, + { AOM_CDF4(5647, 15719, 22676) }, + { AOM_CDF4(3402, 9772, 15477) }, + { AOM_CDF4(28530, 32586, 32735) }, + { AOM_CDF4(17139, 30298, 32292) }, + { AOM_CDF4(10200, 24039, 29685) }, + { AOM_CDF4(6419, 17674, 24786) }, + { AOM_CDF4(3544, 10225, 15824) }, + { AOM_CDF4(31333, 32726, 32748) }, + { AOM_CDF4(20618, 31487, 32544) }, + { AOM_CDF4(12901, 27217, 31232) }, + { AOM_CDF4(8624, 21734, 28171) }, + { AOM_CDF4(5104, 14191, 20748) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) 
}, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11206, 21090, 26561) }, + { AOM_CDF4(28759, 32279, 32671) }, + { AOM_CDF4(14171, 27952, 31569) }, + { AOM_CDF4(9743, 22907, 29141) }, + { AOM_CDF4(6871, 17886, 24868) }, + { AOM_CDF4(4960, 13152, 19315) }, + { AOM_CDF4(31077, 32661, 32748) }, + { AOM_CDF4(19400, 31195, 32515) }, + { AOM_CDF4(12752, 26858, 31040) }, + { AOM_CDF4(8370, 22098, 28591) }, + { AOM_CDF4(5457, 15373, 22298) }, + { AOM_CDF4(31697, 32706, 32748) }, + { AOM_CDF4(17860, 30657, 32333) }, + { AOM_CDF4(12510, 24812, 29261) }, + { AOM_CDF4(6180, 19124, 24722) }, + { AOM_CDF4(5041, 13548, 17959) }, + { AOM_CDF4(31552, 32716, 32748) }, + { AOM_CDF4(21908, 31769, 32623) }, + { AOM_CDF4(14470, 28201, 31565) }, + { AOM_CDF4(9493, 22982, 28608) }, + { AOM_CDF4(6858, 17240, 24137) }, + { AOM_CDF4(32543, 32752, 32756) }, + { AOM_CDF4(24286, 32097, 32666) }, + { AOM_CDF4(15958, 29217, 32024) }, + { AOM_CDF4(10207, 24234, 29958) }, + { AOM_CDF4(6929, 18305, 25652) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4137, 10847, 15682) }, + { AOM_CDF4(17824, 27001, 30058) }, + { AOM_CDF4(10204, 22796, 28291) }, + { AOM_CDF4(6076, 15935, 22125) }, + { AOM_CDF4(3852, 10937, 16816) }, + { AOM_CDF4(2252, 6324, 10131) }, + { AOM_CDF4(25840, 32016, 32662) }, + { AOM_CDF4(15109, 28268, 31531) }, + { AOM_CDF4(9385, 22231, 28340) }, + { AOM_CDF4(6082, 16672, 23479) }, + { AOM_CDF4(3318, 9427, 14681) }, + { AOM_CDF4(30594, 
32574, 32718) }, + { AOM_CDF4(16836, 29552, 31859) }, + { AOM_CDF4(9556, 22542, 28356) }, + { AOM_CDF4(6305, 16725, 23540) }, + { AOM_CDF4(3376, 9895, 15184) }, + { AOM_CDF4(29383, 32617, 32745) }, + { AOM_CDF4(18891, 30809, 32401) }, + { AOM_CDF4(11688, 25942, 30687) }, + { AOM_CDF4(7468, 19469, 26651) }, + { AOM_CDF4(3909, 11358, 17012) }, + { AOM_CDF4(31564, 32736, 32748) }, + { AOM_CDF4(20906, 31611, 32600) }, + { AOM_CDF4(13191, 27621, 31537) }, + { AOM_CDF4(8768, 22029, 28676) }, + { AOM_CDF4(5079, 14109, 20906) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 
24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } }; + +static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE( + NUM_BASE_LEVELS + 1)] = { { { { { AOM_CDF3(17837, 29055) }, + { AOM_CDF3(29600, 31446) }, + { AOM_CDF3(30844, 31878) }, + { AOM_CDF3(24926, 28948) } }, + { { AOM_CDF3(21365, 30026) }, + { AOM_CDF3(30512, 32423) }, + { AOM_CDF3(31658, 32621) }, + { AOM_CDF3(29630, 31881) } } }, + { { { AOM_CDF3(5717, 26477) }, + { AOM_CDF3(30491, 31703) }, + { AOM_CDF3(31550, 32158) }, + { AOM_CDF3(29648, 31491) } }, + { { AOM_CDF3(12608, 27820) }, + { AOM_CDF3(30680, 32225) }, + { AOM_CDF3(30809, 32335) }, + { AOM_CDF3(31299, 32423) } } }, + { { { AOM_CDF3(1786, 12612) }, + { AOM_CDF3(30663, 31625) }, + { AOM_CDF3(32339, 32468) }, + { AOM_CDF3(31148, 31833) } }, + { { AOM_CDF3(18857, 23865) }, + { AOM_CDF3(31428, 32428) }, + { AOM_CDF3(31744, 32373) }, + { AOM_CDF3(31775, 32526) } } }, + { { { AOM_CDF3(1787, 2532) }, + { AOM_CDF3(30832, 31662) }, + { AOM_CDF3(31824, 32682) }, + { AOM_CDF3(32133, 32569) } }, + { { AOM_CDF3(13751, 22235) }, + { AOM_CDF3(32089, 32409) }, + { AOM_CDF3(27084, 27920) }, + { AOM_CDF3(29291, 32594) } } }, + { { { AOM_CDF3(1725, 3449) }, + { AOM_CDF3(31102, 31935) }, + { AOM_CDF3(32457, 32613) }, + { AOM_CDF3(32412, 32649) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { 
AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(17560, 29888) }, + { AOM_CDF3(29671, 31549) }, + { AOM_CDF3(31007, 32056) }, + { AOM_CDF3(27286, 30006) } }, + { { AOM_CDF3(26594, 31212) }, + { AOM_CDF3(31208, 32582) }, + { AOM_CDF3(31835, 32637) }, + { AOM_CDF3(30595, 32206) } } }, + { { { AOM_CDF3(15239, 29932) }, + { AOM_CDF3(31315, 32095) }, + { AOM_CDF3(32130, 32434) }, + { AOM_CDF3(30864, 31996) } }, + { { AOM_CDF3(26279, 30968) }, + { AOM_CDF3(31142, 32495) }, + { AOM_CDF3(31713, 32540) }, + { AOM_CDF3(31929, 32594) } } }, + { { { AOM_CDF3(2644, 25198) }, + { AOM_CDF3(32038, 32451) }, + { AOM_CDF3(32639, 32695) }, + { AOM_CDF3(32166, 32518) } }, + { { AOM_CDF3(17187, 27668) }, + { AOM_CDF3(31714, 32550) }, + { AOM_CDF3(32283, 32678) }, + { AOM_CDF3(31930, 32563) } } }, + { { { AOM_CDF3(1044, 2257) }, + { AOM_CDF3(30755, 31923) }, + { AOM_CDF3(32208, 32693) }, + { AOM_CDF3(32244, 32615) } }, + { { AOM_CDF3(21317, 26207) }, + { AOM_CDF3(29133, 30868) }, + { AOM_CDF3(29311, 31231) }, + { AOM_CDF3(29657, 31087) } } }, + { { { AOM_CDF3(478, 1834) }, + { AOM_CDF3(31005, 31987) }, + { AOM_CDF3(32317, 32724) }, + { AOM_CDF3(30865, 32648) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(20092, 30774) }, + { AOM_CDF3(30695, 32020) }, + { AOM_CDF3(31131, 32103) }, + { AOM_CDF3(28666, 30870) } }, + { { AOM_CDF3(27258, 31095) }, + { AOM_CDF3(31804, 32623) }, + { AOM_CDF3(31763, 32528) }, + { AOM_CDF3(31438, 32506) } } }, + { { { AOM_CDF3(18049, 30489) }, + { AOM_CDF3(31706, 32286) }, + { AOM_CDF3(32163, 32473) }, + { AOM_CDF3(31550, 32184) } }, + { { AOM_CDF3(27116, 30842) }, + { AOM_CDF3(31971, 32598) }, + { AOM_CDF3(32088, 32576) }, + { AOM_CDF3(32067, 32664) } } }, + { { { AOM_CDF3(12854, 29093) }, + { AOM_CDF3(32272, 32558) }, + { AOM_CDF3(32667, 32729) }, + { AOM_CDF3(32306, 32585) } }, + { { AOM_CDF3(25476, 30366) }, + { 
AOM_CDF3(32169, 32687) }, + { AOM_CDF3(32479, 32689) }, + { AOM_CDF3(31673, 32634) } } }, + { { { AOM_CDF3(2809, 19301) }, + { AOM_CDF3(32205, 32622) }, + { AOM_CDF3(32338, 32730) }, + { AOM_CDF3(31786, 32616) } }, + { { AOM_CDF3(22737, 29105) }, + { AOM_CDF3(30810, 32362) }, + { AOM_CDF3(30014, 32627) }, + { AOM_CDF3(30528, 32574) } } }, + { { { AOM_CDF3(935, 3382) }, + { AOM_CDF3(30789, 31909) }, + { AOM_CDF3(32466, 32756) }, + { AOM_CDF3(30860, 32513) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(22497, 31198) }, + { AOM_CDF3(31715, 32495) }, + { AOM_CDF3(31606, 32337) }, + { AOM_CDF3(30388, 31990) } }, + { { AOM_CDF3(27877, 31584) }, + { AOM_CDF3(32170, 32728) }, + { AOM_CDF3(32155, 32688) }, + { AOM_CDF3(32219, 32702) } } }, + { { { AOM_CDF3(21457, 31043) }, + { AOM_CDF3(31951, 32483) }, + { AOM_CDF3(32153, 32562) }, + { AOM_CDF3(31473, 32215) } }, + { { AOM_CDF3(27558, 31151) }, + { AOM_CDF3(32020, 32640) }, + { AOM_CDF3(32097, 32575) }, + { AOM_CDF3(32242, 32719) } } }, + { { { AOM_CDF3(19980, 30591) }, + { AOM_CDF3(32219, 32597) }, + { AOM_CDF3(32581, 32706) }, + { AOM_CDF3(31803, 32287) } }, + { { AOM_CDF3(26473, 30507) }, + { AOM_CDF3(32431, 32723) }, + { AOM_CDF3(32196, 32611) }, + { AOM_CDF3(31588, 32528) } } }, + { { { AOM_CDF3(24647, 30463) }, + { AOM_CDF3(32412, 32695) }, + { AOM_CDF3(32468, 32720) }, + { AOM_CDF3(31269, 32523) } }, + { { AOM_CDF3(28482, 31505) }, + { AOM_CDF3(32152, 32701) }, + { AOM_CDF3(31732, 32598) }, + { AOM_CDF3(31767, 32712) } } }, + { { { AOM_CDF3(12358, 24977) }, + { AOM_CDF3(31331, 32385) }, + { AOM_CDF3(32634, 32756) }, + { AOM_CDF3(30411, 32548) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } } }; + +#endif // AOM_AV1_COMMON_TOKEN_CDFS_H_ diff --git a/third_party/aom/av1/common/txb_common.c b/third_party/aom/av1/common/txb_common.c new 
file mode 100644 index 0000000000..bf2bc36b04 --- /dev/null +++ b/third_party/aom/av1/common/txb_common.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/txb_common.h" + +// The ctx offset table when TX is TX_CLASS_2D. +// TX col and row indices are clamped to 4 + +const int8_t av1_nz_map_ctx_offset_4x4[16] = { + 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x8[64] = { + 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21, + 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x16[256] = { + 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x32[1024] = { + 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_4x8[32] = { + 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21, + 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x16[128] = { + 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x32[512] = { + 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, + 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x16[512] = { + 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x64[1024] = { + 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 
21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, + 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 
11, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t 
av1_nz_map_ctx_offset_64x32[1024] = { + 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_4x16[64] = { + 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x4[64] = { + 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x32[256] = { + 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t 
av1_nz_map_ctx_offset_32x8[256] = { + 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, + 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t *av1_nz_map_ctx_offset[19] = { + av1_nz_map_ctx_offset_4x4, // TX_4x4 + av1_nz_map_ctx_offset_8x8, // TX_8x8 + av1_nz_map_ctx_offset_16x16, // TX_16x16 + av1_nz_map_ctx_offset_32x32, // TX_32x32 + av1_nz_map_ctx_offset_32x32, // TX_64x64 + av1_nz_map_ctx_offset_4x8, // TX_4x8 + av1_nz_map_ctx_offset_16x4, // TX_8x4 + av1_nz_map_ctx_offset_8x16, // TX_8x16 + av1_nz_map_ctx_offset_32x8, // TX_16x8 + av1_nz_map_ctx_offset_16x32, // TX_16x32 + av1_nz_map_ctx_offset_32x16, // TX_32x16 + av1_nz_map_ctx_offset_32x64, // TX_32x64 + av1_nz_map_ctx_offset_64x32, // TX_64x32 + av1_nz_map_ctx_offset_4x16, // TX_4x16 + av1_nz_map_ctx_offset_16x4, // TX_16x4 + av1_nz_map_ctx_offset_8x32, // TX_8x32 + av1_nz_map_ctx_offset_32x8, // TX_32x8 + av1_nz_map_ctx_offset_32x64, // TX_16x64 + av1_nz_map_ctx_offset_32x16, // TX_64x16 +}; + +const int16_t av1_eob_group_start[12] = { 0, 1, 2, 3, 5, 9, + 17, 
33, 65, 129, 257, 513 }; +const int16_t av1_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h new file mode 100644 index 0000000000..9628090b63 --- /dev/null +++ b/third_party/aom/av1/common/txb_common.h @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_TXB_COMMON_H_ +#define AOM_AV1_COMMON_TXB_COMMON_H_ + +#include "av1/common/av1_common_int.h" + +extern const int16_t av1_eob_group_start[12]; +extern const int16_t av1_eob_offset_bits[12]; + +extern const int8_t *av1_nz_map_ctx_offset[TX_SIZES_ALL]; + +typedef struct txb_ctx { + int txb_skip_ctx; + int dc_sign_ctx; +} TXB_CTX; + +static const int base_level_count_to_index[13] = { + 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, +}; + +static const TX_CLASS tx_type_to_class[TX_TYPES] = { + TX_CLASS_2D, // DCT_DCT + TX_CLASS_2D, // ADST_DCT + TX_CLASS_2D, // DCT_ADST + TX_CLASS_2D, // ADST_ADST + TX_CLASS_2D, // FLIPADST_DCT + TX_CLASS_2D, // DCT_FLIPADST + TX_CLASS_2D, // FLIPADST_FLIPADST + TX_CLASS_2D, // ADST_FLIPADST + TX_CLASS_2D, // FLIPADST_ADST + TX_CLASS_2D, // IDTX + TX_CLASS_VERT, // V_DCT + TX_CLASS_HORIZ, // H_DCT + TX_CLASS_VERT, // V_ADST + TX_CLASS_HORIZ, // H_ADST + TX_CLASS_VERT, // V_FLIPADST + TX_CLASS_HORIZ, // H_FLIPADST +}; + +static INLINE int get_txb_bhl(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_high_log2[tx_size]; +} + +static 
INLINE int get_txb_wide(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_wide[tx_size]; +} + +static INLINE int get_txb_high(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_high[tx_size]; +} + +static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int height) { + return levels_buf + TX_PAD_TOP * (height + TX_PAD_HOR); +} + +static INLINE int get_padded_idx(const int idx, const int bhl) { + return idx + ((idx >> bhl) << TX_PAD_HOR_LOG2); +} + +static INLINE int get_br_ctx_2d(const uint8_t *const levels, + const int c, // raster order + const int bhl) { + assert(c > 0); + const int col = c >> bhl; + const int row = c - (col << bhl); + const int stride = (1 << bhl) + TX_PAD_HOR; + const int pos = col * stride + row; + int mag = AOMMIN(levels[pos + 1], MAX_BASE_BR_RANGE) + + AOMMIN(levels[pos + stride], MAX_BASE_BR_RANGE) + + AOMMIN(levels[pos + 1 + stride], MAX_BASE_BR_RANGE); + mag = AOMMIN((mag + 1) >> 1, 6); + //((row | col) < 2) is equivalent to ((row < 2) && (col < 2)) + if ((row | col) < 2) return mag + 7; + return mag + 14; +} + +static AOM_FORCE_INLINE int get_br_ctx_eob(const int c, // raster order + const int bhl, + const TX_CLASS tx_class) { + const int col = c >> bhl; + const int row = c - (col << bhl); + if (c == 0) return 0; + if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) || + (tx_class == TX_CLASS_HORIZ && col == 0) || + (tx_class == TX_CLASS_VERT && row == 0)) + return 7; + return 14; +} + +static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels, + const int c, // raster order + const int bhl, const TX_CLASS tx_class) { + const int col = c >> bhl; + const int row = c - (col << bhl); + const int stride = (1 << bhl) + TX_PAD_HOR; + const int pos = col * stride + row; + int mag = levels[pos + 1]; + mag += levels[pos + stride]; + switch (tx_class) { + case TX_CLASS_2D: + mag += levels[pos + stride + 1]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return 
mag; + if ((row < 2) && (col < 2)) return mag + 7; + break; + case TX_CLASS_HORIZ: + mag += levels[pos + (stride << 1)]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if (col == 0) return mag + 7; + break; + case TX_CLASS_VERT: + mag += levels[pos + 2]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if (row == 0) return mag + 7; + break; + default: break; + } + + return mag + 14; +} + +static const uint8_t clip_max3[256] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +}; + +static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels, + const int bhl, const TX_CLASS tx_class) { + int mag; + + // Note: AOMMIN(level, 3) is useless for decoder since level < 3. 
+ mag = clip_max3[levels[(1 << bhl) + TX_PAD_HOR]]; // { 0, 1 } + mag += clip_max3[levels[1]]; // { 1, 0 } + + if (tx_class == TX_CLASS_2D) { + mag += clip_max3[levels[(1 << bhl) + TX_PAD_HOR + 1]]; // { 1, 1 } + mag += clip_max3[levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]]; // { 0, 2 } + mag += clip_max3[levels[2]]; // { 2, 0 } + } else if (tx_class == TX_CLASS_VERT) { + mag += clip_max3[levels[2]]; // { 2, 0 } + mag += clip_max3[levels[3]]; // { 3, 0 } + mag += clip_max3[levels[4]]; // { 4, 0 } + } else { + mag += clip_max3[levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]]; // { 0, 2 } + mag += clip_max3[levels[(3 << bhl) + (3 << TX_PAD_HOR_LOG2)]]; // { 0, 3 } + mag += clip_max3[levels[(4 << bhl) + (4 << TX_PAD_HOR_LOG2)]]; // { 0, 4 } + } + + return mag; +} + +#define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D +#define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5) +#define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10) + +static const int nz_map_ctx_offset_1d[32] = { + NZ_MAP_CTX_0, NZ_MAP_CTX_5, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, +}; + +static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats( + const int stats, + const int coeff_idx, // raster order + const int bhl, const TX_SIZE tx_size, const TX_CLASS tx_class) { + // tx_class == 0(TX_CLASS_2D) + if ((tx_class | coeff_idx) == 0) return 0; + int ctx = (stats + 1) >> 1; + ctx = AOMMIN(ctx, 4); + switch (tx_class) { + case TX_CLASS_2D: { + // This is the algorithm to generate av1_nz_map_ctx_offset[][] + // const int width = tx_size_wide[tx_size]; + // const int height = tx_size_high[tx_size]; + // if (width < height) { + // if (row < 
2) return 11 + ctx; + // } else if (width > height) { + // if (col < 2) return 16 + ctx; + // } + // if (row + col < 2) return ctx + 1; + // if (row + col < 4) return 5 + ctx + 1; + // return 21 + ctx; + return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; + } + case TX_CLASS_HORIZ: { + const int col = coeff_idx >> bhl; + return ctx + nz_map_ctx_offset_1d[col]; + } + case TX_CLASS_VERT: { + const int col = coeff_idx >> bhl; + const int row = coeff_idx - (col << bhl); + return ctx + nz_map_ctx_offset_1d[row]; + } + default: break; + } + return 0; +} + +typedef aom_cdf_prob (*base_cdf_arr)[CDF_SIZE(4)]; +typedef aom_cdf_prob (*br_cdf_arr)[CDF_SIZE(BR_CDF_SIZE)]; + +static INLINE int get_lower_levels_ctx_eob(int bhl, int width, int scan_idx) { + if (scan_idx == 0) return 0; + if (scan_idx <= (width << bhl) / 8) return 1; + if (scan_idx <= (width << bhl) / 4) return 2; + return 3; +} + +static INLINE int get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx, + int bhl, TX_SIZE tx_size) { + assert(coeff_idx > 0); + int mag; + // Note: AOMMIN(level, 3) is useless for decoder since level < 3. 
+ levels = levels + get_padded_idx(coeff_idx, bhl); + mag = AOMMIN(levels[(1 << bhl) + TX_PAD_HOR], 3); // { 0, 1 } + mag += AOMMIN(levels[1], 3); // { 1, 0 } + mag += AOMMIN(levels[(1 << bhl) + TX_PAD_HOR + 1], 3); // { 1, 1 } + mag += AOMMIN(levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)], 3); // { 0, 2 } + mag += AOMMIN(levels[2], 3); // { 2, 0 } + + const int ctx = AOMMIN((mag + 1) >> 1, 4); + return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; +} +static AOM_FORCE_INLINE int get_lower_levels_ctx(const uint8_t *levels, + int coeff_idx, int bhl, + TX_SIZE tx_size, + TX_CLASS tx_class) { + const int stats = + get_nz_mag(levels + get_padded_idx(coeff_idx, bhl), bhl, tx_class); + return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class); +} + +static INLINE int get_lower_levels_ctx_general(int is_last, int scan_idx, + int bhl, int width, + const uint8_t *levels, + int coeff_idx, TX_SIZE tx_size, + TX_CLASS tx_class) { + if (is_last) { + if (scan_idx == 0) return 0; + if (scan_idx <= (width << bhl) >> 3) return 1; + if (scan_idx <= (width << bhl) >> 2) return 2; + return 3; + } + return get_lower_levels_ctx(levels, coeff_idx, bhl, tx_size, tx_class); +} + +static INLINE void set_dc_sign(int *cul_level, int dc_val) { + if (dc_val < 0) + *cul_level |= 1 << COEFF_CONTEXT_BITS; + else if (dc_val > 0) + *cul_level += 2 << COEFF_CONTEXT_BITS; +} + +static void get_txb_ctx_general(const BLOCK_SIZE plane_bsize, + const TX_SIZE tx_size, const int plane, + const ENTROPY_CONTEXT *const a, + const ENTROPY_CONTEXT *const l, + TXB_CTX *const txb_ctx) { +#define MAX_TX_SIZE_UNIT 16 + static const int8_t signs[3] = { 0, -1, 1 }; + static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + }; + const int txb_w_unit = tx_size_wide_unit[tx_size]; + const int 
txb_h_unit = tx_size_high_unit[tx_size]; + int dc_sign = 0; + int k = 0; + + do { + const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; + assert(sign <= 2); + dc_sign += signs[sign]; + } while (++k < txb_w_unit); + + k = 0; + do { + const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; + assert(sign <= 2); + dc_sign += signs[sign]; + } while (++k < txb_h_unit); + + txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; + + if (plane == 0) { + if (plane_bsize == txsize_to_bsize[tx_size]) { + txb_ctx->txb_skip_ctx = 0; + } else { + // This is the algorithm to generate table skip_contexts[top][left]. + // const int max = AOMMIN(top | left, 4); + // const int min = AOMMIN(AOMMIN(top, left), 4); + // if (!max) + // txb_skip_ctx = 1; + // else if (!min) + // txb_skip_ctx = 2 + (max > 3); + // else if (max <= 3) + // txb_skip_ctx = 4; + // else if (min <= 3) + // txb_skip_ctx = 5; + // else + // txb_skip_ctx = 6; + static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 3, 5, 5, 5, 6 } }; + // For top and left, we only care about which of the following three + // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The + // spec calculates top and left with the Max() function. We can calculate + // an approximate max with bitwise OR because the real max and the + // approximate max belong to the same category. + int top = 0; + int left = 0; + + k = 0; + do { + top |= a[k]; + } while (++k < txb_w_unit); + top &= COEFF_CONTEXT_MASK; + top = AOMMIN(top, 4); + + k = 0; + do { + left |= l[k]; + } while (++k < txb_h_unit); + left &= COEFF_CONTEXT_MASK; + left = AOMMIN(left, 4); + + txb_ctx->txb_skip_ctx = skip_contexts[top][left]; + } + } else { + const int ctx_base = get_entropy_context(tx_size, a, l); + const int ctx_offset = (num_pels_log2_lookup[plane_bsize] > + num_pels_log2_lookup[txsize_to_bsize[tx_size]]) + ? 
10 + : 7; + txb_ctx->txb_skip_ctx = ctx_base + ctx_offset; + } +} + +#define SPECIALIZE_GET_TXB_CTX(w, h) \ + static void get_txb_ctx_##w##x##h( \ + const BLOCK_SIZE plane_bsize, const int plane, \ + const ENTROPY_CONTEXT *const a, const ENTROPY_CONTEXT *const l, \ + TXB_CTX *const txb_ctx) { \ + static const int8_t signs[3] = { 0, -1, 1 }; \ + static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { \ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, \ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 \ + }; \ + const TX_SIZE tx_size = TX_##w##X##h; \ + const int txb_w_unit = tx_size_wide_unit[tx_size]; \ + const int txb_h_unit = tx_size_high_unit[tx_size]; \ + int dc_sign = 0; \ + int k = 0; \ + \ + do { \ + const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; \ + assert(sign <= 2); \ + dc_sign += signs[sign]; \ + } while (++k < txb_w_unit); \ + \ + k = 0; \ + do { \ + const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; \ + assert(sign <= 2); \ + dc_sign += signs[sign]; \ + } while (++k < txb_h_unit); \ + \ + txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; \ + \ + if (plane == 0) { \ + if (plane_bsize == txsize_to_bsize[tx_size]) { \ + txb_ctx->txb_skip_ctx = 0; \ + } else { \ + static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, \ + { 2, 4, 4, 4, 5 }, \ + { 2, 4, 4, 4, 5 }, \ + { 2, 4, 4, 4, 5 }, \ + { 3, 5, 5, 5, 6 } }; \ + int top = 0; \ + int left = 0; \ + \ + k = 0; \ + do { \ + top |= a[k]; \ + } while (++k < txb_w_unit); \ + top &= COEFF_CONTEXT_MASK; \ + top = AOMMIN(top, 4); \ + \ + k = 0; \ + do { \ + left |= l[k]; \ + } while (++k < txb_h_unit); \ + left &= COEFF_CONTEXT_MASK; \ + left = AOMMIN(left, 4); \ + \ + txb_ctx->txb_skip_ctx = skip_contexts[top][left]; \ + } \ + } else { \ + const int ctx_base = get_entropy_context(tx_size, a, l); \ + const int ctx_offset = 
(num_pels_log2_lookup[plane_bsize] > \ + num_pels_log2_lookup[txsize_to_bsize[tx_size]]) \ + ? 10 \ + : 7; \ + txb_ctx->txb_skip_ctx = ctx_base + ctx_offset; \ + } \ + } + +SPECIALIZE_GET_TXB_CTX(4, 4) +SPECIALIZE_GET_TXB_CTX(8, 8) +SPECIALIZE_GET_TXB_CTX(16, 16) +SPECIALIZE_GET_TXB_CTX(32, 32) + +// Wrapper for get_txb_ctx that calls the specialized version of get_txb_ctx_* +// so that the compiler can compile away the while loops. +static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, + const TX_SIZE tx_size, const int plane, + const ENTROPY_CONTEXT *const a, + const ENTROPY_CONTEXT *const l, + TXB_CTX *const txb_ctx) { + switch (tx_size) { + case TX_4X4: get_txb_ctx_4x4(plane_bsize, plane, a, l, txb_ctx); break; + case TX_8X8: get_txb_ctx_8x8(plane_bsize, plane, a, l, txb_ctx); break; + case TX_16X16: get_txb_ctx_16x16(plane_bsize, plane, a, l, txb_ctx); break; + case TX_32X32: get_txb_ctx_32x32(plane_bsize, plane, a, l, txb_ctx); break; + default: + get_txb_ctx_general(plane_bsize, tx_size, plane, a, l, txb_ctx); + break; + } +} +#undef MAX_TX_SIZE_UNIT + +#endif // AOM_AV1_COMMON_TXB_COMMON_H_ diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c new file mode 100644 index 0000000000..4282b92bfa --- /dev/null +++ b/third_party/aom/av1/common/warped_motion.c @@ -0,0 +1,918 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/warped_motion.h" +#include "av1/common/scale.h" + +// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels +// at a time. The zoom/rotation/shear in the model are applied to the +// "fractional" position of each pixel, which therefore varies within +// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS. +// We need an extra 2 taps to fit this in, for a total of 8 taps. +/* clang-format off */ +const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = { + // [-1, 0) + { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 }, + { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 }, + { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 }, + { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 }, + { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 }, + { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 }, + { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 }, + { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 }, + { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 }, + { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 }, + { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 }, + { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 }, + { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 }, + { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 }, + { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 }, + { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 }, + { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 }, + { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 }, + { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 }, + { 4, -16, 63, 91, 
-18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 }, + { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 }, + { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 }, + { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 }, + { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 }, + { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 }, + { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 }, + { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 }, + { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 }, + { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 }, + { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 }, + { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 }, + { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 }, + + // [0, 1) + { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0}, + { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0}, + { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1}, + {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1}, + {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1}, + {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2}, + {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2}, + {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2}, + {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2}, + {-2, 8, -21, 84, 74, -21, 8, -2}, 
{-2, 8, -22, 82, 77, -21, 8, -2}, + {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2}, + {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2}, + {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2}, + {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2}, + {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2}, + {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1}, + {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2}, + {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1}, + {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1}, + { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0}, + { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0}, + + // [1, 2) + { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 }, + { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 }, + { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 }, + { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 }, + { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 }, + { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 }, + { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 }, + { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 }, + { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 }, + { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 }, + { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 }, + { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 
98, 55, -15, 4 }, + { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 }, + { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 }, + { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 }, + { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 }, + { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 }, + { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 }, + { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 }, + { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 }, + { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 }, + { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 }, + { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 }, + { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 }, + { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 }, + { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 }, + { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 }, + { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 }, + { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 }, + { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 }, + { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 }, + { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 }, + // dummy (replicate row index 191) + { 0, 0, 0, 0, 2, 127, - 1, 0 }, +}; + +/* clang-format on */ + +#define DIV_LUT_PREC_BITS 14 +#define DIV_LUT_BITS 8 +#define DIV_LUT_NUM (1 << DIV_LUT_BITS) + +static const uint16_t div_lut[DIV_LUT_NUM + 1] = { + 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, + 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, + 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, + 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 
14028, + 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, + 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, + 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, + 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, + 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, + 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, + 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, + 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, + 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, + 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, + 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, + 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, + 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, + 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, + 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, + 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, + 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, + 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, + 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, + 8240, 8224, 8208, 8192, +}; + +// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned +// at precision of DIV_LUT_PREC_BITS along with the shift. +static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) { + int64_t f; + *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 + : get_msb((unsigned int)D)); + // e is obtained from D after resetting the most significant 1 bit. 
+ const int64_t e = D - ((uint64_t)1 << *shift); + // Get the most significant DIV_LUT_BITS (8) bits of e into f + if (*shift > DIV_LUT_BITS) + f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS); + else + f = e << (DIV_LUT_BITS - *shift); + assert(f <= DIV_LUT_NUM); + *shift += DIV_LUT_PREC_BITS; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} + +static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) { + int32_t f; + *shift = get_msb(D); + // e is obtained from D after resetting the most significant 1 bit. + const int32_t e = D - ((uint32_t)1 << *shift); + // Get the most significant DIV_LUT_BITS (8) bits of e into f + if (*shift > DIV_LUT_BITS) + f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS); + else + f = e << (DIV_LUT_BITS - *shift); + assert(f <= DIV_LUT_NUM); + *shift += DIV_LUT_PREC_BITS; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} + +static int is_affine_valid(const WarpedMotionParams *const wm) { + const int32_t *mat = wm->wmmat; + return (mat[2] > 0); +} + +static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) || + (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS))) + return 0; + else + return 1; +} + +#ifndef NDEBUG +// Check that the given warp model satisfies the relevant constraints for +// its stated model type +static void check_model_consistency(WarpedMotionParams *wm) { + switch (wm->wmtype) { + case IDENTITY: + assert(wm->wmmat[0] == 0); + assert(wm->wmmat[1] == 0); + AOM_FALLTHROUGH_INTENDED; + case TRANSLATION: + assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS); + assert(wm->wmmat[3] == 0); + AOM_FALLTHROUGH_INTENDED; + case ROTZOOM: + assert(wm->wmmat[4] == -wm->wmmat[3]); + assert(wm->wmmat[5] == wm->wmmat[2]); + AOM_FALLTHROUGH_INTENDED; + case AFFINE: break; + default: assert(0 && "Bad wmtype"); + } +} +#endif // NDEBUG + 
+// Returns 1 on success or 0 on an invalid affine set +int av1_get_shear_params(WarpedMotionParams *wm) { +#ifndef NDEBUG + // Check that models have been constructed sensibly + // This is a good place to check, because this function does not need to + // be called until after model construction is complete, but must be called + // before the model can be used for prediction. + check_model_consistency(wm); +#endif // NDEBUG + + const int32_t *mat = wm->wmmat; + if (!is_affine_valid(wm)) return 0; + + wm->alpha = + clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX); + wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX); + int16_t shift; + int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1); + int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y; + wm->gamma = + clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX); + v = ((int64_t)mat[3] * mat[4]) * y; + wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) - + (1 << WARPEDMODEL_PREC_BITS), + INT16_MIN, INT16_MAX); + + wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + + if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta)) + return 0; + + return 1; +} + +#if CONFIG_AV1_HIGHBITDEPTH +/* Note: For an explanation of the warp algorithm, and some notes on bit widths + for hardware implementations, see the comments above av1_warp_affine_c +*/ +void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, uint16_t *pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int 
subsampling_x, + int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int32_t tmp[15 * 8]; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + for (int i = p_row; i < p_row + p_height; i += 8) { + for (int j = p_col; j < p_col + p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4) << subsampling_x; + const int32_t src_y = (i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); 
+ + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + for (int k = -7; k < 8; ++k) { + const int iy = clamp(iy4 + k, 0, height - 1); + + int sx = sx4 + beta * (k + 4); + for (int l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (int m = 0; m < 8; ++m) { + const int sample_x = clamp(ix + m, 0, width - 1); + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); + assert(0 <= sum && sum < (1 << max_bits_horiz)); + tmp[(k + 7) * 8 + (l + 4)] = sum; + sx += alpha; + } + } + + // Vertical filter + for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { + const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_vert; + for (int m = 0; m < 8; ++m) { + sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; + } + + if (conv_params->is_compound) { + CONV_BUF_TYPE *p = + &conv_params + ->dst[(i - p_row + k + 4) * conv_params->dst_stride + + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + if (conv_params->do_average) { + uint16_t *dst16 = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + int32_t tmp32 = *p; + if (conv_params->use_dist_wtd_comp_avg) { + tmp32 = tmp32 * conv_params->fwd_offset + + sum * conv_params->bck_offset; + tmp32 = tmp32 >> DIST_PRECISION_BITS; + } else { + tmp32 += sum; + tmp32 = tmp32 >> 1; + } + tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) - + (1 << 
(offset_bits - conv_params->round_1 - 1)); + *dst16 = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd); + } else { + *p = sum; + } + } else { + uint16_t *p = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + assert(0 <= sum && sum < (1 << (bd + 2))); + *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd); + } + sy += gamma; + } + } + } + } +} + +void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params) { + const int32_t *const mat = wm->wmmat; + const int16_t alpha = wm->alpha; + const int16_t beta = wm->beta; + const int16_t gamma = wm->gamma; + const int16_t delta = wm->delta; + + av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +/* The warp filter for ROTZOOM and AFFINE models works as follows: + * Split the input into 8x8 blocks + * For each block, project the point (4, 4) within the block, to get the + overall block position. Split into integer and fractional coordinates, + maintaining full WARPEDMODEL precision + * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a + variable horizontal offset. This means that, while the rows of the + intermediate buffer align with the rows of the *reference* image, the + columns align with the columns of the *destination* image. + * Filter vertically: Generate the output block (up to 8x8 pixels, but if the + destination is too small we crop the output at this stage). Each pixel has + a variable vertical offset, so that the resulting rows are aligned with + the rows of the destination image. 
+ + To accomplish these alignments, we factor the warp matrix as a + product of two shear / asymmetric zoom matrices: + / a b \ = / 1 0 \ * / 1+alpha beta \ + \ c d / \ gamma 1+delta / \ 0 1 / + where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively. + The horizontal shear (with alpha and beta) is applied first, + then the vertical shear (with gamma and delta) is applied second. + + The only limitation is that, to fit this in a fixed 8-tap filter size, + the fractional pixel offsets must be at most +-1. Since the horizontal filter + generates 15 rows of 8 columns, and the initial point we project is at (4, 4) + within the block, the parameters must satisfy + 4 * |alpha| + 7 * |beta| < 1 and 4 * |gamma| + 4 * |delta| < 1 + for this filter to be applicable. + + Note: This function assumes that the caller has done all of the relevant + checks, i.e. that we have a ROTZOOM or AFFINE model, that wmmat[4] and wmmat[5] + are set appropriately (if using a ROTZOOM model), and that alpha, beta, + gamma, delta are all in range. + + TODO(rachelbarker): Maybe support scaled references? +*/ +/* A note on hardware implementation: + The warp filter is intended to be implementable using the same hardware as + the high-precision convolve filters from the loop-restoration and + convolve-round experiments. + + For a single filter stage, considering all of the coefficient sets for the + warp filter and the regular convolution filter, an input in the range + [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)] + before rounding. + + Allowing for some changes to the filter coefficient sets, call the range + [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k, + we can replace this by the range [0, 256 * 2^k], which can be stored in an + unsigned value with 8 + k bits. 
+ + This allows the derivation of the appropriate bit widths and offsets for + the various intermediate values: If + + F := FILTER_BITS = 7 (or else the above ranges need adjusting) + So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit + intermediate value. + H := ROUND0_BITS + V := VERSHEAR_REDUCE_PREC_BITS + (and note that we must have H + V = 2*F for the output to have the same + scale as the input) + + then we end up with the following offsets and ranges: + Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a + uint{bd + F + 1} + After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}. + Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a + uint{bd + 2*F + 2 - H} + After rounding: The final value, before undoing the offset, fits into a + uint{bd + 2}. + + Then we need to undo the offsets before clamping to a pixel. Note that, + if we do this at the end, the amount to subtract is actually independent + of H and V: + + offset to subtract = (1 << ((bd + F - 1) - H + F - V)) + + (1 << ((bd + 2*F - H) - V)) + == (1 << (bd - 1)) + (1 << bd) + + This allows us to entirely avoid clamping in both the warp filter and + the convolve-round experiment. As of the time of writing, the Wiener filter + from loop-restoration can encode a central coefficient up to 216, which + leads to a maximum value of about 282 * 2^k after applying the offset. + So in that case we still need to clamp. +*/ +void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta) { + int32_t tmp[15 * 8]; + const int bd = 8; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? 
conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + for (int i = p_row; i < p_row + p_height; i += 8) { + for (int j = p_col; j < p_col + p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4) << subsampling_x; + const int32_t src_y = (i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + for (int k = -7; k < 8; ++k) { + // Clamp to top/bottom edge of the frame + const int iy = clamp(iy4 + k, 0, height - 1); + + int sx = sx4 + beta * (k + 4); + + for (int l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + // At this point, sx = sx4 + alpha * l + beta * k + const int 
offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (int m = 0; m < 8; ++m) { + // Clamp to left/right edge of the frame + const int sample_x = clamp(ix + m, 0, width - 1); + + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); + assert(0 <= sum && sum < (1 << max_bits_horiz)); + tmp[(k + 7) * 8 + (l + 4)] = sum; + sx += alpha; + } + } + + // Vertical filter + for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { + // At this point, sy = sy4 + gamma * l + delta * k + const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_vert; + for (int m = 0; m < 8; ++m) { + sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; + } + + if (conv_params->is_compound) { + CONV_BUF_TYPE *p = + &conv_params + ->dst[(i - p_row + k + 4) * conv_params->dst_stride + + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + if (conv_params->do_average) { + uint8_t *dst8 = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + int32_t tmp32 = *p; + if (conv_params->use_dist_wtd_comp_avg) { + tmp32 = tmp32 * conv_params->fwd_offset + + sum * conv_params->bck_offset; + tmp32 = tmp32 >> DIST_PRECISION_BITS; + } else { + tmp32 += sum; + tmp32 = tmp32 >> 1; + } + tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1)); + *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits)); + } else { + *p = sum; + } + } else { + uint8_t *p = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 
4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + assert(0 <= sum && sum < (1 << (bd + 2))); + *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd)); + } + sy += gamma; + } + } + } + } +} + +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params) { + const int32_t *const mat = wm->wmmat; + const int16_t alpha = wm->alpha; + const int16_t beta = wm->beta; + const int16_t gamma = wm->gamma; + const int16_t delta = wm->delta; + av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, conv_params, + alpha, beta, gamma, delta); +} + +void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int width, int height, int stride, + uint8_t *pred, int p_col, int p_row, int p_width, + int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) + highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride, + CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, bd, + conv_params); + else + warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, conv_params); +#else + (void)use_hbd; + (void)bd; + warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, conv_params); +#endif +} + +#define LS_MV_MAX 256 // max mv in 1/8-pel +// Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By. 
+#define LS_STEP 8 + +// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8, +// the precision needed is: +// (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] + +// (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] + +// 1 [for sign] + +// LEAST_SQUARES_SAMPLES_MAX_BITS +// [for adding up to LEAST_SQUARES_SAMPLES_MAX samples] +// The value is 23 +#define LS_MAT_RANGE_BITS \ + ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS) + +// Bit-depth reduction from the full-range +#define LS_MAT_DOWN_BITS 2 + +// bits range of A, Bx and By after downshifting +#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS) +#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1))) +#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1) + +// By setting LS_STEP = 8, the least significant 2 bits of every element in A, +// Bx, By are 0. So, we can reduce LS_MAT_RANGE_BITS(2) bits here. +#define LS_SQUARE(a) \ + (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \ + (2 + LS_MAT_DOWN_BITS)) +#define LS_PRODUCT1(a, b) \ + (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \ + (2 + LS_MAT_DOWN_BITS)) +#define LS_PRODUCT2(a, b) \ + (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \ + (2 + LS_MAT_DOWN_BITS)) + +#define USE_LIMITED_PREC_MULT 0 + +#if USE_LIMITED_PREC_MULT + +#define MUL_PREC_BITS 16 +static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) { + int msb = 0; + uint16_t mult = 0; + *shift = 0; + if (D != 0) { + msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 + : get_msb((unsigned int)D)); + if (msb >= MUL_PREC_BITS) { + mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS); + *shift = msb + 1 - MUL_PREC_BITS; + } else { + mult = (uint16_t)D; + *shift = 0; + } + } + return mult; +} + +// Limited-precision variant: reduces |Px| to a MUL_PREC_BITS-bit multiplier, +// multiplies it by iDet with the sign of Px, applies the remaining shift and +// clamps the result to the non-diagonal affine parameter range. Both branches +// of the final if/else return, so no trailing return is needed. +static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) { + int16_t mshift; + uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift); + int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? 
-1 : 1); + shift -= mshift; + if (shift > 0) { + return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } else { + return (int32_t)clamp(v * (1 << (-shift)), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } +} + +static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) { + int16_t mshift; + uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift); + int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1); + shift -= mshift; + if (shift > 0) { + return (int32_t)clamp( + ROUND_POWER_OF_TWO_SIGNED(v, shift), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } else { + return (int32_t)clamp( + v * (1 << (-shift)), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } +} + +#else + +static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) { + int64_t v = Px * (int64_t)iDet; + return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); +} + +static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) { + int64_t v = Px * (int64_t)iDet; + return (int32_t)clamp64( + ROUND_POWER_OF_TWO_SIGNED_64(v, shift), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); +} +#endif // USE_LIMITED_PREC_MULT + +static int find_affine_int(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm, int mi_row, int mi_col) { + int32_t A[2][2] = { { 0, 0 }, { 0, 0 } }; + int32_t Bx[2] = { 0, 0 }; + int32_t By[2] = { 0, 0 }; + + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int 
rsuy = bh / 2 - 1; + const int rsux = bw / 2 - 1; + const int suy = rsuy * 8; + const int sux = rsux * 8; + const int duy = suy + mvy; + const int dux = sux + mvx; + + // Assume the center pixel of the block has exactly the same motion vector + // as transmitted for the block. First shift the origin of the source + // points to the block center, and the origin of the destination points to + // the block center added to the motion vector transmitted. + // Let (xi, yi) denote the source points and (xi', yi') denote destination + // points after origin shifting, for i = 0, 1, 2, ..., n-1. + // Then if P = [x0, y0, + // x1, y1, + // x2, y2, + // .... + // ] + // q = [x0', x1', x2', ... ]' + // r = [y0', y1', y2', ... ]' + // the least squares problems that need to be solved are: + // [h1, h2]' = inv(P'P)P'q and + // [h3, h4]' = inv(P'P)P'r + // where the affine transformation is given by: + // x' = h1.x + h2.y + // y' = h3.x + h4.y + // + // The loop below computes: A = P'P, Bx = P'q, By = P'r + // We need to just compute inv(A).Bx and inv(A).By for the solutions. + // Contribution from neighbor block + for (int i = 0; i < np; i++) { + const int dx = pts2[i * 2] - dux; + const int dy = pts2[i * 2 + 1] - duy; + const int sx = pts1[i * 2] - sux; + const int sy = pts1[i * 2 + 1] - suy; + // (TODO)yunqing: This comparison wouldn't be necessary if the sample + // selection is done in find_samples(). Also, global offset can be removed + // while collecting samples. + if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) { + A[0][0] += LS_SQUARE(sx); + A[0][1] += LS_PRODUCT1(sx, sy); + A[1][1] += LS_SQUARE(sy); + Bx[0] += LS_PRODUCT2(sx, dx); + Bx[1] += LS_PRODUCT1(sy, dx); + By[0] += LS_PRODUCT1(sx, dy); + By[1] += LS_PRODUCT2(sy, dy); + } + } + + // Just for debugging, and can be removed later. 
+ assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX); + assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX); + assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX); + assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX); + assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX); + assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX); + assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX); + + // Compute Determinant of A + const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1]; + if (Det == 0) return 1; + + int16_t shift; + int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1); + shift -= WARPEDMODEL_PREC_BITS; + if (shift < 0) { + iDet <<= (-shift); + shift = 0; + } + + int64_t Px[2], Py[2]; + // These divided by the Det, are the least squares solutions + Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1]; + Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1]; + Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1]; + Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1]; + + wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift); + wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift); + wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift); + wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift); + + const int isuy = (mi_row * MI_SIZE + rsuy); + const int isux = (mi_col * MI_SIZE + rsux); + // Note: In the vx, vy expressions below, the max value of each of the + // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room + // for the first term so that the overall sum in the worst case fits + // within 32 bits overall. 
+ const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) + + isuy * wm->wmmat[3]); + const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * wm->wmmat[4] + + isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS))); + wm->wmmat[0] = + clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); + wm->wmmat[1] = + clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); + return 0; +} + +int av1_find_projection(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm_params, int mi_row, int mi_col) { + assert(wm_params->wmtype == AFFINE); + + if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row, + mi_col)) + return 1; + + // check compatibility with the fast warp filter + if (!av1_get_shear_params(wm_params)) return 1; + + return 0; +} diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h new file mode 100644 index 0000000000..d772df8873 --- /dev/null +++ b/third_party/aom/av1/common/warped_motion.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_ +#define AOM_AV1_COMMON_WARPED_MOTION_H_ + +#include +#include +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "av1/common/mv.h" +#include "av1/common/convolve.h" + +#define LEAST_SQUARES_SAMPLES_MAX_BITS 3 +#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS) +#define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2) +#define WARPED_MOTION_DEBUG 0 +#define DEFAULT_WMTYPE AFFINE +#define WARP_ERROR_BLOCK_LOG 5 +#define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG) + +extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; + +DECLARE_ALIGNED(8, extern const int8_t, + av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]); + +static const uint8_t warp_pad_left[14][16] = { + { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 }, + { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 }, + { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 }, + { 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 }, + { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 }, + { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 }, +}; + +static const uint8_t warp_pad_right[14][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 }, + { 0, 1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 }, + { 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 }, + { 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, + { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }, + { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }, + { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, + { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } +}; + +void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params); + +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params); + +void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int width, int height, int stride, + uint8_t *pred, int p_col, int p_row, int p_width, + int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params); + +int av1_find_projection(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm_params, int mi_row, int mi_col); + +int av1_get_shear_params(WarpedMotionParams *wm); +#endif // AOM_AV1_COMMON_WARPED_MOTION_H_ diff --git a/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c new file mode 100644 index 0000000000..8aa14696f6 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2018, Alliance for Open 
Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "av1/common/resize.h" +#include "aom_dsp/x86/synonyms.h" + +// Note: If the crop width is not a multiple of 4, then, unlike the C version, +// this function will overwrite some of the padding on the right hand side of +// the frame. This padding appears to be trashed anyway, so this should not +// affect the running of the decoder. +void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const int16_t *x_filters, int x0_qn, + int x_step_qn) { + assert(UPSCALE_NORMATIVE_TAPS == 8); + + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + + const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + const uint8_t *src_y; + uint8_t *dst_y; + int x_qn = x0_qn; + for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { + const int x_filter_idx0 = + ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx1 = + ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx2 = + ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx3 = + ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + + assert(x_filter_idx0 <= RS_SUBPEL_MASK); + assert(x_filter_idx1 <= RS_SUBPEL_MASK); + assert(x_filter_idx2 <= RS_SUBPEL_MASK); + assert(x_filter_idx3 <= 
RS_SUBPEL_MASK); + + const int16_t *const x_filter0 = + &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter1 = + &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter2 = + &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter3 = + &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; + + const __m128i fil0_16 = xx_loadu_128(x_filter0); + const __m128i fil1_16 = xx_loadu_128(x_filter1); + const __m128i fil2_16 = xx_loadu_128(x_filter2); + const __m128i fil3_16 = xx_loadu_128(x_filter3); + + src_y = src; + dst_y = dst; + for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { + const uint8_t *const src_x0 = + &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint8_t *const src_x1 = + &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint8_t *const src_x2 = + &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint8_t *const src_x3 = + &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + + // Load up the source data. This is 8-bit input data, so each load + // gets 8 pixels. + const __m128i src0_8 = xx_loadl_64(src_x0); + const __m128i src1_8 = xx_loadl_64(src_x1); + const __m128i src2_8 = xx_loadl_64(src_x2); + const __m128i src3_8 = xx_loadl_64(src_x3); + + // Now zero-extend up to 16-bit precision, i.e. + // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ] + const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8); + const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8); + const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8); + const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8); + + // Multiply by filter coefficients (results in a 32-bit value), + // and add adjacent pairs, i.e. 
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) + // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] + const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); + const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); + const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); + const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); + + // Reduce horizontally and add, i.e. + // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] + const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); + const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); + + const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); + + // Divide down by (1 << FILTER_BITS), rounding to nearest. + const __m128i shifted_32 = + _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); + + // Pack 32-bit values into 16-bit values, i.e. + // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] + const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); + + // Pack 16-bit values into 8-bit values, i.e. + // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ]) + // -> [ 0 0 0 0 0 0 DC BA ] + const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero); + + // Write to the output + xx_storel_32(&dst_y[x], shifted_8); + } + } +} + +// Note: If the crop width is not a multiple of 4, then, unlike the C version, +// this function will overwrite some of the padding on the right hand side of +// the frame. This padding appears to be trashed anyway, so this should not +// affect the running of the decoder. 
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filters, + int x0_qn, int x_step_qn, int bd) { + assert(UPSCALE_NORMATIVE_TAPS == 8); + assert(bd == 8 || bd == 10 || bd == 12); + + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + + const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1); + + const uint16_t *src_y; + uint16_t *dst_y; + int x_qn = x0_qn; + for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { + const int x_filter_idx0 = + ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx1 = + ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx2 = + ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx3 = + ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + + assert(x_filter_idx0 <= RS_SUBPEL_MASK); + assert(x_filter_idx1 <= RS_SUBPEL_MASK); + assert(x_filter_idx2 <= RS_SUBPEL_MASK); + assert(x_filter_idx3 <= RS_SUBPEL_MASK); + + const int16_t *const x_filter0 = + &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter1 = + &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter2 = + &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter3 = + &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; + + const __m128i fil0_16 = xx_loadu_128(x_filter0); + const __m128i fil1_16 = xx_loadu_128(x_filter1); + const __m128i fil2_16 = xx_loadu_128(x_filter2); + const __m128i fil3_16 = xx_loadu_128(x_filter3); + + src_y = src; + dst_y = dst; + for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { + const uint16_t *const src_x0 = + &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint16_t *const src_x1 = + 
&src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint16_t *const src_x2 = + &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint16_t *const src_x3 = + &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + + // Load up the source data. This is 16-bit input data, so each load + // gets 8 pixels. + const __m128i src0_16 = xx_loadu_128(src_x0); + const __m128i src1_16 = xx_loadu_128(src_x1); + const __m128i src2_16 = xx_loadu_128(src_x2); + const __m128i src3_16 = xx_loadu_128(src_x3); + + // Multiply by filter coefficients (results in a 32-bit value), + // and add adjacent pairs, i.e. + // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) + // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] + const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); + const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); + const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); + const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); + + // Reduce horizontally and add, i.e. + // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] + const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); + const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); + + const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); + + // Divide down by (1 << FILTER_BITS), rounding to nearest. + const __m128i shifted_32 = + _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); + + // Pack 32-bit values into 16-bit values, i.e. 
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] + const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); + + // Clip the values at (1 << bd) - 1 + const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum); + + // Write to the output + xx_storel_64(&dst_y[x], clipped_16); + } + } +} diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c new file mode 100644 index 0000000000..8e293b5bb1 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +// A specialised version of hfilter, the horizontal filter for +// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. 
+static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w, + int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, int round) { + const int bd = 8; + const int ntaps = 8; + + src -= ntaps / 2 - 1; + + int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = _mm_cvtsi32_si128(round); + + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + // Load the filter coefficients + const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); + const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); + + int y; + for (y = 0; y <= h - 4; y += 4) { + const uint8_t *const src0 = src_col + y * src_stride; + const uint8_t *const src1 = src0 + 1 * src_stride; + const uint8_t *const src2 = src0 + 2 * src_stride; + const uint8_t *const src3 = src0 + 3 * src_stride; + + // Load up source data. This is 8-bit input data; each load is just + // loading the lower half of the register and gets 8 pixels + const __m128i data08 = _mm_loadl_epi64((__m128i *)src0); + const __m128i data18 = _mm_loadl_epi64((__m128i *)src1); + const __m128i data28 = _mm_loadl_epi64((__m128i *)src2); + const __m128i data38 = _mm_loadl_epi64((__m128i *)src3); + + // Now zero-extend up to 16-bit precision by interleaving with + // zeros. 
Drop the upper half of each register (which just had zeros) + const __m128i data0lo = _mm_unpacklo_epi8(data08, zero); + const __m128i data1lo = _mm_unpacklo_epi8(data18, zero); + const __m128i data2lo = _mm_unpacklo_epi8(data28, zero); + const __m128i data3lo = _mm_unpacklo_epi8(data38, zero); + + // Multiply by coefficients + const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); + const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); + const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); + const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + + // Reduce horizontally and add + const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); + const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); + const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); + + // Divide down by (1 << round), rounding to nearest. + __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + + shifted = _mm_packus_epi32(shifted, shifted); + // Write transposed to the output + _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); + } + for (; y < h; ++y) { + const uint8_t *const src_row = src_col + y * src_stride; + + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < ntaps; ++k) { + sum += filter[k] * src_row[k]; + } + + dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); + } + } +} + +static __m128i convolve_16_8(const int16_t *src, __m128i coeff) { + __m128i data = _mm_loadu_si128((__m128i *)src); + return _mm_madd_epi16(data, coeff); +} + +// A specialised version of vfilter, the vertical filter for +// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. 
+//
+// src is the transposed output of the horizontal pass: the eight inputs for
+// output pixel (x, y) are the consecutive values starting at
+// src[x * src_stride + (y_qn >> SCALE_SUBPEL_BITS)], where y_qn starts at
+// subpel_y_qn and advances by y_step_qn per output row. The filter phase for
+// a row is (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS.
+//
+// What is written depends on conv_params:
+//  - !is_compound: final 8-bit pixels are written to dst.
+//  - is_compound && !do_average: the rounded 16-bit intermediate (still
+//    carrying the rounding offset) is written to conv_params->dst.
+//  - is_compound && do_average: the new value is combined with the one
+//    already in conv_params->dst (plain average, or fwd_offset/bck_offset
+//    weights when use_dist_wtd_comp_avg is set) and final 8-bit pixels are
+//    written to dst.
+//
+// Four pixels per iteration are produced with SIMD; the remaining w % 4
+// columns go through the scalar loop, which performs the same arithmetic.
+static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
+                     int dst_stride, int w, int h, int subpel_y_qn,
+                     int y_step_qn, const InterpFilterParams *filter_params,
+                     const ConvolveParams *conv_params, int bd) {
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int ntaps = 8;
+
+  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+  // Rounding offset that is still present after the round_1 shift and has to
+  // be removed before the value is turned into a pixel.
+  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+                         (1 << (offset_bits - conv_params->round_1 - 1)));
+  const __m128i sub = _mm_set1_epi16(sub32);
+
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int dst16_stride = conv_params->dst_stride;
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+  const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1));
+  const __m128i round_shift_add =
+      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+
+  // Interleave the two weights as (w0, w1) pairs so that a single
+  // _mm_madd_epi16 on interleaved (previous, current) samples yields
+  // previous * w0 + current * w1.
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi16((short)w0);
+  const __m128i wt1 = _mm_set1_epi16((short)w1);
+  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+  int y_qn = subpel_y_qn;
+  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+    assert(filter_idx < SUBPEL_SHIFTS);
+    const int16_t *filter =
+        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+    int x;
+    for (x = 0; x <= w - 4; x += 4) {
+      const int16_t *const src0 = src_y + x * src_stride;
+      const int16_t *const src1 = src0 + 1 * src_stride;
+      const int16_t *const src2 = src0 + 2 * src_stride;
+      const int16_t *const src3 = src0 + 3 * src_stride;
+
+      // Load the source data for the four output pixels and multiply by the
+      // coefficients. Each of conv0..conv3 holds four partial sums for one
+      // pixel.
+      const __m128i conv0 = convolve_16_8(src0, coeff0716);
+      const __m128i conv1 = convolve_16_8(src1, coeff0716);
+      const __m128i conv2 = convolve_16_8(src2, coeff0716);
+      const __m128i conv3 = convolve_16_8(src3, coeff0716);
+
+      // Now reduce horizontally to get one lane for each result
+      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+      __m128i conv = _mm_hadd_epi32(conv01, conv23);
+
+      conv = _mm_add_epi32(conv, res_add_const);
+      // Divide down by (1 << round_1), rounding to nearest. sub32 is
+      // subtracted further down, only on the paths that emit final pixels.
+      __m128i shifted =
+          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+      uint8_t *dst_x = dst + y * dst_stride + x;
+      __m128i result;
+      __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+
+      if (conv_params->is_compound) {
+        CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+        if (conv_params->do_average) {
+          const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
+          if (conv_params->use_dist_wtd_comp_avg) {
+            const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
+            const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
+            const __m128i shifted_32 =
+                _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+            shifted_16 = _mm_packus_epi32(shifted_32, shifted_32);
+          } else {
+            shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1);
+          }
+          const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+          result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+          const __m128i result_8 = _mm_packus_epi16(result, result);
+          // dst_x is only byte-aligned, so copy the four pixels out instead
+          // of storing through a cast int pointer.
+          const int pixels = _mm_cvtsi128_si32(result_8);
+          memcpy(dst_x, &pixels, sizeof(pixels));
+        } else {
+          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+        }
+      } else {
+        const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+        result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+        const __m128i result_8 = _mm_packus_epi16(result, result);
+        // Same unaligned-safe 4-byte store as in the averaging path.
+        const int pixels = _mm_cvtsi128_si32(result_8);
+        memcpy(dst_x, &pixels, sizeof(pixels));
+      }
+    }
+    // Scalar version of the loop above for the remaining columns.
+    for (; x < w; ++x) {
+      const int16_t *src_x = src_y + x * src_stride;
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          int32_t tmp = dst16[y * dst16_stride + x];
+          if (conv_params->use_dist_wtd_comp_avg) {
+            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+            tmp = tmp >> DIST_PRECISION_BITS;
+          } else {
+            tmp += res;
+            tmp = tmp >> 1;
+          }
+          /* Subtract round offset and convolve round */
+          tmp = tmp - sub32;
+          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+        } else {
+          dst16[y * dst16_stride + x] = res;
+        }
+      } else {
+        /* Subtract round offset and convolve round */
+        int32_t tmp = res - sub32;
+        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+      }
+    }
+  }
+}
+
+// 8-bit scaled 2D convolution: a horizontal 8-tap pass into a transposed
+// 16-bit scratch buffer, followed by a vertical 8-tap pass that writes dst8
+// and/or conv_params->dst (see vfilter8). Both filters must have 8 taps.
+//
+// The horizontal pass starts fo_vert rows above src and produces im_h rows,
+// i.e. every source row the vertical pass can touch for h output rows at the
+// given y_step_qn / subpel_y_qn.
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
+                                  uint8_t *dst8, int dst8_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_qn, const int x_step_qn,
+                                  const int subpel_y_qn, const int y_step_qn,
+                                  ConvolveParams *conv_params) {
+  int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+             filter_params_y->taps;
+
+  const int xtaps = filter_params_x->taps;
+  const int ytaps = filter_params_y->taps;
+  const int fo_vert = ytaps / 2 - 1;
+  assert((xtaps == 8) && (ytaps == 8));
+  (void)xtaps;
+
+  // horizontal filter
+  hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
+           x_step_qn, filter_params_x, conv_params->round_0);
+
+  // vertical filter (input is transposed)
+  vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
+           filter_params_y, conv_params, 8);
+}
+
+// A specialised version of hfilter, the horizontal filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+//
+// For each output column x the source position is x_qn >> SCALE_SUBPEL_BITS
+// (x_qn starts at subpel_x_qn and advances by x_step_qn) and the filter phase
+// is (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS. Results are rounded by
+// `round` bits with an added offset of 1 << (bd + FILTER_BITS - 1) and are
+// written transposed: column x occupies dst[x * h] .. dst[x * h + h - 1].
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
+                            int w, int h, int subpel_x_qn, int x_step_qn,
+                            const InterpFilterParams *filter_params, int round,
+                            int bd) {
+  const int ntaps = 8;
+
+  // Step back so that the 8-tap window starts 3 samples before the position.
+  src -= ntaps / 2 - 1;
+
+  // Rounding term and offset folded into a single addend.
+  int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
+  const __m128i round_add = _mm_set1_epi32(round_add32);
+  const __m128i round_shift = _mm_cvtsi32_si128(round);
+
+  int x_qn = subpel_x_qn;
+  for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+    const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
+    const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+    assert(filter_idx < SUBPEL_SHIFTS);
+    const int16_t *filter =
+        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+    // Load the filter coefficients
+    const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
+
+    int y;
+    for (y = 0; y <= h - 4; y += 4) {
+      const uint16_t *const src0 = src_col + y * src_stride;
+      const uint16_t *const src1 = src0 + 1 * src_stride;
+      const uint16_t *const src2 = src0 + 2 * src_stride;
+      const uint16_t *const src3 = src0 + 3 * src_stride;
+
+      // Load up source data. This is 16-bit input data, so each load gets the 8
+      // pixels we need.
+      const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
+      const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
+      const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
+      const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
+
+      // Multiply by coefficients
+      const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
+      const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
+      const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
+      const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
+
+      // Reduce horizontally and add
+      const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
+      const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
+      const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
+
+      // Divide down by (1 << round), rounding to nearest.
+      __m128i shifted =
+          _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+
+      shifted = _mm_packus_epi32(shifted, shifted);
+      // Write transposed to the output
+      _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
+    }
+    // Scalar version of the loop above for the remaining rows.
+    for (; y < h; ++y) {
+      const uint16_t *const src_row = src_col + y * src_stride;
+
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (int k = 0; k < ntaps; ++k) {
+        sum += filter[k] * src_row[k];
+      }
+
+      dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
+    }
+  }
+}
+// A specialised version of vfilter, the vertical filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+//
+// src is the transposed output of highbd_hfilter8: the eight inputs for
+// output pixel (x, y) are the consecutive values starting at
+// src[x * src_stride + (y_qn >> SCALE_SUBPEL_BITS)], where y_qn starts at
+// subpel_y_qn and advances by y_step_qn per output row.
+//
+// What is written depends on conv_params:
+//  - !is_compound: final pixels, clamped to the bd-bit maximum, go to dst.
+//  - is_compound && !do_average: the rounded 16-bit intermediate (still
+//    carrying the rounding offset) goes to conv_params->dst.
+//  - is_compound && do_average: the new value is combined with the one
+//    already in conv_params->dst (plain average, or fwd_offset/bck_offset
+//    weights when use_dist_wtd_comp_avg is set) and final pixels go to dst.
+//
+// Four pixels per iteration are produced with SIMD; the remaining w % 4
+// columns go through the scalar loop, which performs the same arithmetic.
+static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
+                            int dst_stride, int w, int h, int subpel_y_qn,
+                            int y_step_qn,
+                            const InterpFilterParams *filter_params,
+                            const ConvolveParams *conv_params, int bd) {
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int ntaps = 8;
+
+  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+  // Rounding offset that is still present after the round_1 shift and has to
+  // be removed before the value is turned into a pixel.
+  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+                         (1 << (offset_bits - conv_params->round_1 - 1)));
+  const __m128i sub = _mm_set1_epi32(sub32);
+
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int dst16_stride = conv_params->dst_stride;
+  const __m128i clip_pixel_ =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  // Final rounding shift. All final-pixel paths below work on 32-bit lanes
+  // and share these constants.
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+  const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1));
+  const __m128i round_shift_add =
+      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+
+  int y_qn = subpel_y_qn;
+  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+    assert(filter_idx < SUBPEL_SHIFTS);
+    const int16_t *filter =
+        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+    int x;
+    for (x = 0; x <= w - 4; x += 4) {
+      const int16_t *const src0 = src_y + x * src_stride;
+      const int16_t *const src1 = src0 + 1 * src_stride;
+      const int16_t *const src2 = src0 + 2 * src_stride;
+      const int16_t *const src3 = src0 + 3 * src_stride;
+
+      // Load the source data for the four output pixels and multiply by the
+      // coefficients. Each of conv0..conv3 holds four partial sums for one
+      // pixel.
+      const __m128i conv0 = convolve_16_8(src0, coeff0716);
+      const __m128i conv1 = convolve_16_8(src1, coeff0716);
+      const __m128i conv2 = convolve_16_8(src2, coeff0716);
+      const __m128i conv3 = convolve_16_8(src3, coeff0716);
+
+      // Now reduce horizontally to get one lane for each result
+      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+      __m128i conv = _mm_hadd_epi32(conv01, conv23);
+      conv = _mm_add_epi32(conv, res_add_const);
+
+      // Divide down by (1 << round_1), rounding to nearest. sub32 is
+      // subtracted further down, only on the paths that emit final pixels.
+      __m128i shifted =
+          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+      uint16_t *dst_x = dst + y * dst_stride + x;
+
+      __m128i result;
+      if (conv_params->is_compound) {
+        CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+        if (conv_params->do_average) {
+          __m128i p_32 =
+              _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
+
+          if (conv_params->use_dist_wtd_comp_avg) {
+            shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+                                    _mm_mullo_epi32(shifted, wt1));
+            shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
+          } else {
+            shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1);
+          }
+          result = _mm_sub_epi32(shifted, sub);
+          result = _mm_sra_epi32(_mm_add_epi32(result, bits_const), bits_shift);
+
+          result = _mm_packus_epi32(result, result);
+          result = _mm_min_epi16(result, clip_pixel_);
+          _mm_storel_epi64((__m128i *)dst_x, result);
+        } else {
+          __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+        }
+      } else {
+        result = _mm_sub_epi32(shifted, sub);
+        // The lanes are 32 bits wide here, so the rounding shift must be the
+        // 32-bit one (a 16-bit shift would treat each lane as two halves).
+        // NOTE(review): presumably bits is 0 on this path in practice, which
+        // would make either shift a no-op -- confirm against the callers.
+        result = _mm_sra_epi32(_mm_add_epi32(result, bits_const), bits_shift);
+        result = _mm_packus_epi32(result, result);
+        result = _mm_min_epi16(result, clip_pixel_);
+        _mm_storel_epi64((__m128i *)dst_x, result);
+      }
+    }
+
+    // Scalar version of the loop above for the remaining columns.
+    for (; x < w; ++x) {
+      const int16_t *src_x = src_y + x * src_stride;
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          int32_t tmp = dst16[y * dst16_stride + x];
+          if (conv_params->use_dist_wtd_comp_avg) {
+            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+            tmp = tmp >> DIST_PRECISION_BITS;
+          } else {
+            tmp += res;
+            tmp = tmp >> 1;
+          }
+          /* Subtract round offset and convolve round */
+          tmp = tmp - sub32;
+          dst[y * dst_stride + x] =
+              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+        } else {
+          dst16[y * dst16_stride + x] = res;
+        }
+      } else {
+        /* Subtract round offset and convolve round */
+        int32_t tmp = res - sub32;
+        dst[y * dst_stride + x] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+      }
+    }
+  }
+}
+
+// High bit-depth scaled 2D convolution: a horizontal 8-tap pass into a
+// zero-initialised, transposed 16-bit scratch buffer, followed by a vertical
+// 8-tap pass that writes dst and/or conv_params->dst (see highbd_vfilter8).
+// Both filters must have 8 taps.
+//
+// The horizontal pass starts fo_vert rows above src and produces im_h rows,
+// i.e. every source row the vertical pass can touch for h output rows at the
+// given y_step_qn / subpel_y_qn.
+void av1_highbd_convolve_2d_scale_sse4_1(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+    ConvolveParams *conv_params, int bd) {
+  // TODO(yaowu): Move this out of stack
+  DECLARE_ALIGNED(16, int16_t,
+                  tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+             filter_params_y->taps;
+  const int xtaps = filter_params_x->taps;
+  const int ytaps = filter_params_y->taps;
+  const int fo_vert = ytaps / 2 - 1;
+
+  memset(tmp, 0, sizeof(tmp));
+  assert((xtaps == 8) && (ytaps == 8));
+  (void)xtaps;
+
+  // horizontal filter
+  highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
+                  subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0,
+                  bd);
+
+  // vertical filter (input is transposed)
+  highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
+                  filter_params_y, conv_params, bd);
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
new file mode 100644
index 0000000000..0afd42b170
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -0,0 +1,2254 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/av1_inv_txfm_avx2.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
+// Each entry is scaled by 4096 (5793 is sqrt(2) * 4096 rounded to nearest).
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+                                          4 * 5793 };
+
+// Stage 5 of idct16, in place on x1[0..15]: add/sub butterflies on the pairs
+// (0,3), (1,2), (8,11), (9,10), (15,12), (14,13) and a cospi[32]-weighted
+// butterfly on (5,6). x1[4] and x1[7] are not touched.
+// _r and cos_bit are forwarded to btf_16_w16_avx2 (presumably the rounding
+// constant and shift applied after the multiply -- helper not visible here).
+static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+
+  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+}
+
+// Stage 6 of idct16, in place: add/sub butterflies on (0,7), (1,6), (2,5),
+// (3,4) and cospi[32]-weighted butterflies on (10,13) and (11,12).
+static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_avx2(&x[0], &x[7]);
+  btf_16_adds_subs_avx2(&x[1], &x[6]);
+  btf_16_adds_subs_avx2(&x[2], &x[5]);
+  btf_16_adds_subs_avx2(&x[3], &x[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+}
+
+// Stage 7 (last) of idct16: combines x1[i] with x1[15 - i] for i = 0..7 and
+// writes the pair of results to output[i] and output[15 - i].
+static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
+  btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
+  btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
+  btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
+  btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
+  btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
+  btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
+  btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
+  btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
+}
+
+// Full idct16: reads input[0..15], writes output[0..15]. Stages 1-4 are
+// written out here; stages 5-7 are the shared helpers above. The statement
+// order within and across stages matters because the butterflies update x1
+// in place.
+static void idct16_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  // Half of (1 << INV_COS_BIT); handed to the weighted butterflies.
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // Naming: cospi_pAA_mBB holds the 16-bit pair (+cospi[AA], -cospi[BB]).
+  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+  // stage 1
+  // Input permutation: x1[i] takes input[j] where j is i with its four bits
+  // reversed (e.g. x1[1] = input[8], x1[3] = input[12]).
+  __m256i x1[16];
+  x1[0] = input[0];
+  x1[1] = input[8];
+  x1[2] = input[4];
+  x1[3] = input[12];
+  x1[4] = input[2];
+  x1[5] = input[10];
+  x1[6] = input[6];
+  x1[7] = input[14];
+  x1[8] = input[1];
+  x1[9] = input[9];
+  x1[10] = input[5];
+  x1[11] = input[13];
+  x1[12] = input[3];
+  x1[13] = input[11];
+  x1[14] = input[7];
+  x1[15] = input[15];
+
+  // stage 2
+  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
+                  INV_COS_BIT);
+
+  // stage 3
+  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
+                  INV_COS_BIT);
+  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+  // stage 4
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
+                  INV_COS_BIT);
+  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
+                  INV_COS_BIT);
+
+  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
+  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+  idct16_stage7_avx2(output, x1);
+}
+
+// Variant of idct16_avx2 that reads only input[0..7] (presumably for blocks
+// whose remaining coefficients are zero -- confirm against the caller).
+// Stages 2-4 use btf_16_w16_0_avx2, which takes a single source value and
+// two destinations; it appears to be a macro since the destinations are
+// passed by name rather than by address.
+static void idct16_low8_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+  // stage 1
+  // Only the even x1 slots are loaded; the odd slots are produced as second
+  // outputs of the single-input butterflies below.
+  __m256i x1[16];
+  x1[0] = input[0];
+  x1[2] = input[4];
+  x1[4] = input[2];
+  x1[6] = input[6];
+  x1[8] = input[1];
+  x1[10] = input[5];
+  x1[12] = input[3];
+  x1[14] = input[7];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
+  btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
+  btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
+  btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);
+
+  // stage 3
+  btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
+  btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
+  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+  // stage 4
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
+  btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
+  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
+                  INV_COS_BIT);
+
+  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
+  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+  idct16_stage7_avx2(output, x1);
+}
+
+// Variant of idct16_avx2 that reads only input[0]. A single cospi[32]
+// butterfly produces x1[0] and x1[1]; the sixteen outputs are copies of those
+// two values in the repeating pattern x1[0], x1[1], x1[1], x1[0].
+static void idct16_low1_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // stage 1
+  __m256i x1[2];
+  x1[0] = input[0];
+
+  // stage 2
+  // stage 3
+  // stage 4
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
+
+  // stage 5
+  // stage 6
+  output[0] = x1[0];
+  output[1] = x1[1];
+  output[2] = x1[1];
+  output[3] = x1[0];
+  output[4] = x1[0];
+  output[5] = x1[1];
+  output[6] = x1[1];
+  output[7] = x1[0];
+  output[8] = x1[0];
+  output[9] = x1[1];
+  output[10] = x1[1];
+  output[11] = x1[0];
+  output[12] = x1[0];
+  output[13] = x1[1];
+  output[14] = x1[1];
+  output[15] = x1[0];
+}
+
+// Stage 3 of iadst16, in place: add/sub butterflies on (i, i + 8), i = 0..7.
+static INLINE void iadst16_stage3_avx2(__m256i *x) {
+  btf_16_adds_subs_avx2(&x[0], &x[8]);
+  btf_16_adds_subs_avx2(&x[1], &x[9]);
+  btf_16_adds_subs_avx2(&x[2], &x[10]);
+  btf_16_adds_subs_avx2(&x[3], &x[11]);
+  btf_16_adds_subs_avx2(&x[4], &x[12]);
+  btf_16_adds_subs_avx2(&x[5], &x[13]);
+  btf_16_adds_subs_avx2(&x[6], &x[14]);
+  btf_16_adds_subs_avx2(&x[7], &x[15]);
+}
+
+// Stage 4 of iadst16, in place: weighted butterflies on the pairs (8,9),
+// (10,11), (12,13), (14,15); x[0..7] are not touched.
+static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
+                                       const __m256i _r, int8_t cos_bit) {
+  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+  const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+  const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+  const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
+}
+
+// Stage 5 of iadst16, in place: add/sub butterflies on (i, i + 4) for
+// i = 0..3 and i = 8..11.
+static INLINE void iadst16_stage5_avx2(__m256i *x) {
+  btf_16_adds_subs_avx2(&x[0], &x[4]);
+  btf_16_adds_subs_avx2(&x[1], &x[5]);
+  btf_16_adds_subs_avx2(&x[2], &x[6]);
+  btf_16_adds_subs_avx2(&x[3], &x[7]);
+  btf_16_adds_subs_avx2(&x[8], &x[12]);
+  btf_16_adds_subs_avx2(&x[9], &x[13]);
+  btf_16_adds_subs_avx2(&x[10], &x[14]);
+  btf_16_adds_subs_avx2(&x[11], &x[15]);
+}
+
+// Stage 6 of iadst16, in place: weighted butterflies on the pairs (4,5),
+// (6,7), (12,13), (14,15) using cospi[16] / cospi[48].
+static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                       const __m256i _r, int8_t cos_bit) {
+  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
+}
+
+// Stage 7 of iadst16, in place: add/sub butterflies on (i, i + 2) for
+// i = 0, 1, 4, 5, 8, 9, 12, 13.
+static INLINE void iadst16_stage7_avx2(__m256i *x) {
+  btf_16_adds_subs_avx2(&x[0], &x[2]);
+  btf_16_adds_subs_avx2(&x[1], &x[3]);
+  btf_16_adds_subs_avx2(&x[4], &x[6]);
+  btf_16_adds_subs_avx2(&x[5], &x[7]);
+  btf_16_adds_subs_avx2(&x[8], &x[10]);
+  btf_16_adds_subs_avx2(&x[9], &x[11]);
+  btf_16_adds_subs_avx2(&x[12], &x[14]);
+  btf_16_adds_subs_avx2(&x[13], &x[15]);
+}
+
+// Stage 8 of iadst16, in place: cospi[32]-weighted butterflies on the pairs
+// (2,3), (6,7), (10,11), (14,15).
+static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
+                                       const __m256i _r, int8_t cos_bit) {
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
+}
+
+// Stage 9 (last) of iadst16: permutes x1 into output. Even output indices
+// are plain copies; odd output indices are negated, computed as a saturating
+// 16-bit subtraction from zero.
+static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
+  const __m256i __zero = _mm256_setzero_si256();
+  output[0] = x1[0];
+  output[1] = _mm256_subs_epi16(__zero, x1[8]);
+  output[2] = x1[12];
+  output[3] = _mm256_subs_epi16(__zero, x1[4]);
+  output[4] = x1[6];
+  output[5] = _mm256_subs_epi16(__zero, x1[14]);
+  output[6] = x1[10];
+  output[7] = _mm256_subs_epi16(__zero, x1[2]);
+  output[8] = x1[3];
+  output[9] = _mm256_subs_epi16(__zero, x1[11]);
+  output[10] = x1[15];
+  output[11] = _mm256_subs_epi16(__zero, x1[7]);
+  output[12] = x1[5];
+  output[13] = _mm256_subs_epi16(__zero, x1[13]);
+  output[14] = x1[9];
+  output[15] = _mm256_subs_epi16(__zero, x1[1]);
+}
+
+// Full iadst16: reads input[0..15], writes output[0..15]. Stages 1-2 are
+// written out here; stages 3-9 are the shared helpers above.
+static void iadst16_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+  // stage 1
+  // Input permutation: even slots take input[15], input[13], ..., input[1];
+  // odd slots take input[0], input[2], ..., input[14].
+  __m256i x1[16];
+  x1[0] = input[15];
+  x1[1] = input[0];
+  x1[2] = input[13];
+  x1[3] = input[2];
+  x1[4] = input[11];
+  x1[5] = input[4];
+  x1[6] = input[9];
+  x1[7] = input[6];
+  x1[8] = input[7];
+  x1[9] = input[8];
+  x1[10] = input[5];
+  x1[11] = input[10];
+  x1[12] = input[3];
+  x1[13] = input[12];
+  x1[14] = input[1];
+  x1[15] = input[14];
+
+  // stage 2
+  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r,
+                  INV_COS_BIT);
+
+  iadst16_stage3_avx2(x1);
+  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
+  iadst16_stage5_avx2(x1);
+  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+  iadst16_stage7_avx2(x1);
+  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
+  iadst16_stage9_avx2(output, x1);
+}
+
+// Variant of iadst16_avx2 that reads only input[0..7] (presumably for blocks
+// whose remaining coefficients are zero -- confirm against the caller).
+// Stage 2 uses the single-input butterfly btf_16_w16_0_avx2, which fills in
+// the x1 slots that stage 1 leaves unset.
+static void iadst16_low8_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 1
+  __m256i x1[16];
+  x1[1] = input[0];
+  x1[3] = input[2];
+  x1[5] = input[4];
+  x1[7] = input[6];
+  x1[8] = input[7];
+  x1[10] = input[5];
+  x1[12] = input[3];
+  x1[14] = input[1];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+  btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
+  btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
+  btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
+  btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
+  btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
+  btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
+  // Note: 06 is an octal literal; its value is 6.
+  btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
+
+  iadst16_stage3_avx2(x1);
+  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
+  iadst16_stage5_avx2(x1);
+  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
+  iadst16_stage7_avx2(x1);
+  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
+  iadst16_stage9_avx2(output, x1);
+}
+
+// Variant of iadst16_avx2 that reads only input[0]. In stages 3, 5 and 7 the
+// add/sub butterflies of the full transform are replaced by plain copies
+// (presumably because the partner operand would be zero -- confirm); stages
+// 4 and 6 keep only the weighted butterflies whose inputs are populated.
+static void iadst16_low1_avx2(const __m256i *input, __m256i *output) {
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+
+  // stage 1
+  __m256i x1[16];
+  x1[1] = input[0];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+
+  // stage 3
+  x1[8] = x1[0];
+  x1[9] = x1[1];
+
+  // stage 4
+  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r,
+                  INV_COS_BIT);
+
+  // stage 5
+  x1[4] = x1[0];
+  x1[5] = x1[1];
+
+  x1[12] = x1[8];
+  x1[13] = x1[9];
+
+  // stage 6
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r,
+                  INV_COS_BIT);
+  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r,
+                  INV_COS_BIT);
+
+  // stage 7
+  x1[2] = x1[0];
+  x1[3] = x1[1];
+  x1[6] = x1[4];
+  x1[7] = x1[5];
+  x1[10] = x1[8];
+  x1[11] = x1[9];
+  x1[14] = x1[12];
+  x1[15] = x1[13];
+
+  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
+  iadst16_stage9_avx2(output, x1);
+}
+
+// Stage 3 of idct32 for the upper half x[16..31], in place: add/sub
+// butterflies on adjacent pairs, with the operand order alternating between
+// (lower, upper) and (upper, lower).
+static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
+  btf_16_adds_subs_avx2(&x[16], &x[17]);
+  btf_16_adds_subs_avx2(&x[19], &x[18]);
+  btf_16_adds_subs_avx2(&x[20], &x[21]);
+  btf_16_adds_subs_avx2(&x[23], &x[22]);
+  btf_16_adds_subs_avx2(&x[24], &x[25]);
+  btf_16_adds_subs_avx2(&x[27], &x[26]);
+  btf_16_adds_subs_avx2(&x[28], &x[29]);
+  btf_16_adds_subs_avx2(&x[31], &x[30]);
+}
+
+// Stage 4 of idct32 for the upper half, in place: weighted butterflies on
+// the pairs (17,30), (18,29), (21,26), (22,25).
+static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
+}
+
+// Stage 5 of idct32 for x[8..31], in place: weighted butterflies on (9,14)
+// and (10,13), then add/sub butterflies within x[16..31].
+static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x[16], &x[19]);
+  btf_16_adds_subs_avx2(&x[17], &x[18]);
+  btf_16_adds_subs_avx2(&x[23], &x[20]);
+  btf_16_adds_subs_avx2(&x[22], &x[21]);
+  btf_16_adds_subs_avx2(&x[24], &x[27]);
+  btf_16_adds_subs_avx2(&x[25], &x[26]);
+  btf_16_adds_subs_avx2(&x[31], &x[28]);
+  btf_16_adds_subs_avx2(&x[30], &x[29]);
+}
+
+// Stage 6 of idct32 for x[4..31], in place: a cospi[32]-weighted butterfly
+// on (5,6), add/sub butterflies within x[8..15], and weighted butterflies on
+// (18,29), (19,28), (20,27), (21,26).
+static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x[8], &x[11]);
+  btf_16_adds_subs_avx2(&x[9], &x[10]);
+  btf_16_adds_subs_avx2(&x[15], &x[12]);
+  btf_16_adds_subs_avx2(&x[14], &x[13]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
+}
+
+// Stage 7 of idct32, in place: add/sub butterflies within x[0..7] and
+// x[16..31], plus cospi[32]-weighted butterflies on (10,13) and (11,12).
+static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_avx2(&x[0], &x[7]);
+  btf_16_adds_subs_avx2(&x[1], &x[6]);
+  btf_16_adds_subs_avx2(&x[2], &x[5]);
+  btf_16_adds_subs_avx2(&x[3], &x[4]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+  btf_16_adds_subs_avx2(&x[16], &x[23]);
+  btf_16_adds_subs_avx2(&x[17], &x[22]);
+  btf_16_adds_subs_avx2(&x[18], &x[21]);
+  btf_16_adds_subs_avx2(&x[19], &x[20]);
+  btf_16_adds_subs_avx2(&x[31], &x[24]);
+  btf_16_adds_subs_avx2(&x[30], &x[25]);
+  btf_16_adds_subs_avx2(&x[29], &x[26]);
+  btf_16_adds_subs_avx2(&x[28], &x[27]);
+}
+
+// Stage 8 of idct32, in place: add/sub butterflies on (i, 15 - i) for
+// i = 0..7, plus cospi[32]-weighted butterflies on (20,27), (21,26), (22,25),
+// (23,24).
+static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  btf_16_adds_subs_avx2(&x[0], &x[15]);
+  btf_16_adds_subs_avx2(&x[1], &x[14]);
+  btf_16_adds_subs_avx2(&x[2], &x[13]);
+  btf_16_adds_subs_avx2(&x[3], &x[12]);
+  btf_16_adds_subs_avx2(&x[4], &x[11]);
+  btf_16_adds_subs_avx2(&x[5], &x[10]);
+  btf_16_adds_subs_avx2(&x[6], &x[9]);
+  btf_16_adds_subs_avx2(&x[7], &x[8]);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
+}
+
+static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
+  btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
+
btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]); + btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]); + btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]); + btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]); + btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]); + btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]); + btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]); + btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]); + btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]); + btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]); + btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]); + btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]); + btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]); + btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]); + btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); +} + +static void idct32_low1_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + output[0] = x[0]; + output[31] = x[0]; + output[1] = x[1]; + output[30] = x[1]; + output[2] = x[1]; + output[29] = x[1]; + output[3] = x[0]; + output[28] = x[0]; + output[4] = x[0]; + output[27] = x[0]; + output[5] = x[1]; + output[26] = x[1]; + output[6] = x[1]; + output[25] = x[1]; + output[7] = x[0]; + output[24] = x[0]; + output[8] = x[0]; + output[23] = x[0]; + output[9] = x[1]; + output[22] = x[1]; + output[10] = x[1]; + output[21] = x[1]; + output[11] = x[0]; + output[20] = x[0]; + output[12] = x[0]; + output[19] = x[0]; + output[13] = x[1]; + output[18] = x[1]; + output[14] = x[1]; + output[17] = x[1]; + output[15] = x[0]; 
+ output[16] = x[0]; +} + +static void idct32_low8_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + x[4] = input[4]; + x[8] = input[2]; + x[12] = input[6]; + x[16] = input[1]; + x[20] = input[5]; + x[24] = input[3]; + x[28] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + + // stage 4 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT); + // stage 6 + x[3] = x[0]; + x[2] = x[1]; + idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT); + + idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT); + idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT); + idct32_stage9_avx2(output, x); +} + +static void idct32_low16_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + x[2] = input[8]; + x[4] = input[4]; + x[6] = input[12]; + x[8] = input[2]; + x[10] = input[10]; + x[12] = input[6]; + x[14] = input[14]; + x[16] = input[1]; + x[18] = input[9]; + x[20] = input[5]; + x[22] = input[13]; 
+ x[24] = input[3]; + x[26] = input[11]; + x[28] = input[7]; + x[30] = input[15]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + idct32_high16_stage3_avx2(x); + + // stage 4 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(&x[8], &x[9]); + btf_16_adds_subs_avx2(&x[11], &x[10]); + btf_16_adds_subs_avx2(&x[12], &x[13]); + btf_16_adds_subs_avx2(&x[15], &x[14]); + idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_avx2(&x[4], &x[5]); + btf_16_adds_subs_avx2(&x[7], &x[6]); + idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT); + + btf_16_adds_subs_avx2(&x[0], &x[3]); + btf_16_adds_subs_avx2(&x[1], &x[2]); + idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT); + + idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT); + idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT); + idct32_stage9_avx2(output, x); +} + +static void idct32_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + 
__m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 
= pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + + // stage 1 + __m256i x1[32]; + x1[0] = input[0]; + x1[1] = input[16]; + x1[2] = input[8]; + x1[3] = input[24]; + x1[4] = input[4]; + x1[5] = input[20]; + x1[6] = input[12]; + x1[7] = input[28]; + x1[8] = input[2]; + x1[9] = input[18]; + x1[10] = input[10]; + x1[11] = input[26]; + x1[12] = input[6]; + x1[13] = input[22]; + x1[14] = input[14]; + x1[15] = input[30]; + x1[16] = input[1]; + x1[17] = input[17]; + x1[18] = input[9]; + x1[19] = input[25]; + x1[20] = input[5]; + x1[21] = input[21]; + x1[22] = input[13]; + x1[23] = input[29]; + x1[24] = input[3]; + x1[25] = input[19]; + x1[26] = input[11]; + x1[27] = input[27]; + x1[28] = input[7]; + x1[29] = input[23]; + x1[30] = input[15]; + x1[31] = input[31]; + + // stage 2 + btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, + INV_COS_BIT); + + // stage 3 + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, + INV_COS_BIT); + idct32_high16_stage3_avx2(x1); + + // 
stage 4 + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, + INV_COS_BIT); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); + + // stage 5 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, + INV_COS_BIT); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT); + + // stage 6 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT); + + idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT); + idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT); + idct32_stage9_avx2(output, x1); +} + +static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); + const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); + const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + const __m256i 
cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); +} + +static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[35]); + btf_16_adds_subs_avx2(&x[33], &x[34]); + btf_16_adds_subs_avx2(&x[39], &x[36]); + btf_16_adds_subs_avx2(&x[38], &x[37]); + btf_16_adds_subs_avx2(&x[40], &x[43]); + btf_16_adds_subs_avx2(&x[41], &x[42]); + btf_16_adds_subs_avx2(&x[47], &x[44]); + btf_16_adds_subs_avx2(&x[46], &x[45]); + btf_16_adds_subs_avx2(&x[48], &x[51]); + btf_16_adds_subs_avx2(&x[49], &x[50]); + 
btf_16_adds_subs_avx2(&x[55], &x[52]); + btf_16_adds_subs_avx2(&x[54], &x[53]); + btf_16_adds_subs_avx2(&x[56], &x[59]); + btf_16_adds_subs_avx2(&x[57], &x[58]); + btf_16_adds_subs_avx2(&x[63], &x[60]); + btf_16_adds_subs_avx2(&x[62], &x[61]); +} + +static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit); +} + +static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + btf_16_adds_subs_avx2(&x[16], &x[19]); + btf_16_adds_subs_avx2(&x[17], &x[18]); + btf_16_adds_subs_avx2(&x[23], &x[20]); + btf_16_adds_subs_avx2(&x[22], &x[21]); + btf_16_adds_subs_avx2(&x[24], &x[27]); + btf_16_adds_subs_avx2(&x[25], &x[26]); + btf_16_adds_subs_avx2(&x[31], &x[28]); + btf_16_adds_subs_avx2(&x[30], &x[29]); + idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); +} + +static INLINE void 
idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[39]); + btf_16_adds_subs_avx2(&x[33], &x[38]); + btf_16_adds_subs_avx2(&x[34], &x[37]); + btf_16_adds_subs_avx2(&x[35], &x[36]); + btf_16_adds_subs_avx2(&x[47], &x[40]); + btf_16_adds_subs_avx2(&x[46], &x[41]); + btf_16_adds_subs_avx2(&x[45], &x[42]); + btf_16_adds_subs_avx2(&x[44], &x[43]); + btf_16_adds_subs_avx2(&x[48], &x[55]); + btf_16_adds_subs_avx2(&x[49], &x[54]); + btf_16_adds_subs_avx2(&x[50], &x[53]); + btf_16_adds_subs_avx2(&x[51], &x[52]); + btf_16_adds_subs_avx2(&x[63], &x[56]); + btf_16_adds_subs_avx2(&x[62], &x[57]); + btf_16_adds_subs_avx2(&x[61], &x[58]); + btf_16_adds_subs_avx2(&x[60], &x[59]); +} + +static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_adds_subs_avx2(&x[16], &x[23]); + btf_16_adds_subs_avx2(&x[17], &x[22]); + btf_16_adds_subs_avx2(&x[18], &x[21]); + btf_16_adds_subs_avx2(&x[19], &x[20]); + btf_16_adds_subs_avx2(&x[31], &x[24]); + btf_16_adds_subs_avx2(&x[30], &x[25]); + btf_16_adds_subs_avx2(&x[29], &x[26]); + btf_16_adds_subs_avx2(&x[28], 
&x[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit); +} + +static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[15]); + btf_16_adds_subs_avx2(&x[1], &x[14]); + btf_16_adds_subs_avx2(&x[2], &x[13]); + btf_16_adds_subs_avx2(&x[3], &x[12]); + btf_16_adds_subs_avx2(&x[4], &x[11]); + btf_16_adds_subs_avx2(&x[5], &x[10]); + btf_16_adds_subs_avx2(&x[6], &x[9]); + btf_16_adds_subs_avx2(&x[7], &x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[47]); + btf_16_adds_subs_avx2(&x[33], &x[46]); + btf_16_adds_subs_avx2(&x[34], &x[45]); + btf_16_adds_subs_avx2(&x[35], &x[44]); + btf_16_adds_subs_avx2(&x[36], &x[43]); + btf_16_adds_subs_avx2(&x[37], &x[42]); + btf_16_adds_subs_avx2(&x[38], &x[41]); + btf_16_adds_subs_avx2(&x[39], &x[40]); + btf_16_adds_subs_avx2(&x[63], &x[48]); + btf_16_adds_subs_avx2(&x[62], &x[49]); + btf_16_adds_subs_avx2(&x[61], &x[50]); + 
btf_16_adds_subs_avx2(&x[60], &x[51]); + btf_16_adds_subs_avx2(&x[59], &x[52]); + btf_16_adds_subs_avx2(&x[58], &x[53]); + btf_16_adds_subs_avx2(&x[57], &x[54]); + btf_16_adds_subs_avx2(&x[56], &x[55]); +} + +static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[31]); + btf_16_adds_subs_avx2(&x[1], &x[30]); + btf_16_adds_subs_avx2(&x[2], &x[29]); + btf_16_adds_subs_avx2(&x[3], &x[28]); + btf_16_adds_subs_avx2(&x[4], &x[27]); + btf_16_adds_subs_avx2(&x[5], &x[26]); + btf_16_adds_subs_avx2(&x[6], &x[25]); + btf_16_adds_subs_avx2(&x[7], &x[24]); + btf_16_adds_subs_avx2(&x[8], &x[23]); + btf_16_adds_subs_avx2(&x[9], &x[22]); + btf_16_adds_subs_avx2(&x[10], &x[21]); + btf_16_adds_subs_avx2(&x[11], &x[20]); + btf_16_adds_subs_avx2(&x[12], &x[19]); + btf_16_adds_subs_avx2(&x[13], &x[18]); + btf_16_adds_subs_avx2(&x[14], &x[17]); + btf_16_adds_subs_avx2(&x[15], &x[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit); +} + +static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { + btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]); + btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]); + 
btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]); + btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]); + btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]); + btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]); + btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]); + btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]); + btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]); + btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]); + btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]); + btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]); + btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]); + btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]); + btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]); + btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]); + btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]); + btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]); + btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]); + btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]); + btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]); + btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]); + btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]); + btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]); + btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]); + btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]); + btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]); + btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]); + btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]); + btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]); + btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]); + 
btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]); +} + +static void idct64_low1_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + output[0] = x[0]; + output[63] = x[0]; + output[1] = x[1]; + output[62] = x[1]; + output[2] = x[1]; + output[61] = x[1]; + output[3] = x[0]; + output[60] = x[0]; + output[4] = x[0]; + output[59] = x[0]; + output[5] = x[1]; + output[58] = x[1]; + output[6] = x[1]; + output[57] = x[1]; + output[7] = x[0]; + output[56] = x[0]; + output[8] = x[0]; + output[55] = x[0]; + output[9] = x[1]; + output[54] = x[1]; + output[10] = x[1]; + output[53] = x[1]; + output[11] = x[0]; + output[52] = x[0]; + output[12] = x[0]; + output[51] = x[0]; + output[13] = x[1]; + output[50] = x[1]; + output[14] = x[1]; + output[49] = x[1]; + output[15] = x[0]; + output[48] = x[0]; + output[16] = x[0]; + output[47] = x[0]; + output[17] = x[1]; + output[46] = x[1]; + output[18] = x[1]; + output[45] = x[1]; + output[19] = x[0]; + output[44] = x[0]; + output[20] = x[0]; + output[43] = x[0]; + output[21] = x[1]; + output[42] = x[1]; + output[22] = x[1]; + output[41] = x[1]; + output[23] = x[0]; + output[40] = x[0]; + output[24] = x[0]; + output[39] = x[0]; + output[25] = x[1]; + output[38] = x[1]; + output[26] = x[1]; + output[37] = x[1]; + output[27] = x[0]; + output[36] = x[0]; + output[28] = x[0]; + output[35] = x[0]; + output[29] = x[1]; + output[34] = x[1]; + output[30] = x[1]; + output[33] = x[1]; + output[31] = x[0]; + output[32] = x[0]; +} + +static void idct64_low8_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i cospi_m04_p60 = 
pair_set_w16_epi16(-cospi[4], cospi[60]); + const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; + x[0] = input[0]; + x[8] = input[4]; + x[16] = input[2]; + x[24] = input[6]; + x[32] = input[1]; + x[40] = input[5]; + x[48] = input[3]; + x[56] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[38] = x[39]; + x[41] = x[40]; + x[46] = x[47]; + x[49] = x[48]; + x[54] = x[55]; + x[57] = x[56]; + x[62] = x[63]; + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + x[17] = x[16]; + x[22] 
= x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r,
                  INV_COS_BIT);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r,
                  INV_COS_BIT);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                  INV_COS_BIT);
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 9~11
  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage11_avx2(output, x);
}

// 64-point inverse DCT variant that reads only input[0] .. input[15]; no
// other input element is accessed.
//
// input:  array of __m256i coefficient vectors; only the first 16 are read.
// output: written by idct64_stage11_avx2() from the 64-entry work array x.
//         NOTE(review): presumably 64 vectors are written - confirm against
//         idct64_stage11_avx2.
//
// Wherever a butterfly would combine a loaded value with a coefficient that
// was never loaded, the one-input form btf_16_w16_0_avx2 is used, and plain
// copies (x[a] = x[b]) stand in for add/sub steps that have a single operand.
static void idct64_low16_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding constant 1 << (INV_COS_BIT - 1), passed to the two-input
  // butterflies and to the shared stage helpers.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Paired cosine weights consumed by btf_16_w16_avx2.
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1
  // Scatter the 16 inputs into every fourth slot of the work array; the other
  // slots are filled by later stages before they are read.
  __m256i x[64];
  x[0] = input[0];
  x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
                  INV_COS_BIT);
  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                  INV_COS_BIT);
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 9~11
  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage11_avx2(output, x);
}

// 64-point inverse DCT variant that reads only input[0] .. input[31]; no
// other input element is accessed.
//
// input:  array of __m256i coefficient vectors; only the first 32 are read.
// output: written by idct64_stage11_avx2() from the 64-entry work array x.
//
// Stages 2-6 use the one-input butterfly btf_16_w16_0_avx2 on the freshly
// loaded slots; from the stage after each load onward the regular add/sub
// and two-input butterflies are used.
static void idct64_low32_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding constant 1 << (INV_COS_BIT - 1), passed to the two-input
  // butterflies and to the shared stage helpers.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Paired cosine weights consumed by btf_16_w16_avx2.
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1
  // Scatter the 32 inputs into the even slots of the work array; the odd
  // slots are produced by stages 2-6 before they are read.
  __m256i x[64];
  x[0] = input[0];
  x[2] = input[16];
  x[4] = input[8];
  x[6] = input[24];
  x[8] = input[4];
  x[10] = input[20];
  x[12] = input[12];
  x[14] = input[28];
  x[16] = input[2];
  x[18] = input[18];
  x[20] = input[10];
  x[22] = input[26];
  x[24] = input[6];
  x[26] = input[22];
  x[28] = input[14];
  x[30] = input[30];
  x[32] = input[1];
  x[34] = input[17];
  x[36] = input[9];
  x[38] = input[25];
  x[40] = input[5];
  x[42] = input[21];
  x[44] = input[13];
  x[46] = input[29];
  x[48] = input[3];
  x[50] = input[19];
  x[52] = input[11];
  x[54] = input[27];
  x[56] = input[7];
  x[58] = input[23];
  x[60] = input[15];
  x[62] = input[31];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  btf_16_adds_subs_avx2(&x[32], &x[33]);
  btf_16_adds_subs_avx2(&x[35], &x[34]);
  btf_16_adds_subs_avx2(&x[36], &x[37]);
  btf_16_adds_subs_avx2(&x[39], &x[38]);
  btf_16_adds_subs_avx2(&x[40], &x[41]);
  btf_16_adds_subs_avx2(&x[43], &x[42]);
  btf_16_adds_subs_avx2(&x[44], &x[45]);
  btf_16_adds_subs_avx2(&x[47], &x[46]);
  btf_16_adds_subs_avx2(&x[48], &x[49]);
  btf_16_adds_subs_avx2(&x[51], &x[50]);
  btf_16_adds_subs_avx2(&x[52], &x[53]);
  btf_16_adds_subs_avx2(&x[55], &x[54]);
  btf_16_adds_subs_avx2(&x[56], &x[57]);
  btf_16_adds_subs_avx2(&x[59], &x[58]);
  btf_16_adds_subs_avx2(&x[60], &x[61]);
  btf_16_adds_subs_avx2(&x[63], &x[62]);

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  btf_16_adds_subs_avx2(&x[16], &x[17]);
  btf_16_adds_subs_avx2(&x[19], &x[18]);
  btf_16_adds_subs_avx2(&x[20], &x[21]);
  btf_16_adds_subs_avx2(&x[23], &x[22]);
  btf_16_adds_subs_avx2(&x[24], &x[25]);
  btf_16_adds_subs_avx2(&x[27], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[29]);
  btf_16_adds_subs_avx2(&x[31], &x[30]);
  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(&x[8], &x[9]);
  btf_16_adds_subs_avx2(&x[11], &x[10]);
  btf_16_adds_subs_avx2(&x[12], &x[13]);
  btf_16_adds_subs_avx2(&x[15], &x[14]);
  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[5]);
  btf_16_adds_subs_avx2(&x[7], &x[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
                  INV_COS_BIT);
  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 7
  btf_16_adds_subs_avx2(&x[0], &x[3]);
  btf_16_adds_subs_avx2(&x[1], &x[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                  INV_COS_BIT);
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 9~11
  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage11_avx2(output, x);
}

// Signature shared by every 1D inverse-transform kernel in this file: reads
// coefficient vectors from `input` and writes results to `output`. Callers in
// this file pass the same buffer for both arguments.
typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output);

// 1D functions process 16 pixels at one time.
// Kernel lookup table, indexed as
//   [size index (get_txw_idx / get_txh_idx)][1D transform type][zeros class]
// where the last index comes from lowbd_txfm_all_1d_zeros_idx[eobx or eoby].
// The first two size rows are all NULL (no 16-lane kernel exists for them),
// as are every third-type row and the ADST rows of the two largest sizes.
// NOTE(review): rows presumably follow the TX_4X4 .. TX_64X64 order of
// TX_SIZES - confirm against the enum.
static const transform_1d_avx2
    lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      {
          { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
          { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
          idct64_low32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };

// only process w >= 16 h >= 16
//
// 2D inverse transform + add-to-destination for transform types where both
// passes use a real 1D kernel (no identity pass).
//
// input:   32-bit coefficients, read in 16-column strips with a row pitch of
//          AOMMIN(32, block height).
// output:  8-bit destination, updated through lowbd_write_buffer_16xn_avx2.
// stride:  pitch of `output` in bytes.
// tx_type: selects the row/column kernels and the flip configuration.
// tx_size: selects dimensions and the per-pass shifts.
// eob:     end-of-block position, converted to eobx/eoby to pick the cheapest
//          kernel and to limit how much input is loaded.
static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  // Intermediate (transposed) buffer: up to 64 rows x 64 columns, stored as
  // 16-lane vectors.
  __m256i buf1[64 * 16];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div16 = txfm_size_col >> 4;
  // eobx/eoby rounded up to the next multiple of 16 (as a count / as a number
  // of 16-row strips).
  const int buf_size_nonzero_w = ((eobx + 16) >> 4) << 4;
  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 row_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_avx2 col_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // Multiplying by 1 << (15 + shift) with mulhrs is a rounded right shift by
  // -shift (mulhrs computes (a * b + 0x4000) >> 15).
  const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
  // Row pass: one iteration per 16-row strip that can hold non-zero data.
  for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
    __m256i buf0[64];
    load_buffer_32bit_to_16bit_w16_avx2(input + 16 * i, input_stride, buf0,
                                        buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      round_shift_avx2(buf0, buf0, buf_size_nonzero_w);  // rect special code
    }
    row_txfm(buf0, buf0);
    for (int j = 0; j < txfm_size_col; ++j) {
      buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
    }

    // Transpose each 16x16 tile into buf1; with lr_flip the tiles are also
    // flipped and stored in reverse tile order.
    __m256i *buf1_cur = buf1 + (i << 4);
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        __m256i temp[16];
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
        int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
        transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
      }
    } else {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
      }
    }
  }
  // Column pass: in place on each 16-column strip, followed by the second
  // rounded shift.
  const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
  for (int i = 0; i < buf_size_w_div16; i++) {
    __m256i *buf1_cur = buf1 + i * txfm_size_row;
    col_txfm(buf1_cur, buf1_cur);
    for (int j = 0; j < txfm_size_row; ++j) {
      buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
    }
  }
  // Add the residual to the destination, 16 columns at a time.
  for (int i = 0; i < buf_size_w_div16; i++) {
    lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
                                 stride, ud_flip, txfm_size_row);
  }
}

// Identity "row transform" for `height` rows of 16 coefficients.
//
// out:       receives one 16-lane vector per row.
// input:     32-bit coefficients; consecutive rows are `stride` elements
//            apart.
// shift:     first-pass shift for the transform size. The code shifts by
//            NewSqrt2Bits - shift, i.e. scaling and rounding are fused.
//            NOTE(review): callers pass shift[0], asserted negative in the
//            8x8 path - presumably always <= 0 here as well.
// txw_idx:   index into NewSqrt2list selecting the scale factor.
// rect_type: when +1 or -1 the input is first scaled by NewInvSqrt2.
static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
                                           int stride, int shift, int height,
                                           int txw_idx, int rect_type) {
  const int32_t *input_row = input;
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
  // Combined rounding term for the scale step and the shift step.
  const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
                                       (1 << (NewSqrt2Bits - shift - 1)));
  const __m256i one = _mm256_set1_epi16(1);
  // Interleaved (scale, _r) pairs: madd against (src, 1) pairs then yields
  // src * scale + _r in each 32-bit lane.
  const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
  if (rect_type != 1 && rect_type != -1) {
    for (int i = 0; i < height; ++i) {
      const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
      input_row += stride;
      __m256i lo = _mm256_unpacklo_epi16(src, one);
      __m256i hi = _mm256_unpackhi_epi16(src, one);
      lo = _mm256_madd_epi16(lo, scale__r);
      hi = _mm256_madd_epi16(hi, scale__r);
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm256_packs_epi32(lo, hi);
    }
  } else {
    // Rectangular blocks: pre-scale by NewInvSqrt2 using a rounded
    // high-multiply before the identity scaling.
    const __m256i rect_scale =
        _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
    for (int i = 0; i < height; ++i) {
      __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
      src = _mm256_mulhrs_epi16(src, rect_scale);
      input_row += stride;
      __m256i lo = _mm256_unpacklo_epi16(src, one);
      __m256i hi = _mm256_unpackhi_epi16(src, one);
      lo = _mm256_madd_epi16(lo, scale__r);
      hi = _mm256_madd_epi16(hi, scale__r);
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm256_packs_epi32(lo, hi);
    }
  }
}

// Identity "column transform" for `height` rows of 16 values, added directly
// to the destination.
//
// output/stride: 8-bit destination and its pitch; one row is updated per
//                vector in `buf` via write_recon_w16_avx2.
// buf:           16-lane residual vectors, one per output row.
// shift:         second-pass shift; the code shifts right by -shift, so a
//                negative value is expected.
// txh_idx:       index into NewSqrt2list selecting the scale factor.
static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
                                           __m256i *buf, int shift, int height,
                                           int txh_idx) {
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
  const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
  const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
  const __m256i one = _mm256_set1_epi16(1);
  // Interleaved (scale, rounding) pairs for the madd below.
  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
  for (int h = 0; h < height; ++h) {
    __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
    __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
    // buf * scale + rounding, then drop the NewSqrt2Bits fractional bits.
    lo = _mm256_madd_epi16(lo, scale_coeff);
    hi = _mm256_madd_epi16(hi, scale_coeff);
    lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
    hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
    // Second rounded shift, done separately from the scaling step.
    lo = _mm256_add_epi32(lo, shift__r);
    hi = _mm256_add_epi32(hi, shift__r);
    lo = _mm256_srai_epi32(lo, -shift);
    hi = _mm256_srai_epi32(hi, -shift);
    const __m256i x = _mm256_packs_epi32(lo, hi);
    write_recon_w16_avx2(x, output);
    output += stride;
  }
}

// 2D inverse transform + add for IDTX (identity in both directions).
// Processes the block as 16x16 tiles; at most the top-left 32x32 region is
// visited (col_max / row_max are clamped to 32). `eob` is unused.
static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
                                                  uint8_t *output, int stride,
                                                  TX_SIZE tx_size,
                                                  int32_t eob) {
  (void)eob;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int col_max = AOMMIN(32, txfm_size_col);
  const int row_max = AOMMIN(32, txfm_size_row);
  const int input_stride = row_max;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  __m256i buf[32];

  for (int i = 0; i < (col_max >> 4); ++i) {
    for (int j = 0; j < (row_max >> 4); j++) {
      // Row identity on one tile, transpose in place, then column identity
      // straight into the destination tile.
      iidentity_row_16xn_avx2(buf, input + j * 16 + i * 16 * input_stride,
                              row_max, shift[0], 16, txw_idx, rect_type);
      transpose_16bit_16x16_avx2(buf, buf);
      iidentity_col_16xn_avx2(output + i * 16 + j * 16 * stride, stride, buf,
                              shift[1], 16, txh_idx);
    }
  }
}

// 2D inverse transform + add where the row pass is the identity and the
// column pass uses the 1D kernel selected by vitx_1d_tab[tx_type].
// Only the 16-wide strips indicated by eobx are processed; rows are written
// bottom-up when ud_flip is set.
static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row);
  const int input_stride = txfm_size_row_notzero;
  const int buf_size_w_div16 = (eobx + 16) >> 4;
  const int buf_size_h_div16 = (eoby + 16) >> 4;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 col_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_w_div16; i++) {
    __m256i buf0[64];
    // Identity row pass + transpose for every 16x16 tile of this strip.
    for (int j = 0; j < buf_size_h_div16; j++) {
      __m256i *buf0_cur = buf0 + j * 16;
      const int32_t *input_cur = input + i * 16 * input_stride + j * 16;
      iidentity_row_16xn_avx2(buf0_cur, input_cur, input_stride, shift[0], 16,
                              txw_idx, rect_type);
      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
    }
    col_txfm(buf0, buf0);
    // Rounded right shift by -shift[1] via mulhrs, then add to destination.
    __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
    int k = ud_flip ? (txfm_size_row - 1) : 0;
    const int step = ud_flip ? -1 : 1;
    for (int j = 0; j < txfm_size_row; ++j, k += step) {
      __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
      write_recon_w16_avx2(res, output + (i << 4) + j * stride);
    }
  }
}

// 2D inverse transform + add where the column pass is the identity and the
// row pass uses the 1D kernel selected by hitx_1d_tab[tx_type].
// Works on one 16-row strip at a time; buf1 holds a single strip (up to 64
// columns after transposition).
static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m256i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div16 = txfm_size_col >> 4;
  const int buf_size_h_div16 = (eoby + 16) >> 4;
  // NOTE(review): rounds eobx up to a multiple of 8 here, whereas the
  // no-identity path rounds to a multiple of 16 - confirm this is intended.
  const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_avx2 row_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];

  assert(row_txfm != NULL);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_h_div16; i++) {
    __m256i buf0[64];
    load_buffer_32bit_to_16bit_w16_avx2(input + i * 16, input_stride, buf0,
                                        buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      round_shift_avx2(buf0, buf0, buf_size_nonzero_w);  // rect special code
    }
    row_txfm(buf0, buf0);
    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
    // Transpose 16x16 tiles into buf1, mirroring tile order and contents when
    // lr_flip is set.
    __m256i *_buf1 = buf1;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        __m256i temp[16];
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
        transpose_16bit_16x16_avx2(temp,
                                   _buf1 + 16 * (buf_size_w_div16 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
      }
    }
    // Identity column pass, written directly to the destination tiles.
    for (int j = 0; j < buf_size_w_div16; ++j) {
      iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
                              buf1 + j * 16, shift[1], 16, txh_idx);
    }
  }
}

// 8-point kernels for the 8x8 path, indexed as [1D type][eob != 1]: the
// "low1" variants are used when eob == 1, the full kernels otherwise.
static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] = {
  { av1_idct8_low1_ssse3, av1_idct8_sse2 },
  { av1_iadst8_low1_ssse3, av1_iadst8_sse2 }
};

// Loads 8 rows of 8 32-bit coefficients and narrows them (with signed
// saturation) to 8 rows of 8 16-bit values.
//
// in:     first coefficient; rows are `stride` elements apart. Loaded with
//         _mm256_load_si256, so every row address must be 32-byte aligned.
// out:    receives 8 __m128i vectors, one per row.
static INLINE void load_buffer_avx2(const int32_t *in, int stride,
                                    __m128i *out) {
  const __m256i a = _mm256_load_si256((const __m256i *)in);
  const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1));
  const __m256i c = _mm256_load_si256((const __m256i *)(in + stride * 2));
  const __m256i d = _mm256_load_si256((const __m256i *)(in + stride * 3));
  const __m256i e = _mm256_load_si256((const __m256i *)(in + stride * 4));
  const __m256i f = _mm256_load_si256((const __m256i *)(in + stride * 5));
  const __m256i g = _mm256_load_si256((const __m256i *)(in + stride * 6));
  const __m256i h = _mm256_load_si256((const __m256i *)(in + stride * 7));

  // packs works per 128-bit lane, so the two rows come out interleaved in
  // 64-bit chunks; the permutes below restore row order.
  // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
  const __m256i ab_16bit = _mm256_packs_epi32(a, b);
  // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7
  const __m256i cd_16bit = _mm256_packs_epi32(c, d);
  // e0 e1 e2 e3 f0 f1 f2 f3 e4 e5 e6 e7 f4 f5 f6 f7
  const __m256i ef_16bit = _mm256_packs_epi32(e, f);
  // g0 g1 g2 g3 h0 h1 h2 h3 g4 g5 g6 g7 h4 h5 h6 h7
  const __m256i gh_16bit = _mm256_packs_epi32(g, h);

  // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
  const __m256i ab = _mm256_permute4x64_epi64(ab_16bit, 0xd8);
  // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7
  const __m256i cd = _mm256_permute4x64_epi64(cd_16bit, 0xd8);
  // e0 e1 e2 e3 e4 e5 e6 e7 f0 f1 f2 f3 f4 f5 f6 f7
  const __m256i ef = _mm256_permute4x64_epi64(ef_16bit, 0xd8);
  // g0 g1 g2 g3 g4 g5 g6 g7 h0 h1 h2 h3 h4 h5 h6 h7
  const __m256i gh = _mm256_permute4x64_epi64(gh_16bit, 0xd8);

  out[0] = _mm256_castsi256_si128(ab);
  out[1] = _mm256_extractf128_si256(ab, 1);
  out[2] = _mm256_castsi256_si128(cd);
  out[3] = _mm256_extractf128_si256(cd, 1);
  out[4] = _mm256_castsi256_si128(ef);
  out[5] = _mm256_extractf128_si256(ef, 1);
  out[6] = _mm256_castsi256_si128(gh);
  out[7] = _mm256_extractf128_si256(gh, 1);
}

// Applies a rounded right shift by -bit to an 8x8 block of 16-bit values and
// transposes it, optionally reversing the row order first.
//
// in:      8 row vectors.
// out:     8 vectors of the transposed result.
// bit:     negative shift amount (scale is 1 << (15 + bit), used with
//          mulhrs).
// lr_flip: pointer to the flip flag; only read. When non-zero the rows are
//          consumed from in[7] down to in[0].
//
// The lane-layout comments below show the flipped case (row 7 first).
static INLINE void round_and_transpose_avx2(const __m128i *const in,
                                            __m128i *const out, int bit,
                                            int *lr_flip) {
  __m256i buf_temp[4];
  const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
  int j = *lr_flip ? 7 : 0;
  const int step = *lr_flip ? -1 : 1;

  // Pair row j with the row four steps further on in one 256-bit register.
  // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
  buf_temp[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
                                        in[j + 4 * step], 1);
  j += step;
  // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
  buf_temp[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
                                        in[j + 4 * step], 1);
  j += step;
  // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
  buf_temp[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
                                        in[j + 4 * step], 1);
  j += step;
  // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
  buf_temp[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
                                        in[j + 4 * step], 1);

  // Rounded shift of all eight rows.
  // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
  buf_temp[0] = _mm256_mulhrs_epi16(buf_temp[0], scale);
  // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
  buf_temp[1] = _mm256_mulhrs_epi16(buf_temp[1], scale);
  // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
  buf_temp[2] = _mm256_mulhrs_epi16(buf_temp[2], scale);
  // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
  buf_temp[3] = _mm256_mulhrs_epi16(buf_temp[3], scale);

  // 16-bit interleave.
  // 70 60 71 61 72 62 73 63 | 30 20 31 21 32 22 33 23
  const __m256i unpcklo0 = _mm256_unpacklo_epi16(buf_temp[0], buf_temp[1]);
  // 74 64 75 65 76 66 77 67 | 34 24 35 25 36 26 37 27
  const __m256i unpckhi0 = _mm256_unpackhi_epi16(buf_temp[0], buf_temp[1]);
  // 50 40 51 41 52 42 53 43 | 10 00 11 01 12 02 13 03
  const __m256i unpcklo1 = _mm256_unpacklo_epi16(buf_temp[2], buf_temp[3]);
  // 54 44 55 45 56 46 57 47 | 14 04 15 05 16 06 17 07
  const __m256i unpckhi1 = _mm256_unpackhi_epi16(buf_temp[2], buf_temp[3]);

  // 32-bit interleave.
  // 70 60 50 40 71 61 51 41 | 30 20 10 00 31 21 11 01
  const __m256i unpcklo00 = _mm256_unpacklo_epi32(unpcklo0, unpcklo1);
  // 72 62 52 42 73 63 53 43 | 32 22 12 02 33 23 13 03
  const __m256i unpckhi00 = _mm256_unpackhi_epi32(unpcklo0, unpcklo1);
  // 74 64 54 44 75 65 55 45 | 34 24 14 04 35 25 15 05
  const __m256i unpcklo01 = _mm256_unpacklo_epi32(unpckhi0, unpckhi1);
  // 76 66 56 46 77 67 57 47 | 36 26 16 06 37 27 17 07
  const __m256i unpckhi01 = _mm256_unpackhi_epi32(unpckhi0, unpckhi1);

  // Cross-lane 64-bit shuffle so each 128-bit half is one output row.
  // 70 60 50 40 30 20 10 00 | 71 61 51 41 31 21 11 01
  const __m256i reg_00 = _mm256_permute4x64_epi64(unpcklo00, 0xd8);
  // 72 62 52 42 32 22 12 02 | 73 63 53 43 33 23 13 03
  const __m256i reg_01 = _mm256_permute4x64_epi64(unpckhi00, 0xd8);
  // 74 64 54 44 34 24 14 04 | 75 65 55 45 35 25 15 05
  const __m256i reg_10 = _mm256_permute4x64_epi64(unpcklo01, 0xd8);
  // 76 66 56 46 36 26 16 06 | 77 67 57 47 37 27 17 07
  const __m256i reg_11 = _mm256_permute4x64_epi64(unpckhi01, 0xd8);

  // 70 60 50 40 30 20 10 00
  out[0] = _mm256_castsi256_si128(reg_00);
  // 71 61 51 41 31 21 11 01
  out[1] = _mm256_extracti128_si256(reg_00, 1);
  // 72 62 52 42 32 22 12 02
  out[2] = _mm256_castsi256_si128(reg_01);
  // 73 63 53 43 33 23 13 03
  out[3] = _mm256_extracti128_si256(reg_01, 1);
  // 74 64 54 44 34 24 14 04
  out[4] = _mm256_castsi256_si128(reg_10);
  // 75 65 55 45 35 25 15 05
  out[5] = _mm256_extracti128_si256(reg_10, 1);
  // 76 66 56 46 36 26 16 06
  out[6] = _mm256_castsi256_si128(reg_11);
  // 77 67 57 47 37 27 17 07
  out[7] = _mm256_extracti128_si256(reg_11, 1);
}

// Applies a rounded right shift by -bit to an 8x8 residual block, adds it to
// the 8x8 destination pixels with saturation, and stores the result.
//
// in:     8 residual row vectors (8 x int16 each).
// bit:    negative shift amount (scale is 1 << (15 + bit), used with mulhrs).
// output: top-left destination pixel; 8 bytes are read and written per row.
// stride: destination pitch in bytes.
// flipud: when non-zero, residual rows are consumed from in[7] down to in[0].
static INLINE void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit,
                                                       uint8_t *output,
                                                       int stride, int flipud) {
  __m256i in_256[4], v_256[4];
  int j = flipud ? 7 : 0;
  const int step = flipud ? -1 : 1;
  const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
  const __m256i zero = _mm256_setzero_si256();
  // Pack two consecutive residual rows into each 256-bit register.
  // in[0], in[1]
  in_256[0] =
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
  j += 2 * step;
  // in[2], in[3]
  in_256[1] =
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
  j += 2 * step;
  // in[4], in[5]
  in_256[2] =
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
  j += 2 * step;
  // in[6], in[7]
  in_256[3] =
      _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);

  // i00 i01 i02 i03 i04 i05 i06 i07 i10 i11 i12 i13 i14 i15 i16 i17
  in_256[0] = _mm256_mulhrs_epi16(in_256[0], scale);
  // i20 i21 i22 i23 i24 i25 i26 i27 i30 i31 i32 i33 i34 i35 i36 i37
  in_256[1] = _mm256_mulhrs_epi16(in_256[1], scale);
  // i40 i41 i42 i43 i44 i45 i46 i47 i50 i51 i52 i53 i54 i55 i56 i57
  in_256[2] = _mm256_mulhrs_epi16(in_256[2], scale);
  // i60 i61 i62 i63 i64 i65 i66 i67 i70 i71 i72 i73 i74 i75 i76 i77
  in_256[3] = _mm256_mulhrs_epi16(in_256[3], scale);

  // Load the eight destination rows (8 pixels each).
  const __m128i v0 = _mm_loadl_epi64((__m128i const *)(output));
  const __m128i v1 = _mm_loadl_epi64((__m128i const *)(output + stride));
  const __m128i v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
  const __m128i v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
  const __m128i v4 = _mm_loadl_epi64((__m128i const *)(output + 4 * stride));
  const __m128i v5 = _mm_loadl_epi64((__m128i const *)(output + 5 * stride));
  const __m128i v6 = _mm_loadl_epi64((__m128i const *)(output + 6 * stride));
  const __m128i v7 = _mm_loadl_epi64((__m128i const *)(output + 7 * stride));

  v_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(v0), v1, 1);
  v_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(v2), v3, 1);
  v_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(v4), v5, 1);
  v_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(v6), v7, 1);

  // Zero-extend the pixels to 16 bits so they can be added to the residual.
  const __m256i unpcklo0 = _mm256_unpacklo_epi8(v_256[0], zero);
  const __m256i unpcklo1 = _mm256_unpacklo_epi8(v_256[1], zero);
  const __m256i unpcklo2 = _mm256_unpacklo_epi8(v_256[2], zero);
  const __m256i unpcklo3 = _mm256_unpacklo_epi8(v_256[3], zero);
  // 00 01 10 11
  const __m256i x0 = _mm256_adds_epi16(in_256[0], unpcklo0);
  // 20 21 30 31
  const __m256i x1 = _mm256_adds_epi16(in_256[1], unpcklo1);
  // 40 41 50 51
  const __m256i x2 = _mm256_adds_epi16(in_256[2], unpcklo2);
  // 60 61 70 71
  const __m256i x3 = _mm256_adds_epi16(in_256[3], unpcklo3);

  // Saturating pack back to 8 bits; packus works per 128-bit lane, which is
  // why the rows end up interleaved as noted.
  // 00 01 20 21 10 11 30 31
  const __m256i res_0123 = _mm256_packus_epi16(x0, x1);
  // 40 41 60 61 50 51 70 71
  const __m256i res_4567 = _mm256_packus_epi16(x2, x3);

  // 00 01 20 21
  const __m128i res_02 = _mm256_castsi256_si128(res_0123);
  // 10 11 30 31
  const __m128i res_13 = _mm256_extracti128_si256(res_0123, 1);
  // 40 41 60 61
  const __m128i res_46 = _mm256_castsi256_si128(res_4567);
  // 50 51 70 71
  const __m128i res_57 = _mm256_extracti128_si256(res_4567, 1);

  // 00 01
  _mm_storel_epi64((__m128i *)(output), res_02);
  // 10 11
  _mm_storel_epi64((__m128i *)(output + stride), res_13);
  // 20 21
  _mm_storel_epi64((__m128i *)(output + 2 * stride),
                   _mm_unpackhi_epi64(res_02, res_02));
  // 30 31
  _mm_storel_epi64((__m128i *)(output + 3 * stride),
                   _mm_unpackhi_epi64(res_13, res_13));
  // 40 41
  _mm_storel_epi64((__m128i *)(output + 4 * stride), res_46);
  // 50 51
  _mm_storel_epi64((__m128i *)(output + 5 * stride), res_57);
  // 60 61
  _mm_storel_epi64((__m128i *)(output + 6 * stride),
                   _mm_unpackhi_epi64(res_46, res_46));
  // 70 71
  _mm_storel_epi64((__m128i *)(output + 7 * stride),
                   _mm_unpackhi_epi64(res_57, res_57));
}

// The AVX2 implementation is advantageous when multiple operations are
// combined together.
+static INLINE void lowbd_inv_txfm2d_8x8_no_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m128i buf1[8]; + const int input_stride = 8; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + assert(hitx_1d_tab[tx_type] < 2); + assert(vitx_1d_tab[tx_type] < 2); + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][eob != 1]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][eob != 1]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + __m128i buf0[8]; + __m128i *buf0_cur = buf0; + load_buffer_avx2(input, input_stride, buf0_cur); + row_txfm(buf0, buf0); + + assert(shift[0] < 0); + __m128i *_buf1 = buf1; + round_and_transpose_avx2(buf0, _buf1, shift[0], &lr_flip); + assert(shift[1] < 0); + col_txfm(buf1, buf1); + round_shift_lowbd_write_buffer_avx2(buf1, shift[1], output, stride, ud_flip); +} + +// AVX2 implementation of 8x8 inverse transform. Observed that coding AVX2 for +// tx_type with identity in either of the direction has no advantage. 
+static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case IDTX: + av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size); + + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + default: + lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + } +} + +// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64 +static INLINE void lowbd_inv_txfm2d_add_universe_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)eob; + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: // ADST in vertical, DCT in horizontal + case DCT_ADST: // DCT in vertical, ADST in horizontal + case ADST_ADST: // ADST in both directions + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + case IDTX: + lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + default: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + } +} + +void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + switch (tx_size) { + case TX_4X4: + case TX_4X8: + case TX_8X4: + case 
TX_8X16: + case TX_16X8: + case TX_4X16: + case TX_16X4: + case TX_8X32: + case TX_32X8: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_8X8: + lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_16X16: + case TX_32X32: + case TX_64X64: + case TX_16X32: + case TX_32X16: + case TX_32X64: + case TX_64X32: + case TX_16X64: + case TX_64X16: + default: + lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h new file mode 100644 index 0000000000..a09dea389f --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ +#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// half input is zero +#define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \ + do { \ + const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \ + const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \ + const __m256i _in = in; \ + out0 = _mm256_mulhrs_epi16(_in, _w0); \ + out1 = _mm256_mulhrs_epi16(_in, _w1); \ + } while (0) + +static INLINE void round_shift_avx2(const __m256i *input, __m256i *output, + int size) { + const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8); + for (int i = 0; i < size; ++i) { + output[i] = _mm256_mulhrs_epi16(input[i], scale); + } +} + +static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) { + __m128i pred = _mm_loadu_si128((__m128i const *)(output)); + __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res); + __m128i y = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168)); + _mm_storeu_si128((__m128i *)(output), y); +} + +static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output, + int stride, int flipud, + int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + write_recon_w16_avx2(in[j], output + i * stride); + } +} + +void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob); +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c new file mode 100644 index 0000000000..79a6064c3e --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c @@ -0,0 +1,2904 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse2.h" + +// TODO(venkatsanampudi@ittiam.com): move this to header file + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +// TODO(binpengsmail@gmail.com): replace some for loop with do {} while + +static void idct4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[4]; + x[0] = input[0]; + x[1] = input[2]; + x[2] = input[1]; + x[3] = input[3]; + + // stage 2 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + + // stage 3 + btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); + btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); +} + +static void idct4_w4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[4]; + x[0] = input[0]; + x[1] = input[2]; + x[2] = input[1]; + 
x[3] = input[3]; + + // stage 2 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + + // stage 3 + btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); + btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); +} + +void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 4 + // stage 5 + output[0] = x[0]; + output[7] = x[0]; + output[1] = x[1]; + output[6] = x[1]; + output[2] = x[1]; + output[5] = x[1]; + output[3] = x[0]; + output[4] = x[0]; +} + +void av1_idct8_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[0]; + x[1] = input[4]; + x[2] = input[2]; + x[3] = input[6]; + x[4] = input[1]; + x[5] = input[5]; + x[6] = input[3]; + x[7] = input[7]; + + // stage 2 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + + // stage 3 + btf_16_sse2(cospi_p32_p32, 
cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + + // stage 4 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + + // stage 5 + btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); + btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); + btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); + btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); +} + +static void idct8_w4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[0]; + x[1] = input[4]; + x[2] = input[2]; + x[3] = input[6]; + x[4] = input[1]; + x[5] = input[5]; + x[6] = input[3]; + x[7] = input[7]; + + // stage 2 + btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + + // stage 3 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + 
btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + + // stage 4 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + + // stage 5 + btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); + btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); + btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); + btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); +} + +static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); +} + +static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); +} + +static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]); + btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]); + btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]); + btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]); + btf_16_adds_subs_out_sse2(output[4], 
output[11], x[4], x[11]); + btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]); + btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]); + btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]); +} + +static void idct16_low1_ssse3(const __m128i *input, __m128i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 5 + // stage 6 + // stage 7 + output[0] = x[0]; + output[15] = x[0]; + output[1] = x[1]; + output[14] = x[1]; + output[2] = x[1]; + output[13] = x[1]; + output[3] = x[0]; + output[12] = x[0]; + output[4] = x[0]; + output[11] = x[0]; + output[5] = x[1]; + output[10] = x[1]; + output[6] = x[1]; + output[9] = x[1]; + output[7] = x[0]; + output[8] = x[0]; +} + +static void idct16_low8_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[2] = input[4]; + x[4] = input[2]; + x[6] = input[6]; + x[8] = input[1]; + x[10] = input[5]; + x[12] = input[3]; + x[14] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + + // stage 3 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + 
btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + idct16_stage5_sse2(x, cospi, __rounding, cos_bit); + idct16_stage6_sse2(x, cospi, __rounding, cos_bit); + idct16_stage7_sse2(output, x); +} + +static void idct16_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = 
pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[1] = input[8]; + x[2] = input[4]; + x[3] = input[12]; + x[4] = input[2]; + x[5] = input[10]; + x[6] = input[6]; + x[7] = input[14]; + x[8] = input[1]; + x[9] = input[9]; + x[10] = input[5]; + x[11] = input[13]; + x[12] = input[3]; + x[13] = input[11]; + x[14] = input[7]; + x[15] = input[15]; + + // stage 2 + btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + + // stage 3 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + // stage 5~7 + idct16_stage5_sse2(x, cospi, __rounding, cos_bit); + idct16_stage6_sse2(x, cospi, __rounding, cos_bit); + idct16_stage7_sse2(output, x); +} + +static void idct16_w4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = 
pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[1] = input[8]; + x[2] = input[4]; + x[3] = input[12]; + x[4] = input[2]; + x[5] = input[10]; + x[6] = input[6]; + x[7] = input[14]; + x[8] = input[1]; + x[9] = input[9]; + x[10] = input[5]; + x[11] = input[13]; + x[12] = input[3]; + x[13] = input[11]; + x[14] = input[7]; + x[15] = input[15]; + + // stage 2 + btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + + // stage 3 + btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + 
btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + + // stage 6 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + + // stage 7 + idct16_stage7_sse2(output, x); +} + +static INLINE void idct32_high16_stage3_sse2(__m128i *x) { + btf_16_adds_subs_sse2(x[16], x[17]); + btf_16_subs_adds_sse2(x[19], x[18]); + btf_16_adds_subs_sse2(x[20], x[21]); + btf_16_subs_adds_sse2(x[23], x[22]); + btf_16_adds_subs_sse2(x[24], x[25]); + btf_16_subs_adds_sse2(x[27], x[26]); + btf_16_adds_subs_sse2(x[28], x[29]); + btf_16_subs_adds_sse2(x[31], x[30]); +} + +static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], 
-cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); +} + +static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + btf_16_adds_subs_sse2(x[16], x[19]); + btf_16_adds_subs_sse2(x[17], x[18]); + btf_16_subs_adds_sse2(x[23], x[20]); + btf_16_subs_adds_sse2(x[22], x[21]); + btf_16_adds_subs_sse2(x[24], x[27]); + btf_16_adds_subs_sse2(x[25], x[26]); + btf_16_subs_adds_sse2(x[31], x[28]); + btf_16_subs_adds_sse2(x[30], x[29]); +} + +static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + 
btf_16_subs_adds_sse2(x[14], x[13]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); +} + +static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + btf_16_adds_subs_sse2(x[16], x[23]); + btf_16_adds_subs_sse2(x[17], x[22]); + btf_16_adds_subs_sse2(x[18], x[21]); + btf_16_adds_subs_sse2(x[19], x[20]); + btf_16_subs_adds_sse2(x[31], x[24]); + btf_16_subs_adds_sse2(x[30], x[25]); + btf_16_subs_adds_sse2(x[29], x[26]); + btf_16_subs_adds_sse2(x[28], x[27]); +} + +static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[15]); + btf_16_adds_subs_sse2(x[1], x[14]); + btf_16_adds_subs_sse2(x[2], x[13]); + btf_16_adds_subs_sse2(x[3], x[12]); + btf_16_adds_subs_sse2(x[4], x[11]); + btf_16_adds_subs_sse2(x[5], x[10]); + btf_16_adds_subs_sse2(x[6], x[9]); + btf_16_adds_subs_sse2(x[7], x[8]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); + 
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); +} + +static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]); + btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]); + btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]); + btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]); + btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]); + btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]); + btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]); + btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]); + btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]); + btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]); + btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]); + btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]); + btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]); + btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]); + btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]); + btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]); +} + +static void idct32_low1_ssse3(const __m128i *input, __m128i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + output[0] = x[0]; + output[31] = x[0]; + output[1] = x[1]; + output[30] = x[1]; + output[2] = x[1]; + output[29] = x[1]; + output[3] = x[0]; + output[28] = x[0]; + output[4] = x[0]; + output[27] = x[0]; + output[5] = x[1]; + output[26] = x[1]; + output[6] = x[1]; + output[25] = x[1]; + output[7] = x[0]; + output[24] = x[0]; + output[8] = x[0]; + output[23] = x[0]; + output[9] = x[1]; + output[22] = x[1]; + output[10] = x[1]; + output[21] = x[1]; + 
output[11] = x[0]; + output[20] = x[0]; + output[12] = x[0]; + output[19] = x[0]; + output[13] = x[1]; + output[18] = x[1]; + output[14] = x[1]; + output[17] = x[1]; + output[15] = x[0]; + output[16] = x[0]; +} + +static void idct32_low8_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + x[4] = input[4]; + x[8] = input[2]; + x[12] = input[6]; + x[16] = input[1]; + x[20] = input[5]; + x[24] = input[3]; + x[28] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + + // stage 4 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + // stage 6 + x[3] = x[0]; + x[2] = x[1]; + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static void idct32_low16_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + 
__m128i x[32]; + x[0] = input[0]; + x[2] = input[8]; + x[4] = input[4]; + x[6] = input[12]; + x[8] = input[2]; + x[10] = input[10]; + x[12] = input[6]; + x[14] = input[14]; + x[16] = input[1]; + x[18] = input[9]; + x[20] = input[5]; + x[22] = input[13]; + x[24] = input[3]; + x[26] = input[11]; + x[28] = input[7]; + x[30] = input[15]; + + // stage 2 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + idct32_high16_stage3_sse2(x); + + // stage 4 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static 
void idct32_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i 
cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + x[1] = input[16]; + x[2] = input[8]; + x[3] = input[24]; + x[4] = input[4]; + x[5] = input[20]; + x[6] = input[12]; + x[7] = input[28]; + x[8] = input[2]; + x[9] = input[18]; + x[10] = input[10]; + x[11] = input[26]; + x[12] = input[6]; + x[13] = input[22]; + x[14] = input[14]; + x[15] = input[30]; + x[16] = input[1]; + x[17] = input[17]; + x[18] = input[9]; + x[19] = input[25]; + x[20] = input[5]; + x[21] = input[21]; + x[22] = input[13]; + x[23] = input[29]; + x[24] = input[3]; + x[25] = input[19]; + x[26] = input[11]; + x[27] = input[27]; + x[28] = input[7]; + x[29] = input[23]; + x[30] = input[15]; + x[31] = input[31]; + + // stage 2 + btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]); + btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]); + btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]); + + // stage 3 + btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + 
btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + idct32_high16_stage3_sse2(x); + + // stage 4 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_adds_subs_sse2(x[7], x[6]); + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + // stage 7~8 + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); + const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); + const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + const __m128i 
cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); + btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]); + btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); +} + +static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + btf_16_adds_subs_sse2(x[32], x[35]); + btf_16_adds_subs_sse2(x[33], x[34]); + btf_16_subs_adds_sse2(x[39], x[36]); + btf_16_subs_adds_sse2(x[38], x[37]); + btf_16_adds_subs_sse2(x[40], x[43]); + btf_16_adds_subs_sse2(x[41], x[42]); + btf_16_subs_adds_sse2(x[47], x[44]); + btf_16_subs_adds_sse2(x[46], x[45]); + btf_16_adds_subs_sse2(x[48], x[51]); + btf_16_adds_subs_sse2(x[49], x[50]); + btf_16_subs_adds_sse2(x[55], x[52]); + btf_16_subs_adds_sse2(x[54], x[53]); + btf_16_adds_subs_sse2(x[56], x[59]); + 
btf_16_adds_subs_sse2(x[57], x[58]); + btf_16_subs_adds_sse2(x[63], x[60]); + btf_16_subs_adds_sse2(x[62], x[61]); +} + +static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]); +} + +static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + btf_16_adds_subs_sse2(x[16], x[19]); + btf_16_adds_subs_sse2(x[17], x[18]); + btf_16_subs_adds_sse2(x[23], x[20]); + btf_16_subs_adds_sse2(x[22], x[21]); + btf_16_adds_subs_sse2(x[24], x[27]); + btf_16_adds_subs_sse2(x[25], x[26]); + btf_16_subs_adds_sse2(x[31], x[28]); + btf_16_subs_adds_sse2(x[30], x[29]); + idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); +} + +static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = 
pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); + btf_16_adds_subs_sse2(x[32], x[39]); + btf_16_adds_subs_sse2(x[33], x[38]); + btf_16_adds_subs_sse2(x[34], x[37]); + btf_16_adds_subs_sse2(x[35], x[36]); + btf_16_subs_adds_sse2(x[47], x[40]); + btf_16_subs_adds_sse2(x[46], x[41]); + btf_16_subs_adds_sse2(x[45], x[42]); + btf_16_subs_adds_sse2(x[44], x[43]); + btf_16_adds_subs_sse2(x[48], x[55]); + btf_16_adds_subs_sse2(x[49], x[54]); + btf_16_adds_subs_sse2(x[50], x[53]); + btf_16_adds_subs_sse2(x[51], x[52]); + btf_16_subs_adds_sse2(x[63], x[56]); + btf_16_subs_adds_sse2(x[62], x[57]); + btf_16_subs_adds_sse2(x[61], x[58]); + btf_16_subs_adds_sse2(x[60], x[59]); +} + +static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_adds_subs_sse2(x[16], x[23]); + btf_16_adds_subs_sse2(x[17], x[22]); + btf_16_adds_subs_sse2(x[18], x[21]); + btf_16_adds_subs_sse2(x[19], x[20]); + btf_16_subs_adds_sse2(x[31], x[24]); + btf_16_subs_adds_sse2(x[30], x[25]); + btf_16_subs_adds_sse2(x[29], x[26]); + btf_16_subs_adds_sse2(x[28], x[27]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]); + 
btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]); +} + +static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[15]); + btf_16_adds_subs_sse2(x[1], x[14]); + btf_16_adds_subs_sse2(x[2], x[13]); + btf_16_adds_subs_sse2(x[3], x[12]); + btf_16_adds_subs_sse2(x[4], x[11]); + btf_16_adds_subs_sse2(x[5], x[10]); + btf_16_adds_subs_sse2(x[6], x[9]); + btf_16_adds_subs_sse2(x[7], x[8]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); + btf_16_adds_subs_sse2(x[32], x[47]); + btf_16_adds_subs_sse2(x[33], x[46]); + btf_16_adds_subs_sse2(x[34], x[45]); + btf_16_adds_subs_sse2(x[35], x[44]); + btf_16_adds_subs_sse2(x[36], x[43]); + btf_16_adds_subs_sse2(x[37], x[42]); + btf_16_adds_subs_sse2(x[38], x[41]); + btf_16_adds_subs_sse2(x[39], x[40]); + btf_16_subs_adds_sse2(x[63], x[48]); + btf_16_subs_adds_sse2(x[62], x[49]); + btf_16_subs_adds_sse2(x[61], x[50]); + btf_16_subs_adds_sse2(x[60], x[51]); + btf_16_subs_adds_sse2(x[59], x[52]); + btf_16_subs_adds_sse2(x[58], x[53]); + btf_16_subs_adds_sse2(x[57], x[54]); + btf_16_subs_adds_sse2(x[56], x[55]); +} + +static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i 
cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[31]); + btf_16_adds_subs_sse2(x[1], x[30]); + btf_16_adds_subs_sse2(x[2], x[29]); + btf_16_adds_subs_sse2(x[3], x[28]); + btf_16_adds_subs_sse2(x[4], x[27]); + btf_16_adds_subs_sse2(x[5], x[26]); + btf_16_adds_subs_sse2(x[6], x[25]); + btf_16_adds_subs_sse2(x[7], x[24]); + btf_16_adds_subs_sse2(x[8], x[23]); + btf_16_adds_subs_sse2(x[9], x[22]); + btf_16_adds_subs_sse2(x[10], x[21]); + btf_16_adds_subs_sse2(x[11], x[20]); + btf_16_adds_subs_sse2(x[12], x[19]); + btf_16_adds_subs_sse2(x[13], x[18]); + btf_16_adds_subs_sse2(x[14], x[17]); + btf_16_adds_subs_sse2(x[15], x[16]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]); +} + +static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]); + btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]); + btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]); + btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]); + btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]); + btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]); + btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]); + btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]); + btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]); + btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]); + 
btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]); + btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]); + btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]); + btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]); + btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]); + btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]); + btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]); + btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]); + btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]); + btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]); + btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]); + btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]); + btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]); + btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]); + btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]); + btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]); + btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]); + btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]); + btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]); + btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]); + btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]); + btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]); +} + +static void idct64_low1_ssse3(const __m128i *input, __m128i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + output[0] = x[0]; + output[63] = x[0]; + output[1] = x[1]; + output[62] = x[1]; + output[2] = x[1]; + output[61] = x[1]; + output[3] = x[0]; + output[60] = 
x[0]; + output[4] = x[0]; + output[59] = x[0]; + output[5] = x[1]; + output[58] = x[1]; + output[6] = x[1]; + output[57] = x[1]; + output[7] = x[0]; + output[56] = x[0]; + output[8] = x[0]; + output[55] = x[0]; + output[9] = x[1]; + output[54] = x[1]; + output[10] = x[1]; + output[53] = x[1]; + output[11] = x[0]; + output[52] = x[0]; + output[12] = x[0]; + output[51] = x[0]; + output[13] = x[1]; + output[50] = x[1]; + output[14] = x[1]; + output[49] = x[1]; + output[15] = x[0]; + output[48] = x[0]; + output[16] = x[0]; + output[47] = x[0]; + output[17] = x[1]; + output[46] = x[1]; + output[18] = x[1]; + output[45] = x[1]; + output[19] = x[0]; + output[44] = x[0]; + output[20] = x[0]; + output[43] = x[0]; + output[21] = x[1]; + output[42] = x[1]; + output[22] = x[1]; + output[41] = x[1]; + output[23] = x[0]; + output[40] = x[0]; + output[24] = x[0]; + output[39] = x[0]; + output[25] = x[1]; + output[38] = x[1]; + output[26] = x[1]; + output[37] = x[1]; + output[27] = x[0]; + output[36] = x[0]; + output[28] = x[0]; + output[35] = x[0]; + output[29] = x[1]; + output[34] = x[1]; + output[30] = x[1]; + output[33] = x[1]; + output[31] = x[0]; + output[32] = x[0]; +} + +static void idct64_low8_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + const __m128i 
cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[8] = input[4]; + x[16] = input[2]; + x[24] = input[6]; + x[32] = input[1]; + x[40] = input[5]; + x[48] = input[3]; + x[56] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[38] = x[39]; + x[41] = x[40]; + x[46] = x[47]; + x[49] = x[48]; + x[54] = x[55]; + x[57] = x[56]; + x[62] = x[63]; + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + x[17] = x[16]; + x[22] = x[23]; + x[25] = x[24]; + x[30] = x[31]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); + + // stage 5 + x[9] = x[8]; + x[14] = x[15]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + x[35] = x[32]; + x[34] = x[33]; + x[36] = x[39]; + x[37] = x[38]; + x[43] = x[40]; + x[42] = x[41]; + x[44] = x[47]; + 
x[45] = x[46]; + x[51] = x[48]; + x[50] = x[49]; + x[52] = x[55]; + x[53] = x[54]; + x[59] = x[56]; + x[58] = x[57]; + x[60] = x[63]; + x[61] = x[62]; + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + x[19] = x[16]; + x[18] = x[17]; + x[20] = x[23]; + x[21] = x[22]; + x[27] = x[24]; + x[26] = x[25]; + x[28] = x[31]; + x[29] = x[30]; + idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + x[11] = x[8]; + x[10] = x[9]; + x[12] = x[15]; + x[13] = x[14]; + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + x[7] = x[0]; + x[6] = x[1]; + x[5] = x[2]; + x[4] = x[3]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void idct64_low16_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[4] = input[8]; + x[8] = input[4]; + x[12] = input[12]; + x[16] = input[2]; + x[20] = input[10]; + x[24] = input[6]; + x[28] = input[14]; + x[32] = input[1]; + x[36] = input[9]; + x[40] = input[5]; + x[44] = input[13]; + x[48] = input[3]; + x[52] = input[11]; + x[56] = 
input[7]; + x[60] = input[15]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[34] = x[35]; + x[37] = x[36]; + x[38] = x[39]; + x[41] = x[40]; + x[42] = x[43]; + x[45] = x[44]; + x[46] = x[47]; + x[49] = x[48]; + x[50] = x[51]; + x[53] = x[52]; + x[54] = x[55]; + x[57] = x[56]; + x[58] = x[59]; + x[61] = x[60]; + x[62] = x[63]; + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + 
btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void idct64_low32_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[2] = input[16]; + x[4] = input[8]; + x[6] = input[24]; + x[8] = input[4]; + x[10] = input[20]; + x[12] = input[12]; + x[14] = input[28]; + x[16] = input[2]; + x[18] = input[18]; + x[20] = input[10]; + x[22] = input[26]; + x[24] = input[6]; + x[26] = input[22]; + x[28] = input[14]; + x[30] = input[30]; + x[32] = input[1]; + x[34] = input[17]; + x[36] = input[9]; + x[38] = input[25]; + x[40] = input[5]; + x[42] = input[21]; + x[44] = input[13]; + x[46] = input[29]; + x[48] = input[3]; + x[50] = input[19]; + x[52] = input[11]; + x[54] = input[27]; + x[56] = input[7]; + x[58] = input[23]; + x[60] = input[15]; + x[62] = 
input[31]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]); + btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]); + btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]); + btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]); + btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]); + btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]); + btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + btf_16_adds_subs_sse2(x[32], x[33]); + btf_16_subs_adds_sse2(x[35], x[34]); + btf_16_adds_subs_sse2(x[36], x[37]); + btf_16_subs_adds_sse2(x[39], x[38]); + btf_16_adds_subs_sse2(x[40], x[41]); + btf_16_subs_adds_sse2(x[43], x[42]); + btf_16_adds_subs_sse2(x[44], x[45]); + btf_16_subs_adds_sse2(x[47], x[46]); + btf_16_adds_subs_sse2(x[48], x[49]); + btf_16_subs_adds_sse2(x[51], x[50]); + btf_16_adds_subs_sse2(x[52], x[53]); + btf_16_subs_adds_sse2(x[55], x[54]); + btf_16_adds_subs_sse2(x[56], x[57]); + btf_16_subs_adds_sse2(x[59], 
x[58]); + btf_16_adds_subs_sse2(x[60], x[61]); + btf_16_subs_adds_sse2(x[63], x[62]); + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + btf_16_adds_subs_sse2(x[16], x[17]); + btf_16_subs_adds_sse2(x[19], x[18]); + btf_16_adds_subs_sse2(x[20], x[21]); + btf_16_subs_adds_sse2(x[23], x[22]); + btf_16_adds_subs_sse2(x[24], x[25]); + btf_16_subs_adds_sse2(x[27], x[26]); + btf_16_adds_subs_sse2(x[28], x[29]); + btf_16_subs_adds_sse2(x[31], x[30]); + idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + 
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 9~11 + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void iadst4_sse2(const __m128i *input, __m128i *output) { + const int32_t *sinpi = sinpi_arr(INV_COS_BIT); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); + const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[4]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpackhi_epi16(x0[0], x0[2]); + u[2] = _mm_unpacklo_epi16(x0[1], x0[3]); + u[3] = _mm_unpackhi_epi16(x0[1], x0[3]); + + __m128i x1[16]; + x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04); + x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01); + x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02); + x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04); + x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 + x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03); + x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x3*sin3 + x1[11] = _mm_madd_epi16(u[3], 
sinpi_0_p03); + x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 + x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02); + x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1 + x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01); + + __m128i x2[8]; + x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2 + x2[1] = _mm_add_epi32(x1[1], x1[5]); + x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4 + x2[3] = _mm_add_epi32(x1[3], x1[7]); + x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3 + x2[5] = _mm_add_epi32(x1[9], x1[11]); + x2[6] = _mm_add_epi32(x1[12], x1[14]); // x0*sin4 +x2*sin2 -x1*sin3 -x3*sin1 + x2[7] = _mm_add_epi32(x1[13], x1[15]); + + const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + for (int i = 0; i < 4; ++i) { + __m128i out0 = _mm_add_epi32(x2[2 * i], rounding); + __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding); + out0 = _mm_srai_epi32(out0, INV_COS_BIT); + out1 = _mm_srai_epi32(out1, INV_COS_BIT); + output[i] = _mm_packs_epi32(out0, out1); + } +} + +static void iadst4_w4_sse2(const __m128i *input, __m128i *output) { + const int32_t *sinpi = sinpi_arr(INV_COS_BIT); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); + const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[2]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpacklo_epi16(x0[1], x0[3]); + + __m128i x1[8]; + x1[0] = _mm_madd_epi16(u[0], 
sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 + x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x2*sin3 + x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 + x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1 + + __m128i x2[4]; + x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2 + x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4 + x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3 + x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1 + + const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + for (int i = 0; i < 4; ++i) { + __m128i out0 = _mm_add_epi32(x2[i], rounding); + out0 = _mm_srai_epi32(out0, INV_COS_BIT); + output[i] = _mm_packs_epi32(out0, out0); + } +} + +void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[1] = input[0]; + + // stage 2 + btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]); + + // stage 3 + x[4] = x[0]; + x[5] = x[1]; + + // stage 4 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + + // stage 5 + x[2] = x[0]; + x[3] = x[1]; + x[6] = x[4]; + x[7] = x[5]; + + // stage 6 + 
btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +void av1_iadst8_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[7]; + x[1] = input[0]; + x[2] = input[5]; + x[3] = input[2]; + x[4] = input[3]; + x[5] = input[4]; + x[6] = input[1]; + x[7] = input[6]; + + // stage 2 + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); 
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); + + // stage 3 + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + + // stage 4 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + + // stage 6 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +static void iadst8_w4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = 
pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[7]; + x[1] = input[0]; + x[2] = input[5]; + x[3] = input[2]; + x[4] = input[3]; + x[5] = input[4]; + x[6] = input[1]; + x[7] = input[6]; + + // stage 2 + btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); + + // stage 3 + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + + // stage 4 + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + + // stage 6 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +static INLINE void iadst16_stage3_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[8]); + btf_16_adds_subs_sse2(x[1], x[9]); + btf_16_adds_subs_sse2(x[2], x[10]); + btf_16_adds_subs_sse2(x[3], x[11]); + btf_16_adds_subs_sse2(x[4], x[12]); + btf_16_adds_subs_sse2(x[5], x[13]); + btf_16_adds_subs_sse2(x[6], x[14]); + btf_16_adds_subs_sse2(x[7], x[15]); +} + +static INLINE void iadst16_stage4_ssse3(__m128i 
*x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage5_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + btf_16_adds_subs_sse2(x[8], x[12]); + btf_16_adds_subs_sse2(x[9], x[13]); + btf_16_adds_subs_sse2(x[10], x[14]); + btf_16_adds_subs_sse2(x[11], x[15]); +} + +static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage7_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + 
btf_16_adds_subs_sse2(x[8], x[10]); + btf_16_adds_subs_sse2(x[9], x[11]); + btf_16_adds_subs_sse2(x[12], x[14]); + btf_16_adds_subs_sse2(x[13], x[15]); +} + +static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { + const __m128i __zero = _mm_setzero_si128(); + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[8]); + output[2] = x[12]; + output[3] = _mm_subs_epi16(__zero, x[4]); + output[4] = x[6]; + output[5] = _mm_subs_epi16(__zero, x[14]); + output[6] = x[10]; + output[7] = _mm_subs_epi16(__zero, x[2]); + output[8] = x[3]; + output[9] = _mm_subs_epi16(__zero, x[11]); + output[10] = x[15]; + output[11] = _mm_subs_epi16(__zero, x[7]); + output[12] = x[5]; + output[13] = _mm_subs_epi16(__zero, x[13]); + output[14] = x[9]; + output[15] = _mm_subs_epi16(__zero, x[1]); +} + +static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[1] = input[0]; + + // stage 2 + btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); + + // 
stage 3 + x[8] = x[0]; + x[9] = x[1]; + + // stage 4 + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + + // stage 5 + x[4] = x[0]; + x[5] = x[1]; + x[12] = x[8]; + x[13] = x[9]; + + // stage 6 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + + // stage 7 + x[2] = x[0]; + x[3] = x[1]; + x[6] = x[4]; + x[7] = x[5]; + x[10] = x[8]; + x[11] = x[9]; + x[14] = x[12]; + x[15] = x[13]; + + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} + +static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[16]; + x[1] = input[0]; + x[3] = input[2]; + x[5] = input[4]; + x[7] = input[6]; + x[8] = input[7]; + x[10] = input[5]; + x[12] = input[3]; + x[14] = input[1]; + + // stage 2 + btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); + btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]); + btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]); + btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]); + btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]); + btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]); + btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]); + btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]); + + // stage 3 + iadst16_stage3_ssse3(x); + iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage5_ssse3(x); + iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage7_ssse3(x); + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} +static void iadst16_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT 
- 1)); + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m128i x[16]; + x[0] = input[15]; + x[1] = input[0]; + x[2] = input[13]; + x[3] = input[2]; + x[4] = input[11]; + x[5] = input[4]; + x[6] = input[9]; + x[7] = input[6]; + x[8] = input[7]; + x[9] = input[8]; + x[10] = input[5]; + x[11] = input[10]; + x[12] = input[3]; + x[13] = input[12]; + x[14] = input[1]; + x[15] = input[14]; + + // stage 2 + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); + 
btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); + + // stage 3~9 + iadst16_stage3_ssse3(x); + iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage5_ssse3(x); + iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage7_ssse3(x); + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} + +static void iadst16_w4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], 
-cospi[40]); + const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[16]; + x[0] = input[15]; + x[1] = input[0]; + x[2] = input[13]; + x[3] = input[2]; + x[4] = input[11]; + x[5] = input[4]; + x[6] = input[9]; + x[7] = input[6]; + x[8] = input[7]; + x[9] = input[8]; + x[10] = input[5]; + x[11] = input[10]; + x[12] = input[3]; + x[13] = input[12]; + x[14] = input[1]; + x[15] = input[14]; + + // stage 2 + btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); + btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); + + // stage 3 + iadst16_stage3_ssse3(x); + + // stage 4 + btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); + + // stage 5 + iadst16_stage5_ssse3(x); + + // stage 6 + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], 
x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); + + // stage 7 + iadst16_stage7_ssse3(x); + + // stage 8 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); + + // stage 9 + iadst16_stage9_ssse3(output, x); +} + +static void iidentity4_ssse3(const __m128i *input, __m128i *output) { + const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits)); + const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); + for (int i = 0; i < 4; ++i) { + __m128i x = _mm_mulhrs_epi16(input[i], scale); + output[i] = _mm_adds_epi16(x, input[i]); + } +} + +static void iidentity8_sse2(const __m128i *input, __m128i *output) { + for (int i = 0; i < 8; ++i) { + output[i] = _mm_adds_epi16(input[i], input[i]); + } +} + +static void iidentity16_ssse3(const __m128i *input, __m128i *output) { + const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits)); + const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); + for (int i = 0; i < 16; ++i) { + __m128i x = _mm_mulhrs_epi16(input[i], scale); + __m128i srcx2 = _mm_adds_epi16(input[i], input[i]); + output[i] = _mm_adds_epi16(x, srcx2); + } +} + +static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred, + __m128i res) { + const __m128i zero = _mm_setzero_si128(); + __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero)); + return _mm_packus_epi16(x0, x0); +} + +static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output, + int stride, int flipud, + const int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i, j += step) { + const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride))); + __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero)); + u = _mm_packus_epi16(u, zero); + *((int *)(output + i * stride)) = _mm_cvtsi128_si32(u); + } +} + +static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output, + int stride, int flipud, + const int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); + const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]); + _mm_storel_epi64((__m128i *)(output + i * stride), u); + } +} + +// 1D functions process process 8 pixels at one time. +static const transform_1d_ssse3 + lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = { + { idct4_sse2, iadst4_sse2, iidentity4_ssse3 }, + { av1_idct8_sse2, av1_iadst8_sse2, iidentity8_sse2 }, + { idct16_sse2, iadst16_sse2, iidentity16_ssse3 }, + { idct32_sse2, NULL, NULL }, + { idct64_low32_ssse3, NULL, NULL }, + }; + +// functions for blocks with eob at DC and within +// topleft 8x8, 16x16, 32x32 corner +static const transform_1d_ssse3 + lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { idct4_sse2, idct4_sse2, NULL, NULL }, + { iadst4_sse2, iadst4_sse2, NULL, NULL }, + { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL }, + }, + { { av1_idct8_low1_ssse3, av1_idct8_sse2, NULL, NULL }, + { av1_iadst8_low1_ssse3, av1_iadst8_sse2, NULL, NULL }, + { iidentity8_sse2, iidentity8_sse2, NULL, NULL } }, + { + { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL }, + { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3, + idct32_sse2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_ssse3, 
idct64_low8_ssse3, idct64_low16_ssse3, + idct64_low32_ssse3 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +// 1D functions process process 4 pixels at one time. +// used in 4x4, 4x8, 4x16, 8x4, 16x4 +static const transform_1d_ssse3 + lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = { + { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 }, + { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 }, + { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 }, + { NULL, NULL, NULL }, + { NULL, NULL, NULL }, + }; + +static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, + int stride, int shift, int height, + int txw_idx, int rect_type) { + const int32_t *input_row = input; + const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]); + const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding); + if (rect_type != 1 && rect_type != -1) { + for (int i = 0; i < height; ++i) { + const __m128i src = load_32bit_to_16bit(input_row); + input_row += stride; + __m128i lo = _mm_unpacklo_epi16(src, one); + __m128i hi = _mm_unpackhi_epi16(src, one); + lo = _mm_madd_epi16(lo, scale_rounding); + hi = _mm_madd_epi16(hi, scale_rounding); + lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm_packs_epi32(lo, hi); + } + } else { + const __m128i rect_scale = + _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); + for (int i = 0; i < height; ++i) { + __m128i src = load_32bit_to_16bit(input_row); + src = _mm_mulhrs_epi16(src, rect_scale); + input_row += stride; + __m128i lo = _mm_unpacklo_epi16(src, one); + __m128i hi = _mm_unpackhi_epi16(src, one); + lo = _mm_madd_epi16(lo, scale_rounding); + hi = _mm_madd_epi16(hi, scale_rounding); + lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] 
= _mm_packs_epi32(lo, hi); + } + } +} + +static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride, + __m128i *buf, int shift, int height, + int txh_idx) { + const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]); + const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1)); + const __m128i one = _mm_set1_epi16(1); + const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding); + const __m128i zero = _mm_setzero_si128(); + for (int h = 0; h < height; ++h) { + __m128i lo = _mm_unpacklo_epi16(buf[h], one); + __m128i hi = _mm_unpackhi_epi16(buf[h], one); + lo = _mm_madd_epi16(lo, scale_coeff); + hi = _mm_madd_epi16(hi, scale_coeff); + lo = _mm_srai_epi32(lo, NewSqrt2Bits); + hi = _mm_srai_epi32(hi, NewSqrt2Bits); + lo = _mm_add_epi32(lo, shift_rounding); + hi = _mm_add_epi32(hi, shift_rounding); + lo = _mm_srai_epi32(lo, -shift); + hi = _mm_srai_epi32(hi, -shift); + __m128i x = _mm_packs_epi32(lo, hi); + + const __m128i pred = _mm_loadl_epi64((__m128i const *)(output)); + x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero)); + const __m128i u = _mm_packus_epi16(x, x); + _mm_storel_epi64((__m128i *)(output), u); + output += stride; + } +} + +void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_SIZE tx_size) { + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int col_max = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + for (int i = 0; i < (col_max >> 3); ++i) { + for (int j = 0; j < (row_max >> 3); j++) { + __m128i buf[8]; + iidentity_row_8xn_ssse3(buf, input + j * 8 + i * 8 * 
input_stride, + row_max, shift[0], 8, txw_idx, rect_type); + transpose_16bit_8x8(buf, buf); + iidentity_col_8xn_ssse3(output + i * 8 + j * 8 * stride, stride, buf, + shift[1], 8, txh_idx); + } + } +} + +static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[4]; + const TX_SIZE tx_size = TX_4X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col); + row_txfm(buf, buf); + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x4(temp, buf); + } else { + transpose_16bit_4x4(buf, buf); + } + col_txfm(buf, buf); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, + __m128i res0, __m128i res1) { + const __m128i zero = _mm_setzero_si128(); + __m128i x0 = _mm_unpacklo_epi8(pred, zero); + __m128i x1 = _mm_unpackhi_epi8(pred, zero); + x0 = _mm_adds_epi16(res0, x0); + x1 = _mm_adds_epi16(res1, x1); + return _mm_packus_epi16(x0, x1); +} + +static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output, + int stride, int flipud, + int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]); + _mm_storeu_si128((__m128i *)(output + i * stride), u); + } +} + +static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output, + int size) { + const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8); + for (int i = 0; i < size; ++i) { + output[i] = _mm_mulhrs_epi16(input[i], scale); + } +} + +static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m128i buf1[64 * 8]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + __m128i buf0[64]; + load_buffer_32bit_to_16bit(input + 8 * i, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + 
round_shift_ssse3(buf0, buf0, buf_size_nonzero_w); // rect special code + } + row_txfm(buf0, buf0); + round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); + __m128i *_buf1 = buf1 + i * 8; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + __m128i temp[8]; + flip_buf_sse2(buf0 + 8 * j, temp, 8); + transpose_16bit_8x8(temp, + _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j); + } + } + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row); + round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]); + } + + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2, + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row); + } +} + +void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = (eobx + 8) >> 3; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + assert(fun_idx < 5); + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + + assert(col_txfm != NULL); + + int ud_flip, lr_flip; + 
get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_w_div8; i++) { + __m128i buf0[64]; + for (int j = 0; j < buf_size_h_div8; j++) { + __m128i *buf0_cur = buf0 + j * 8; + const int32_t *input_cur = input + i * 8 * input_stride + j * 8; + iidentity_row_8xn_ssse3(buf0_cur, input_cur, input_stride, shift[0], 8, + txw_idx, rect_type); + transpose_16bit_8x8(buf0_cur, buf0_cur); + } + col_txfm(buf0, buf0); + __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1])); + int k = ud_flip ? (txfm_size_row - 1) : 0; + const int step = ud_flip ? -1 : 1; + uint8_t *out = output + 8 * i; + for (int j = 0; j < txfm_size_row; ++j, k += step) { + const __m128i v = _mm_loadl_epi64((__m128i const *)(out)); + __m128i res = _mm_mulhrs_epi16(buf0[k], mshift); + const __m128i u = lowbd_get_recon_8x8_sse2(v, res); + _mm_storel_epi64((__m128i *)(out), u); + out += stride; + } + } +} + +void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_h_div8; i++) { + __m128i 
buf0[64]; + load_buffer_32bit_to_16bit(input + i * 8, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_ssse3(buf0, buf0, buf_size_nonzero_w); // rect special code + } + row_txfm(buf0, buf0); + round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); + __m128i *_buf1 = buf1; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + __m128i temp[8]; + flip_buf_sse2(buf0 + 8 * j, temp, 8); + transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j); + } + } + + for (int j = 0; j < buf_size_w_div8; ++j) { + iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride, + buf1 + j * 8, shift[1], 8, txh_idx); + } + } +} + +// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64 +static INLINE void lowbd_inv_txfm2d_add_universe_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case DCT_DCT: + lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + case IDTX: + av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + default: + lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[8]; + const TX_SIZE tx_size = TX_4X8; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = 
get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit(input, txfm_size_row, buf, txfm_size_col); + round_shift_ssse3(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf); + // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0 + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf); + } else { + transpose_16bit_8x4(buf, buf); + } + col_txfm(buf, buf); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[8]; + const TX_SIZE tx_size = TX_8X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col); + round_shift_ssse3(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf); + // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0 + if (lr_flip) { + 
__m128i temp[8]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x8(temp, buf); + } else { + transpose_16bit_4x8(buf, buf); + } + col_txfm(buf, buf); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[16]; + const TX_SIZE tx_size = TX_4X16; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + const int row_one_loop = 8; + for (int i = 0; i < 2; ++i) { + const int32_t *input_cur = input + i * row_one_loop; + __m128i *buf_cur = buf + i * row_one_loop; + load_buffer_32bit_to_16bit(input_cur, txfm_size_row, buf_cur, + txfm_size_col); + if (row_txfm == iidentity4_ssse3) { + const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 4; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf_cur, buf_cur); + round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]); + } + if (lr_flip) { + __m128i temp[8]; 
+ flip_buf_sse2(buf_cur, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf_cur); + } else { + transpose_16bit_8x4(buf_cur, buf_cur); + } + } + col_txfm(buf, buf); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[16]; + const TX_SIZE tx_size = TX_16X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int row_one_loop = 8; + load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col); + if (row_txfm == iidentity16_ssse3) { + const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 16; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf, buf); + round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); + } + if (lr_flip) { + __m128i temp[16]; + flip_buf_sse2(buf, temp, 16); + transpose_16bit_4x8(temp, buf); + transpose_16bit_4x8(temp + 8, buf + 
8); + } else { + transpose_16bit_4x8(buf, buf); + transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop); + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf + i * row_one_loop, buf + i * row_one_loop); + round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]); + } + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4); + lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4); +} + +void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_size) { + case TX_4X4: + lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_4X8: + lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_8X4: + lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_4X16: + lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_16X4: + lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + default: + lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + if (!txfm_param->lossless) { + const TX_TYPE tx_type = txfm_param->tx_type; + av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h new file mode 100644 index 0000000000..1873d01bc0 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ +#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ + +#include // SSE2 +#include // SSSE3 + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define btf_16_ssse3(w0, w1, in, out0, out1) \ + do { \ + const __m128i _w0 = _mm_set1_epi16(w0 * 8); \ + const __m128i _w1 = _mm_set1_epi16(w1 * 8); \ + const __m128i _in = in; \ + out0 = _mm_mulhrs_epi16(_in, _w0); \ + out1 = _mm_mulhrs_epi16(_in, _w1); \ + } while (0) + +#define btf_16_adds_subs_sse2(in0, in1) \ + do { \ + const __m128i _in0 = in0; \ + const __m128i _in1 = in1; \ + in0 = _mm_adds_epi16(_in0, _in1); \ + in1 = _mm_subs_epi16(_in0, _in1); \ + } while (0) + +#define btf_16_subs_adds_sse2(in0, in1) \ + do { \ + const __m128i _in0 = in0; \ + const __m128i _in1 = in1; \ + in1 = _mm_subs_epi16(_in0, _in1); \ + in0 = _mm_adds_epi16(_in0, _in1); \ + } while (0) + +#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \ + do { \ + const __m128i _in0 = in0; \ + const __m128i _in1 = in1; \ + out0 = _mm_adds_epi16(_in0, _in1); \ + out1 = _mm_subs_epi16(_in0, _in1); \ + } while (0) + +static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) { + if (bit < 0) { + const __m128i scale = _mm_set1_epi16(1 << (15 + bit)); + for (int i = 0; i < size; ++i) { + in[i] = _mm_mulhrs_epi16(in[i], scale); + } + } else if (bit > 0) { + for (int i = 0; i < size; 
++i) { + in[i] = _mm_slli_epi16(in[i], bit); + } + } +} + +// 1D itx types +enum { + IDCT_1D, + IADST_1D, + IFLIPADST_1D = IADST_1D, + IIDENTITY_1D, + ITX_TYPES_1D, +} UENUM1BYTE(ITX_TYPE_1D); + +static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { + IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, + IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, + IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, + IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, +}; + +static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { + IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, + IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, + IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, + IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x16_default[16]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x32_default[32]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { + 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x32_default[32]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 
0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x16_default[16]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, + 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { + 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, +}; + +DECLARE_ALIGNED(16, static const int16_t *, + av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { + NULL, + av1_eob_to_eobxy_8x8_default, + av1_eob_to_eobxy_16x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x16_default, + av1_eob_to_eobxy_16x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x32_default, + av1_eob_to_eobxy_32x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, +}; + +static const int lowbd_txfm_all_1d_zeros_idx[32] = { + 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +// Transform block width in log2 for eob (size of 64 map to 32) +static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { + 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, +}; + +static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + if (eob == 1) { + *eobx = 0; + *eoby = 0; + return; + 
} + + const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; + const int eob_row = (eob - 1) >> tx_w_log2; + const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; + *eobx = eobxy & 0xFF; + *eoby = eobxy >> 8; +} + +static int eob_fill[32] = { + 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, +}; + +static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_col = tx_size_wide[tx_size]; + const int eobx_max = AOMMIN(32, txfm_size_col) - 1; + *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob]; + const int temp_eoby = eob / (eobx_max + 1); + assert(temp_eoby < 32); + *eoby = eob_fill[temp_eoby]; +} + +static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_row = tx_size_high[tx_size]; + const int eoby_max = AOMMIN(32, txfm_size_row) - 1; + *eobx = eob_fill[eob / (eoby_max + 1)]; + *eoby = (eob >= eoby_max) ? 
eoby_max : eob_fill[eob]; +} + +typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output); + +void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob); + +void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_SIZE tx_size); + +void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob); +void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob); + +void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output); + +void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h new file mode 100644 index 0000000000..129721cf05 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ +#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ + +#include // SSE2 + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE void btf_16_w4_sse2( + const __m128i *const w0, const __m128i *const w1, const __m128i __rounding, + const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1, + __m128i *const out0, __m128i *const out1) { + const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1); + const __m128i u0 = _mm_madd_epi16(t0, *w0); + const __m128i v0 = _mm_madd_epi16(t0, *w1); + const __m128i a0 = _mm_add_epi32(u0, __rounding); + const __m128i b0 = _mm_add_epi32(v0, __rounding); + const __m128i c0 = _mm_srai_epi32(a0, cos_bit); + const __m128i d0 = _mm_srai_epi32(b0, cos_bit); + + *out0 = _mm_packs_epi32(c0, c0); + *out1 = _mm_packs_epi32(d0, c0); +} + +#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \ + do { \ + __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ + __m128i u0 = _mm_madd_epi16(t0, w0); \ + __m128i v0 = _mm_madd_epi16(t0, w1); \ + \ + __m128i a0 = _mm_add_epi32(u0, __rounding); \ + __m128i b0 = _mm_add_epi32(v0, __rounding); \ + \ + __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ + __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ + \ + out0 = _mm_packs_epi32(c0, c0); \ + out1 = _mm_packs_epi32(d0, d0); \ + } while (0) + +#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \ + do { \ + __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ + __m128i t1 = _mm_unpackhi_epi16(in0, in1); \ + __m128i u0 = _mm_madd_epi16(t0, w0); \ + __m128i u1 = _mm_madd_epi16(t1, w0); \ + __m128i v0 = _mm_madd_epi16(t0, w1); \ + __m128i v1 = _mm_madd_epi16(t1, w1); \ + \ + __m128i a0 = _mm_add_epi32(u0, __rounding); \ + __m128i a1 = _mm_add_epi32(u1, __rounding); \ + __m128i b0 = _mm_add_epi32(v0, __rounding); \ + __m128i b1 
= _mm_add_epi32(v1, __rounding); \ + \ + __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ + __m128i c1 = _mm_srai_epi32(a1, cos_bit); \ + __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ + __m128i d1 = _mm_srai_epi32(b1, cos_bit); \ + \ + out0 = _mm_packs_epi32(c0, c1); \ + out1 = _mm_packs_epi32(d0, d1); \ + } while (0) + +static INLINE __m128i load_16bit_to_16bit(const int16_t *a) { + return _mm_load_si128((const __m128i *)a); +} + +static INLINE __m128i load_32bit_to_16bit(const int32_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); +} + +static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, a_low); +} + +// Store 4 16 bit values. Sign extend the values. +static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { + const __m128i a_lo = _mm_unpacklo_epi16(a, a); + const __m128i a_1 = _mm_srai_epi32(a_lo, 16); + _mm_store_si128((__m128i *)b, a_1); +} + +// Store 8 16 bit values. Sign extend the values. 
+static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) { + const __m128i a_lo = _mm_unpacklo_epi16(a, a); + const __m128i a_hi = _mm_unpackhi_epi16(a, a); + const __m128i a_1 = _mm_srai_epi32(a_lo, 16); + const __m128i a_2 = _mm_srai_epi32(a_hi, 16); + _mm_store_si128((__m128i *)b, a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +} + +static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) { + const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m128i b = _mm_madd_epi16(a, scale_rounding); + return _mm_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a, + int32_t *const b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_lo = _mm_unpacklo_epi16(a, one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + _mm_store_si128((__m128i *)b, b_lo); +} + +static INLINE void store_rect_16bit_to_32bit(const __m128i a, + int32_t *const b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_lo = _mm_unpacklo_epi16(a, one); + const __m128i a_hi = _mm_unpackhi_epi16(a, one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); + _mm_store_si128((__m128i *)b, b_lo); + _mm_store_si128((__m128i *)(b + 4), b_hi); +} + +static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in, + const int stride, + __m128i *const out, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in, + const int stride, + __m128i *const out, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + 
out[i] = load_16bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in, + int stride, __m128i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w4(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in, + int stride, __m128i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_16bit_to_32bit_w4(in[i], out + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_16bit_to_32bit(in[i], out + i * stride); + } +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_w4(in[i], out + i * stride); + } +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit(in[i], out + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in, + uint16_t 
*out, + const int stride) { + for (int i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)(out + i * stride), in[i]); + } +} + +static INLINE void round_shift_16bit(__m128i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m128i rounding = _mm_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm_adds_epi16(in[i], rounding); + in[i] = _mm_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm_slli_epi16(in[i], bit); + } + } +} + +static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x8_sse2(const 
int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output, + int8_t cos_bit); + +void av1_iadst8_sse2(const __m128i *input, __m128i *output); + +void av1_idct8_sse2(const __m128i *input, __m128i *output); + +typedef struct { + transform_1d_sse2 col, row; // vertical and horizontal +} transform_2d_sse2; + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c new file mode 100644 index 0000000000..1894efdc10 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse4.h" + +// This function assumes `arr` is 16-byte aligned. 
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) { + __m128i *const vec = (__m128i *)arr; + const int vec_size = size >> 2; + av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit); +} diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h new file mode 100644 index 0000000000..387dfd6bb3 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ +#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) { + __m128i tmp, round; + round = _mm_set1_epi32(1 << (bit - 1)); + tmp = _mm_add_epi32(vec, round); + return _mm_srai_epi32(tmp, bit); +} + +static INLINE void av1_round_shift_array_32_sse4_1(const __m128i *input, + __m128i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_sse4_1(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void av1_round_shift_rect_array_32_sse4_1(const __m128i *input, + __m128i *output, + const int size, + const int bit, + const int val) { + const __m128i sqrt2 = _mm_set1_epi32(val); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m128i r0 = 
av1_round_shift_32_sse4_1(input[i], bit); + const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m128i r0 = _mm_slli_epi32(input[i], -bit); + const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); + } + } +} + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/cdef_block_avx2.c b/third_party/aom/av1/common/x86/cdef_block_avx2.c new file mode 100644 index 0000000000..1ec4b6c332 --- /dev/null +++ b/third_party/aom/av1/common/x86/cdef_block_avx2.c @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_avx2 +#include "av1/common/cdef_block_simd.h" + +// Mask used to shuffle the elements present in 256bit register. +const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504, + 0x0f0e0100, 0x0b0a0d0c, 0x07060908, + 0x03020504, 0x0f0e0100 }; + +/* partial A is a 16-bit vector of the form: +[x8 - - x1 | x16 - - x9] and partial B has the form: +[0 y1 - y7 | 0 y9 - y15]. +This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... +(x7^2+y2^7)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8 constants +are in const1 and const2. 
*/ +static INLINE __m256i fold_mul_and_sum_avx2(__m256i *partiala, + __m256i *partialb, + const __m256i *const1, + const __m256i *const2) { + __m256i tmp; + /* Reverse partial B. */ + *partialb = _mm256_shuffle_epi8( + *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit)); + + /* Interleave the x and y values of identical indices and pair x8 with 0. */ + tmp = *partiala; + *partiala = _mm256_unpacklo_epi16(*partiala, *partialb); + *partialb = _mm256_unpackhi_epi16(tmp, *partialb); + + /* Square and add the corresponding x and y values. */ + *partiala = _mm256_madd_epi16(*partiala, *partiala); + *partialb = _mm256_madd_epi16(*partialb, *partialb); + /* Multiply by constant. */ + *partiala = _mm256_mullo_epi32(*partiala, *const1); + *partialb = _mm256_mullo_epi32(*partialb, *const2); + /* Sum all results. */ + *partiala = _mm256_add_epi32(*partiala, *partialb); + return *partiala; +} + +static INLINE __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2, + __m256i *x3) { + const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1); + const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3); + const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1); + const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3); + + *x0 = _mm256_unpacklo_epi64(t0, t1); + *x1 = _mm256_unpackhi_epi64(t0, t1); + *x2 = _mm256_unpacklo_epi64(t2, t3); + *x3 = _mm256_unpackhi_epi64(t2, t3); + return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1), + _mm256_add_epi32(*x2, *x3)); +} + +/* Computes cost for directions 0, 5, 6 and 7. We can call this function again +to compute the remaining directions. */ +static INLINE __m256i compute_directions_avx2(__m256i *lines, + int32_t cost_frist_8x8[4], + int32_t cost_second_8x8[4]) { + __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; + __m256i partial6; + __m256i tmp; + /* Partial sums for lines 0 and 1. 
*/ + partial4a = _mm256_slli_si256(lines[0], 14); + partial4b = _mm256_srli_si256(lines[0], 2); + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4)); + tmp = _mm256_add_epi16(lines[0], lines[1]); + partial5a = _mm256_slli_si256(tmp, 10); + partial5b = _mm256_srli_si256(tmp, 6); + partial7a = _mm256_slli_si256(tmp, 4); + partial7b = _mm256_srli_si256(tmp, 12); + partial6 = tmp; + + /* Partial sums for lines 2 and 3. */ + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6)); + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8)); + tmp = _mm256_add_epi16(lines[2], lines[3]); + partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8)); + partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8)); + partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6)); + partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10)); + partial6 = _mm256_add_epi16(partial6, tmp); + + /* Partial sums for lines 4 and 5. */ + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10)); + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12)); + tmp = _mm256_add_epi16(lines[4], lines[5]); + partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6)); + partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10)); + partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8)); + partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8)); + partial6 = _mm256_add_epi16(partial6, tmp); + + /* Partial sums for lines 6 and 7. 
*/ + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14)); + partial4a = _mm256_add_epi16(partial4a, lines[7]); + tmp = _mm256_add_epi16(lines[6], lines[7]); + partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4)); + partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12)); + partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10)); + partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6)); + partial6 = _mm256_add_epi16(partial6, tmp); + + const __m256i const_reg_1 = + _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840); + const __m256i const_reg_2 = + _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168); + const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0); + const __m256i const_reg_4 = + _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140); + + /* Compute costs in terms of partial sums. */ + partial4a = + fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2); + partial7a = + fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4); + partial5a = + fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4); + partial6 = _mm256_madd_epi16(partial6, partial6); + partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105)); + + partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a); + _mm_storeu_si128((__m128i *)cost_frist_8x8, + _mm256_castsi256_si128(partial4a)); + _mm_storeu_si128((__m128i *)cost_second_8x8, + _mm256_extractf128_si256(partial4a, 1)); + + return partial4a; +} + +/* transpose and reverse the order of the lines -- equivalent to a 90-degree +counter-clockwise rotation of the pixels. 
*/ +static INLINE void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) { + const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); + const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]); + const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]); + const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); + const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); + const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]); + const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]); + const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); + + const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); + const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5); + const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1); + const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5); + const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3); + const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7); + const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3); + const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); + + res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1); + res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1); + res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3); + res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3); + res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5); + res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5); + res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7); + res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7); +} + +void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var_out_1st, + int32_t *var_out_2nd, int coeff_shift, + int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { + int32_t cost_first_8x8[8]; + int32_t cost_second_8x8[8]; + // Used to store the best cost for 2 8x8's. + int32_t best_cost[2] = { 0 }; + // Best direction for 2 8x8's. 
+ int best_dir[2] = { 0 }; + + const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift); + const __m256i const_128_reg = _mm256_set1_epi16(128); + __m256i lines[8]; + for (int i = 0; i < 8; i++) { + const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]); + const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]); + + lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1); + lines[i] = _mm256_sub_epi16( + _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg); + } + + /* Compute "mostly vertical" directions. */ + const __m256i dir47 = + compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4); + + /* Transpose and reverse the order of the lines. */ + array_reverse_transpose_8x8_avx2(lines, lines); + + /* Compute "mostly horizontal" directions. */ + const __m256i dir03 = + compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8); + + __m256i max = _mm256_max_epi32(dir03, dir47); + max = + _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8), + _mm256_slli_si256(max, 16 - (8)))); + max = + _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4), + _mm256_slli_si256(max, 16 - (4)))); + + const __m128i first_8x8_output = _mm256_castsi256_si128(max); + const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1); + const __m128i cmpeg_res_00 = + _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47)); + const __m128i cmpeg_res_01 = + _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03)); + const __m128i cmpeg_res_10 = + _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1)); + const __m128i cmpeg_res_11 = + _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1)); + const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00); + const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10); + + best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max)); + best_cost[1] = 
_mm_cvtsi128_si32(second_8x8_output); + best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8)); + best_dir[0] = + get_msb(best_dir[0] ^ (best_dir[0] - 1)); // Count trailing zeros + best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8)); + best_dir[1] = + get_msb(best_dir[1] ^ (best_dir[1] - 1)); // Count trailing zeros + + /* Difference between the optimal variance and the variance along the + orthogonal direction. Again, the sum(x^2) terms cancel out. */ + *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7]; + *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7]; + + /* We'd normally divide by 840, but dividing by 1024 is close enough + for what we're going to do with this. */ + *var_out_1st >>= 10; + *var_out_2nd >>= 10; + *out_dir_1st_8x8 = best_dir[0]; + *out_dir_2nd_8x8 = best_dir[1]; +} + +void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + int j = 0; + int remaining_width = width; + assert(height % 2 == 0); + assert(height > 0); + assert(width > 0); + + // Process multiple 32 pixels at a time. 
+ if (remaining_width > 31) { + int i = 0; + do { + j = 0; + do { + __m128i row00 = + _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]); + __m128i row01 = _mm_loadu_si128( + (const __m128i *)&src[(i + 0) * sstride + (j + 16)]); + __m128i row10 = + _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]); + __m128i row11 = _mm_loadu_si128( + (const __m128i *)&src[(i + 1) * sstride + (j + 16)]); + _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)], + _mm256_cvtepu8_epi16(row00)); + _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)], + _mm256_cvtepu8_epi16(row01)); + _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)], + _mm256_cvtepu8_epi16(row10)); + _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)], + _mm256_cvtepu8_epi16(row11)); + j += 32; + } while (j <= width - 32); + i += 2; + } while (i < height); + remaining_width = width & 31; + } + + // Process 16 pixels at a time. + if (remaining_width > 15) { + int i = 0; + do { + __m128i row0 = + _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]); + __m128i row1 = + _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]); + _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j], + _mm256_cvtepu8_epi16(row0)); + _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j], + _mm256_cvtepu8_epi16(row1)); + i += 2; + } while (i < height); + remaining_width = width & 15; + j += 16; + } + + // Process 8 pixels at a time. 
+ if (remaining_width > 7) { + int i = 0; + do { + __m128i row0 = + _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]); + __m128i row1 = + _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]); + _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j], + _mm_unpacklo_epi8(row0, _mm_setzero_si128())); + _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j], + _mm_unpacklo_epi8(row1, _mm_setzero_si128())); + i += 2; + } while (i < height); + remaining_width = width & 7; + j += 8; + } + + // Process 4 pixels at a time. + if (remaining_width > 3) { + int i = 0; + do { + __m128i row0 = + _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j])); + __m128i row1 = + _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j])); + _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j], + _mm_unpacklo_epi8(row0, _mm_setzero_si128())); + _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j], + _mm_unpacklo_epi8(row1, _mm_setzero_si128())); + i += 2; + } while (i < height); + remaining_width = width & 3; + j += 4; + } + + // Process the remaining pixels. + if (remaining_width) { + for (int i = 0; i < height; i++) { + for (int k = j; k < width; k++) { + dst[i * dstride + k] = src[i * sstride + k]; + } + } + } +} diff --git a/third_party/aom/av1/common/x86/cdef_block_sse2.c b/third_party/aom/av1/common/x86/cdef_block_sse2.c new file mode 100644 index 0000000000..5ab7ffa2ff --- /dev/null +++ b/third_party/aom/av1/common/x86/cdef_block_sse2.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse2 +#include "av1/common/cdef_block_simd.h" + +void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var_out_1st, + int32_t *var_out_2nd, int coeff_shift, + int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { + // Process first 8x8. + *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); + + // Process second 8x8. + *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); +} + +void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + int j = 0; + for (int i = 0; i < height; i++) { + for (j = 0; j < (width & ~0x7); j += 8) { + v64 row = v64_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); + } + for (; j < width; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} diff --git a/third_party/aom/av1/common/x86/cdef_block_sse4.c b/third_party/aom/av1/common/x86/cdef_block_sse4.c new file mode 100644 index 0000000000..344c1e47c9 --- /dev/null +++ b/third_party/aom/av1/common/x86/cdef_block_sse4.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse4_1 +#include "av1/common/cdef_block_simd.h" + +void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var_out_1st, + int32_t *var_out_2nd, int coeff_shift, + int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { + // Process first 8x8. + *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); + + // Process second 8x8. + *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); +} + +void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + int j = 0; + for (int i = 0; i < height; i++) { + for (j = 0; j < (width & ~0x7); j += 8) { + v64 row = v64_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); + } + for (; j < width; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} diff --git a/third_party/aom/av1/common/x86/cdef_block_ssse3.c b/third_party/aom/av1/common/x86/cdef_block_ssse3.c new file mode 100644 index 0000000000..0fb36eb6e0 --- /dev/null +++ b/third_party/aom/av1/common/x86/cdef_block_ssse3.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_ssse3 +#include "av1/common/cdef_block_simd.h" + +void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var_out_1st, + int32_t *var_out_2nd, int coeff_shift, + int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { + // Process first 8x8. + *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); + + // Process second 8x8. + *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); +} + +void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + int j; + for (int i = 0; i < height; i++) { + for (j = 0; j < (width & ~0x7); j += 8) { + v64 row = v64_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); + } + for (; j < width; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} diff --git a/third_party/aom/av1/common/x86/cfl_avx2.c b/third_party/aom/av1/common/x86/cfl_avx2.c new file mode 100644 index 0000000000..e1e187c4a6 --- /dev/null +++ b/third_party/aom/av1/common/x86/cfl_avx2.c @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +#include "av1/common/x86/cfl_simd.h" + +#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ + cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ + TX_SIZE tx_size) { \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \ + cfl_subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + return subfn_##sub[tx_size]; \ + } + +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + * + * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. 
+ */ +static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + const __m256i twos = _mm256_set1_epi8(2); // Thirty two twos + const int luma_stride = input_stride << 1; + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); + + __m256i top_16x16 = _mm256_maddubs_epi16(top, twos); + __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos); + __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16); + + _mm256_storeu_si256(row, sum_16x16); + + input += luma_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd) + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_16x16 = _mm256_maddubs_epi16(top, fours); + _mm256_storeu_si256(row, top_16x16); + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd) + +/** + * Multiplies the pixels by 8 (scaling in Q3). 
The AVX2 subsampling is only + * performed on block of width 32. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + const __m256i zeros = _mm256_setzero_si256(); + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i row_lo = _mm256_unpacklo_epi8(top, zeros); + row_lo = _mm256_slli_epi16(row_lo, 3); + __m256i row_hi = _mm256_unpackhi_epi8(top, zeros); + row_hi = _mm256_slli_epi16(row_hi, 3); + + _mm256_storeu_si256(row, row_lo); + _mm256_storeu_si256(row + 1, row_hi); + + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + * + * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. 
+ */ +static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + const int luma_stride = input_stride << 1; + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); + __m256i sum = _mm256_add_epi16(top, bot); + + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride)); + __m256i sum_1 = _mm256_add_epi16(top_1, bot_1); + + __m256i hsum = _mm256_hadd_epi16(sum, sum_1); + hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); + hsum = _mm256_add_epi16(hsum, hsum); + + _mm256_storeu_si256(row, hsum); + + input += luma_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd) + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
+ * + */ +static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + __m256i hsum = _mm256_hadd_epi16(top, top_1); + hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); + hsum = _mm256_slli_epi16(hsum, 2); + + _mm256_storeu_si256(row, hsum); + + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd) + +static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3)); + _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3)); + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12, + __m256i alpha_sign, __m256i dc_q0) { + __m256i ac_q3 = _mm256_loadu_si256(input); + __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3); + __m256i scaled_luma_q0 = + _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign); + return _mm256_add_epi16(scaled_luma_q0, dc_q0); +} + +static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + (void)width; + const __m256i 
alpha_sign = _mm256_set1_epi16(alpha_q3); + const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); + const __m256i dc_q0 = _mm256_set1_epi16(*dst); + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + + do { + __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + res = _mm256_packus_epi16(res, next); + res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i *)dst, res); + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_PREDICT_X(avx2, 32, 8, lbd) +CFL_PREDICT_X(avx2, 32, 16, lbd) +CFL_PREDICT_X(avx2, 32, 32, lbd) + +cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) { + static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = { + cfl_predict_lbd_4x4_ssse3, /* 4x4 */ + cfl_predict_lbd_8x8_ssse3, /* 8x8 */ + cfl_predict_lbd_16x16_ssse3, /* 16x16 */ + cfl_predict_lbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_lbd_4x8_ssse3, /* 4x8 */ + cfl_predict_lbd_8x4_ssse3, /* 8x4 */ + cfl_predict_lbd_8x16_ssse3, /* 8x16 */ + cfl_predict_lbd_16x8_ssse3, /* 16x8 */ + cfl_predict_lbd_16x32_ssse3, /* 16x32 */ + cfl_predict_lbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_lbd_4x16_ssse3, /* 4x16 */ + cfl_predict_lbd_16x4_ssse3, /* 16x4 */ + cfl_predict_lbd_8x32_ssse3, /* 8x32 */ + cfl_predict_lbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the + // function pointer array out of bounds. 
+ return pred[tx_size % TX_SIZES_ALL]; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static __m256i highbd_max_epi16(int bd) { + const __m256i neg_one = _mm256_set1_epi16(-1); + // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) + return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one); +} + +static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) { + return _mm256_max_epi16(_mm256_min_epi16(u, max), zero); +} + +static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + // Use SSSE3 version for smaller widths + assert(width == 16 || width == 32); + const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); + const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); + const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst); + const __m256i max = highbd_max_epi16(bd); + + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + _mm256_storeu_si256((__m256i *)dst, + highbd_clamp_epi16(res, _mm256_setzero_si256(), max)); + if (width == 32) { + const __m256i res_1 = + predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + _mm256_storeu_si256( + (__m256i *)(dst + 16), + highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max)); + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_PREDICT_X(avx2, 16, 4, hbd) +CFL_PREDICT_X(avx2, 16, 8, hbd) +CFL_PREDICT_X(avx2, 16, 16, hbd) +CFL_PREDICT_X(avx2, 16, 32, hbd) +CFL_PREDICT_X(avx2, 32, 8, hbd) +CFL_PREDICT_X(avx2, 32, 16, hbd) +CFL_PREDICT_X(avx2, 32, 32, hbd) + +cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) { + static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = { + cfl_predict_hbd_4x4_ssse3, /* 4x4 */ + cfl_predict_hbd_8x8_ssse3, /* 8x8 */ + cfl_predict_hbd_16x16_avx2, /* 16x16 */ + 
cfl_predict_hbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_hbd_4x8_ssse3, /* 4x8 */ + cfl_predict_hbd_8x4_ssse3, /* 8x4 */ + cfl_predict_hbd_8x16_ssse3, /* 8x16 */ + cfl_predict_hbd_16x8_avx2, /* 16x8 */ + cfl_predict_hbd_16x32_avx2, /* 16x32 */ + cfl_predict_hbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_hbd_4x16_ssse3, /* 4x16 */ + cfl_predict_hbd_16x4_avx2, /* 16x4 */ + cfl_predict_hbd_8x32_ssse3, /* 8x32 */ + cfl_predict_hbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the + // function pointer array out of bounds. + return pred[tx_size % TX_SIZES_ALL]; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Returns a vector where all the (32-bits) elements are the sum of all the +// lanes in a. +static INLINE __m256i fill_sum_epi32(__m256i a) { + // Given that a == [A, B, C, D, E, F, G, H] + a = _mm256_hadd_epi32(a, a); + // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H + // a == [A', C', A', C', E', G', E', G'] + a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)); + // a == [A', C', E', G', A', C', E', G'] + a = _mm256_hadd_epi32(a, a); + // Given that A'' == A' + C' and E'' == E' + G' + // a == [A'', E'', A'', E'', A'', E'', A'', E''] + return _mm256_hadd_epi32(a, a); + // Given that A''' == A'' + E'' + // a == [A''', A''', A''', A''', A''', A''', A''', A'''] +} + +static INLINE __m256i _mm256_addl_epi16(__m256i a) { + return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()), + _mm256_unpackhi_epi16(a, _mm256_setzero_si256())); +} + +static INLINE void subtract_average_avx2(const uint16_t *src_ptr, + int16_t *dst_ptr, int width, + int height, int round_offset, + int num_pel_log2) { + // Use SSE2 version for smaller widths + assert(width == 16 || width == 32); + + const __m256i *src = (__m256i 
*)src_ptr; + const __m256i *const end = src + height * CFL_BUF_LINE_I256; + // To maximize usage of the AVX2 registers, we sum two rows per loop + // iteration + const int step = 2 * CFL_BUF_LINE_I256; + + __m256i sum = _mm256_setzero_si256(); + // For width 32, we use a second sum accumulator to reduce accumulator + // dependencies in the loop. + __m256i sum2; + if (width == 32) sum2 = _mm256_setzero_si256(); + + do { + // Add top row to the bottom row + __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src), + _mm256_loadu_si256(src + CFL_BUF_LINE_I256)); + sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0)); + if (width == 32) { /* Don't worry, this if it gets optimized out. */ + // Add the second part of the top row to the second part of the bottom row + __m256i l1 = + _mm256_add_epi16(_mm256_loadu_si256(src + 1), + _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256)); + sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1)); + } + src += step; + } while (src < end); + // Combine both sum accumulators + if (width == 32) sum = _mm256_add_epi32(sum, sum2); + + __m256i fill = fill_sum_epi32(sum); + + __m256i avg_epi16 = _mm256_srli_epi32( + _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2); + avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16); + + // Store and subtract loop + src = (__m256i *)src_ptr; + __m256i *dst = (__m256i *)dst_ptr; + do { + _mm256_storeu_si256(dst, + _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16)); + if (width == 32) { + _mm256_storeu_si256( + dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16)); + } + src += CFL_BUF_LINE_I256; + dst += CFL_BUF_LINE_I256; + } while (src < end); +} + +// Declare wrappers for AVX2 sizes +CFL_SUB_AVG_X(avx2, 16, 4, 32, 6) +CFL_SUB_AVG_X(avx2, 16, 8, 64, 7) +CFL_SUB_AVG_X(avx2, 16, 16, 128, 8) +CFL_SUB_AVG_X(avx2, 16, 32, 256, 9) +CFL_SUB_AVG_X(avx2, 32, 8, 128, 8) +CFL_SUB_AVG_X(avx2, 32, 16, 256, 9) +CFL_SUB_AVG_X(avx2, 32, 32, 512, 10) + +// Based on the observation that for 
small blocks AVX2 does not outperform +// SSE2, we call the SSE2 code for block widths 4 and 8. +cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) { + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { + cfl_subtract_average_4x4_sse2, /* 4x4 */ + cfl_subtract_average_8x8_sse2, /* 8x8 */ + cfl_subtract_average_16x16_avx2, /* 16x16 */ + cfl_subtract_average_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_sse2, /* 4x8 */ + cfl_subtract_average_8x4_sse2, /* 8x4 */ + cfl_subtract_average_8x16_sse2, /* 8x16 */ + cfl_subtract_average_16x8_avx2, /* 16x8 */ + cfl_subtract_average_16x32_avx2, /* 16x32 */ + cfl_subtract_average_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_sse2, /* 4x16 */ + cfl_subtract_average_16x4_avx2, /* 16x4 */ + cfl_subtract_average_8x32_sse2, /* 8x32 */ + cfl_subtract_average_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to + // index the function pointer array out of bounds. + return sub_avg[tx_size % TX_SIZES_ALL]; +} diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h new file mode 100644 index 0000000000..03ae02a922 --- /dev/null +++ b/third_party/aom/av1/common/x86/cfl_simd.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_ +#define AOM_AV1_COMMON_X86_CFL_SIMD_H_ + +#include "av1/common/blockd.h" + +// SSSE3 version is optimal for with == 4, we reuse them in AVX2 +void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 16, we reuse it in AVX2 +void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for with == 4, we reuse them in AVX2 +void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, 
int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 16, we reuse it in AVX2 +void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for with == 4, we reuse them in AVX2 +void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 16, we reuse it in AVX2 +void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t 
*output_q3); +void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for with == 16, we reuse it in AVX2 +void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_hbd_422_8x4_ssse3(const 
uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for with == 16, we reuse it in AVX2 +void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for with == 16, we reuse it in AVX2 +void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void 
cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +#endif // CONFIG_AV1_HIGHBITDEPTH + +// SSE2 version is optimal for with == 4, we reuse them in AVX2 +void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst); + +// SSE2 version is optimal for with == 8, we reuse them in AVX2 +void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst); + +void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void 
cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_ diff --git a/third_party/aom/av1/common/x86/cfl_sse2.c b/third_party/aom/av1/common/x86/cfl_sse2.c new file mode 100644 index 0000000000..4783fe098c --- /dev/null +++ b/third_party/aom/av1/common/x86/cfl_sse2.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/cfl.h" +#include "config/av1_rtcd.h" + +static INLINE __m128i fill_sum_epi32(__m128i l0) { + l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2))); + return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1))); +} + +static INLINE void subtract_average_sse2(const uint16_t *src_ptr, + int16_t *dst_ptr, int width, + int height, int round_offset, + int num_pel_log2) { + const __m128i zeros = _mm_setzero_si128(); + const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset); + const __m128i *src = (__m128i *)src_ptr; + const __m128i *const end = src + height * CFL_BUF_LINE_I128; + const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4)); + + __m128i sum = zeros; + do { + __m128i l0; + if (width == 4) { + l0 = _mm_add_epi16(_mm_loadl_epi64(src), + _mm_loadl_epi64(src + CFL_BUF_LINE_I128)); + __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128), + _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128)); + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpacklo_epi16(l1, zeros))); + } else { + if (width == 8) { + l0 = _mm_add_epi16(_mm_loadu_si128(src), + _mm_loadu_si128(src + CFL_BUF_LINE_I128)); + } else { + l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1)); + } + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpackhi_epi16(l0, zeros))); + if (width == 32) { + l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3)); + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpackhi_epi16(l0, zeros))); + } + } + src += step; + } while (src 
< end); + + sum = fill_sum_epi32(sum); + + __m128i avg_epi16 = + _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2); + avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16); + + src = (__m128i *)src_ptr; + __m128i *dst = (__m128i *)dst_ptr; + do { + if (width == 4) { + _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16)); + } else { + _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16)); + if (width > 8) { + _mm_storeu_si128(dst + 1, + _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16)); + if (width == 32) { + _mm_storeu_si128(dst + 2, + _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16)); + _mm_storeu_si128(dst + 3, + _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16)); + } + } + } + src += CFL_BUF_LINE_I128; + dst += CFL_BUF_LINE_I128; + } while (src < end); +} + +CFL_SUB_AVG_FN(sse2) diff --git a/third_party/aom/av1/common/x86/cfl_ssse3.c b/third_party/aom/av1/common/x86/cfl_ssse3.c new file mode 100644 index 0000000000..476b6609a9 --- /dev/null +++ b/third_party/aom/av1/common/x86/cfl_ssse3.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +#include "av1/common/x86/cfl_simd.h" + +// Load 32-bit integer from memory into the first element of dst. +static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) { + return _mm_cvtsi32_si128(*((int *)mem_addr)); +} + +// Store 32-bit integer from the first element of a into memory. 
+static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) { + *((int *)mem_addr) = _mm_cvtsi128_si32(a); +} + +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i twos = _mm_set1_epi8(2); + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + __m128i top = _mm_loadh_epi32((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storeh_epi32(pred_buf_m128i, sum); + } else if (width == 8) { + __m128i top = _mm_loadl_epi64((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storel_epi64(pred_buf_m128i, sum); + } else { + __m128i top = _mm_loadu_si128((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storeu_si128(pred_buf_m128i, sum); + if (width == 32) { + __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + __m128i bot_1 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); + top_1 = 
_mm_maddubs_epi16(top_1, twos); + bot_1 = _mm_maddubs_epi16(bot_1, twos); + __m128i sum_1 = _mm_add_epi16(top_1, bot_1); + _mm_storeu_si128(pred_buf_m128i + 1, sum_1); + } + } + input += luma_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i fours = _mm_set1_epi8(4); + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + __m128i top = _mm_loadh_epi32((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storeh_epi32(pred_buf_m128i, top); + } else if (width == 8) { + __m128i top = _mm_loadl_epi64((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storel_epi64(pred_buf_m128i, top); + } else { + __m128i top = _mm_loadu_si128((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storeu_si128(pred_buf_m128i, top); + if (width == 32) { + __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + top_1 = _mm_maddubs_epi16(top_1, fours); + _mm_storeu_si128(pred_buf_m128i + 1, top_1); + } + } + input += input_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +/** + * Multiplies the pixels by 8 (scaling in Q3). + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. 
+ * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i zeros = _mm_setzero_si128(); + const int luma_stride = input_stride; + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + __m128i row = _mm_loadh_epi32((__m128i *)input); + row = _mm_unpacklo_epi8(row, zeros); + _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3)); + } else if (width == 8) { + __m128i row = _mm_loadl_epi64((__m128i *)input); + row = _mm_unpacklo_epi8(row, zeros); + _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3)); + } else { + __m128i row = _mm_loadu_si128((__m128i *)input); + const __m128i row_lo = _mm_unpacklo_epi8(row, zeros); + const __m128i row_hi = _mm_unpackhi_epi8(row, zeros); + _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3)); + _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3)); + if (width == 32) { + __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros); + const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros); + _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3)); + _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3)); + } + } + input += luma_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +#if CONFIG_AV1_HIGHBITDEPTH +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. 
+ * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const __m128i top = _mm_loadl_epi64((__m128i *)input); + const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); + __m128i sum = _mm_add_epi16(top, bot); + sum = _mm_hadd_epi16(sum, sum); + *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum)); + } else { + const __m128i top = _mm_loadu_si128((__m128i *)input); + const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); + __m128i sum = _mm_add_epi16(top, bot); + if (width == 8) { + sum = _mm_hadd_epi16(sum, sum); + _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); + } else { + const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i bot_1 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); + sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1)); + _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); + if (width == 32) { + const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); + const __m128i bot_2 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2); + const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); + const __m128i bot_3 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3); + const __m128i sum_2 = _mm_add_epi16(top_2, bot_2); + const __m128i sum_3 = _mm_add_epi16(top_3, bot_3); + __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, + _mm_add_epi16(next_sum, next_sum)); + } + } + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies 
them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + const __m128i top = _mm_loadl_epi64((__m128i *)input); + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); + _mm_storeh_epi32(pred_buf_m128i, sum); + } else { + const __m128i top = _mm_loadu_si128((__m128i *)input); + if (width == 8) { + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); + _mm_storel_epi64(pred_buf_m128i, sum); + } else { + const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2); + _mm_storeu_si128(pred_buf_m128i, sum); + if (width == 32) { + const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); + const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); + const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2); + _mm_storeu_si128(pred_buf_m128i + 1, sum_1); + } + } + } + pred_buf_m128i += CFL_BUF_LINE_I128; + input += input_stride; + } while (pred_buf_m128i < end); +} + +static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3); + _mm_storel_epi64((__m128i *)pred_buf_q3, row); + } else { + const __m128i row = 
_mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3); + _mm_storeu_si128((__m128i *)pred_buf_q3, row); + if (width >= 16) { + __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); + row_1 = _mm_slli_epi16(row_1, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1); + if (width == 32) { + __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2); + row_2 = _mm_slli_epi16(row_2, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2); + __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3); + row_3 = _mm_slli_epi16(row_3, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3); + } + } + } + input += input_stride; + pred_buf_q3 += CFL_BUF_LINE; + } while (pred_buf_q3 < end); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +CFL_GET_SUBSAMPLE_FUNCTION(ssse3) + +static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, + __m128i alpha_sign, __m128i dc_q0) { + __m128i ac_q3 = _mm_loadu_si128(input); + __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); + return _mm_add_epi16(scaled_luma_q0, dc_q0); +} + +static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + const __m128i dc_q0 = _mm_set1_epi16(*dst); + __m128i *row = (__m128i *)pred_buf_q3; + const __m128i *row_end = row + height * CFL_BUF_LINE_I128; + do { + __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + if (width < 16) { + res = _mm_packus_epi16(res, res); + if (width == 4) + _mm_storeh_epi32((__m128i *)dst, res); + else + _mm_storel_epi64((__m128i *)dst, res); + } else { + __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + res = _mm_packus_epi16(res, next); + _mm_storeu_si128((__m128i 
*)dst, res); + if (width == 32) { + res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); + next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); + res = _mm_packus_epi16(res, next); + _mm_storeu_si128((__m128i *)(dst + 16), res); + } + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I128) < row_end); +} + +CFL_PREDICT_FN(ssse3, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE __m128i highbd_max_epi16(int bd) { + const __m128i neg_one = _mm_set1_epi16(-1); + // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) + return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one); +} + +static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) { + return _mm_max_epi16(_mm_min_epi16(u, max), zero); +} + +static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + const __m128i dc_q0 = _mm_set1_epi16(*dst); + const __m128i max = highbd_max_epi16(bd); + const __m128i zeros = _mm_setzero_si128(); + __m128i *row = (__m128i *)pred_buf_q3; + const __m128i *row_end = row + height * CFL_BUF_LINE_I128; + do { + __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + res = highbd_clamp_epi16(res, zeros, max); + if (width == 4) { + _mm_storel_epi64((__m128i *)dst, res); + } else { + _mm_storeu_si128((__m128i *)dst, res); + } + if (width >= 16) { + const __m128i res_1 = + predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128(((__m128i *)dst) + 1, + highbd_clamp_epi16(res_1, zeros, max)); + } + if (width == 32) { + const __m128i res_2 = + predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128((__m128i *)(dst + 16), + highbd_clamp_epi16(res_2, zeros, max)); + const __m128i res_3 = + predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); + 
_mm_storeu_si128((__m128i *)(dst + 24), + highbd_clamp_epi16(res_3, zeros, max)); + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I128) < row_end); +} + +CFL_PREDICT_FN(ssse3, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c new file mode 100644 index 0000000000..1b39a0a8d5 --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/av1_rtcd.h" + +#include "third_party/SVT-AV1/convolve_2d_avx2.h" + +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" + +#include "av1/common/convolve.h" + +void av1_convolve_2d_sr_general_avx2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params) { + if (filter_params_x->taps > 8) { + const int bd = 8; + int im_stride = 8, i; + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); + + const __m256i round_const_h12 = _mm256_set1_epi32( + ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0); + + const __m256i sum_round_v = _mm256_set1_epi32( + (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + const __m256i round_const_v = _mm256_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift_v = _mm_cvtsi32_si128(bits); + + __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 }; + + int horiz_tap = 12; + int vert_tap = 12; + + prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h); + prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v); + + int im_h = h + vert_tap - 1; + const int fo_vert = vert_tap / 2 - 1; + const int fo_horiz = horiz_tap / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + for (int j = 0; j < w; j += 8) { + 
CONVOLVE_SR_HORIZONTAL_FILTER_12TAP + CONVOLVE_SR_VERTICAL_FILTER_12TAP + } + } else { + const int bd = 8; + int im_stride = 8, i; + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); + + const __m256i round_const_h = + _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) + + (1 << (bd + FILTER_BITS - 2))); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + const __m256i sum_round_v = _mm256_set1_epi32( + (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + const __m256i round_const_v = _mm256_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift_v = _mm_cvtsi32_si128(bits); + + __m256i filt[4], coeffs_h[4], coeffs_v[4]; + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); + + int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); + int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); + + if (horiz_tap == 6) + prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h); + else + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); + + if (vert_tap == 6) + prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v); + else + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); + + int im_h = h + vert_tap - 1; + const int fo_vert = vert_tap / 2 - 1; + const int fo_horiz = horiz_tap / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i 
const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + for (int j = 0; j < w; j += 8) { + if (horiz_tap == 4) { + CONVOLVE_SR_HORIZONTAL_FILTER_4TAP + } else if (horiz_tap == 6) { + CONVOLVE_SR_HORIZONTAL_FILTER_6TAP + } else { + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP + } + + if (vert_tap == 4) { + CONVOLVE_SR_VERTICAL_FILTER_4TAP + } else if (vert_tap == 6) { + CONVOLVE_SR_VERTICAL_FILTER_6TAP + } else { + CONVOLVE_SR_VERTICAL_FILTER_8TAP + } + } + } +} + +void av1_convolve_2d_sr_avx2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int32_t w, int32_t h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4, + const int32_t subpel_y_q4, ConvolveParams *conv_params) { + const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4); + const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4); + + const bool use_general = (tap_x == 12 || tap_y == 12); + if (use_general) { + av1_convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_q4, subpel_y_q4, conv_params); + } else { + av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_q4, subpel_y_q4, conv_params); + } +} diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c new file mode 100644 index 0000000000..1b85f37294 --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "av1/common/convolve.h" + +void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); + __m128i coeffs[6]; + + /* Horizontal filter */ + { + prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs); + + const __m128i round_const = _mm_set1_epi32( + (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data_2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 4)]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, 
coeffs[0]); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); + const __m128i src_4 = _mm_unpacklo_epi8(data_2, zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); + const __m128i src_6 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 2), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); + const __m128i src_8 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 4), zero); + const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]); + const __m128i src_10 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 6), zero); + const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]); + + const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]); + const __m128i src_5 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 1), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]); + const __m128i src_7 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 3), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]); + const __m128i src_9 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 5), zero); + const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]); + const __m128i src_11 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 7), zero); + const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]); + + const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357); + res_odd = + 
_mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs); + + const __m128i sum_round = + _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + const __m128i src_8 = + _mm_unpacklo_epi16(*(__m128i *)(data + 8 * im_stride), + *(__m128i *)(data + 9 * im_stride)); + const __m128i src_10 = + _mm_unpacklo_epi16(*(__m128i *)(data + 10 * im_stride), + *(__m128i *)(data + 11 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); + const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); + const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); + const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); + const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]); + const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]); + + const __m128i res_0246 = 
_mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + const __m128i src_9 = + _mm_unpackhi_epi16(*(__m128i *)(data + 8 * im_stride), + *(__m128i *)(data + 9 * im_stride)); + const __m128i src_11 = + _mm_unpackhi_epi16(*(__m128i *)(data + 10 * im_stride), + *(__m128i *)(data + 11 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]); + const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]); + const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]); + const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]); + const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]); + const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]); + + const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift); + __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift); + + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + + _mm_storel_epi64(p, res); + } + } + } +} + +void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + if (filter_params_x->taps > 8) { + if (w < 8) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + } else { + av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_qn, subpel_y_qn, conv_params); + } + } else { + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - 
conv_params->round_0; + + assert(conv_params->round_0 > 0); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i 
res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i sum_round = _mm_set1_epi32( + (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + + for (i = 0; i < h; ++i) { + for (j = 0; j 
< w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        __m128i res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+        __m128i res_hi_round =
+            _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+                                     round_shift);
+        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+                                     round_shift);
+
+        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+        const __m128i res = _mm_packus_epi16(res16, res16);
+
+        // Accumulate values into the destination buffer
+        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+        if (w == 2) {
+          *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
+        } else if (w == 4) {
+          *(int *)p = _mm_cvtsi128_si32(res);
+        } else {
+          _mm_storel_epi64(p, res);
+        }
+      }
+    }
+  }
+}
+// Copy path (no filter taps): src -> conv_params->dst, or averaged into dst0.
+void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
+                                        uint8_t *dst0, int dst_stride0, int w,
+                                        int h, ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  // Each 8-bit pixel is zero-extended, shifted left by `bits`, then biased.
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  int i, j;
+  // fwd_offset/bck_offset alternate in 16-bit lanes of wt, as comp_avg() input.
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi16(w0);
+  const __m128i wt1 = _mm_set1_epi16(w1);
+  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+  // Bias added to every shifted pixel; also handed to convolve_rounding().
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+  // Only widths that are multiples of 4 are supported.
+  assert((w % 4) == 0);
+  // Multiples of 16 take 16 pixels per step; other widths take 8 per step.
+  if (!(w % 16)) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 16) {
+        const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
+        // Zero-extend the 16 bytes into two vectors of eight 16-bit lanes.
+        const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero);
+        const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero);
+
+        const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift);
+        const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const);
+
+        const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift);
+        const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const);
+        // do_average: combine with what dst already holds and emit 8-bit dst0.
+        if (do_average) {
+          const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j]));
+          const __m128i data_ref_0_hi =
+              _mm_loadu_si128((__m128i *)(&dst[j + 8]));
+
+          const __m128i comp_avg_res_lo = comp_avg(
+              &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg);
+
+          const __m128i round_result_lo = convolve_rounding(
+              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i comp_avg_res_hi = comp_avg(
+              &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg);
+
+          const __m128i round_result_hi = convolve_rounding(
+              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_8 =
+              _mm_packus_epi16(round_result_lo, round_result_hi);
+          // Aligned store: &dst0[j] must be 16-byte aligned here.
+          _mm_store_si128((__m128i *)(&dst0[j]), res_8);
+        } else {
+          _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo);
+          _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi);
+        }
+      }
+      src += src_stride;
+      dst += dst_stride;
+      dst0 += dst_stride0;
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
+        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
+        // Note: 8 source bytes are read per step, also when w == 4.
+        const __m128i res = _mm_sll_epi16(d16_0, left_shift);
+        const __m128i res_unsigned = _mm_add_epi16(res, offset_const);
+
+        if (do_average) {
+          const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
+
+          const __m128i comp_avg_res =
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+          const __m128i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+          // Write 8 result bytes, or only the low 4 when w <= 4.
+          if (w > 4)
+            _mm_storel_epi64((__m128i *)(&dst0[j]), res_8);
+          else
+            *(int *)(&dst0[j]) = _mm_cvtsi128_si32(res_8);
+        } else {
+          _mm_store_si128((__m128i *)(&dst[j]), res_unsigned);  // full 128 bits
+        }
+      }
+      src += src_stride;
+      dst += dst_stride;
+      dst0 += dst_stride0;
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c
new file mode 100644
index 0000000000..3862bbeac1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_avx2.c
@@ -0,0 +1,916 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include  // NOTE(review): header name looks lost in extraction (presumably <immintrin.h>) - confirm
+
+#include "config/av1_rtcd.h"
+
+#include "third_party/SVT-AV1/convolve_avx2.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/synonyms.h"
+// Vertical 8-bit convolve (AVX2): one branch per tap count, two rows per pass.
+static AOM_INLINE void av1_convolve_y_sr_general_avx2(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
+  // right shift is F-1 because we are already dividing
+  // filter co-efficients by 2
+  const int right_shift_bits = (FILTER_BITS - 1);
+  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
+  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);
+  // coeffs: packed taps; s: interleaved row-pair window; d: raw loaded rows.
+  __m256i coeffs[6], s[12];
+  __m128i d[10];
+  // NOTE(review): the caller visible in this file only dispatches 12 taps here.
+  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
+  // Pack the taps with the helper that matches the branch taken below.
+  if (vert_tap == 6)
+    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
+  else if (vert_tap == 12) {
+    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
+  } else {
+    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
+  }
+
+  // vert_filt as 4 tap
+  if (vert_tap == 4) {
+    const int fo_vert = 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride;
+    for (int j = 0; j < w; j += 16) {
+      const uint8_t *data = &src_ptr[j];
+      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+
+      // Load lines a and b. Line a to lower 128, line b to upper 128
+      const __m256i src_01a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+      const __m256i src_12a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+      const __m256i src_23a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+      const __m256i src_34a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+
+      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+        const __m256i src_45a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+        const __m256i src_56a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
+
+        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
+        /* rounding code */
+        // shift by F - 1
+        const __m256i res_16b_lo = _mm256_sra_epi16(
+            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+        if (w - j > 8) {
+          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
+
+          /* rounding code */
+          // shift by F - 1
+          const __m256i res_16b_hi = _mm256_sra_epi16(
+              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+          const __m128i res_0 = _mm256_castsi256_si128(res_a);
+          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_1);
+        } else {
+          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+          if (w - j > 4) {
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_1);
+          } else if (w - j > 2) {
+            xx_storel_32(&dst[i * dst_stride + j], res_0);
+            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+          } else {
+            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+            __m128i *const p_1 =
+                (__m128i *)&dst[i * dst_stride + j + dst_stride];
+            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+          }
+        }
+        s[0] = s[1];  // slide the row-pair window down by two rows
+        s[1] = s[2];
+
+        s[3] = s[4];
+        s[4] = s[5];
+      }
+    }
+  } else if (vert_tap == 6) {
+    const int fo_vert = vert_tap / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+    for (int j = 0; j < w; j += 16) {
+      const uint8_t *data = &src_ptr[j];
+      __m256i src6;
+
+      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      // Load lines a and b. Line a to lower 128, line b to upper 128
+      const __m256i src_01a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+      const __m256i src_12a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+      const __m256i src_23a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+      src6 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
+      const __m256i src_34a =
+          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
+
+      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+
+      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+        const __m256i src_45a = _mm256_permute2x128_si256(
+            src6,
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+            0x20);
+
+        src6 = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+        const __m256i src_56a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+            src6, 0x20);
+
+        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
+
+        /* rounding code */
+        // shift by F - 1
+        const __m256i res_16b_lo = _mm256_sra_epi16(
+            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+        if (w - j > 8) {
+          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
+
+          /* rounding code */
+          // shift by F - 1
+          const __m256i res_16b_hi = _mm256_sra_epi16(
+              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+          const __m128i res_0 = _mm256_castsi256_si128(res_a);
+          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_1);
+        } else {
+          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+          if (w - j > 4) {
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_1);
+          } else if (w - j > 2) {
+            xx_storel_32(&dst[i * dst_stride + j], res_0);
+            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+          } else {
+            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+            __m128i *const p_1 =
+                (__m128i *)&dst[i * dst_stride + j + dst_stride];
+            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+          }
+        }
+        s[0] = s[1];  // slide the row-pair window down by two rows
+        s[1] = s[2];
+        s[3] = s[4];
+        s[4] = s[5];
+      }
+    }
+  } else if (vert_tap == 12) {  // vert_tap == 12
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride;
+    const __m256i v_zero = _mm256_setzero_si256();
+    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
+    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
+    // This branch rounds 32-bit sums by the full FILTER_BITS, 8 columns a pass.
+    for (int j = 0; j < w; j += 8) {
+      const uint8_t *data = &src_ptr[j];
+      __m256i src10;
+
+      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
+      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
+      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
+      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
+      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
+      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
+      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
+      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
+      // Load lines a and b. Line a to lower 128, line b to upper 128
+      const __m256i src_01a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+      const __m256i src_12a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+      const __m256i src_23a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+      const __m256i src_34a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+      const __m256i src_45a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+      const __m256i src_56a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);
+
+      const __m256i src_67a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);
+
+      const __m256i src_78a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);
+
+      const __m256i src_89a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);
+
+      src10 = _mm256_castsi128_si256(
+          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
+      const __m256i src_910a =
+          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);
+      // Zero-extend each row pair to 16-bit lanes before interleaving rows.
+      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
+      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
+      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
+      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
+      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
+      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
+      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
+      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
+      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
+      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);
+
+      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
+      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
+      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
+      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
+      s[4] = _mm256_unpacklo_epi16(src_89, src_910);
+
+      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
+      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
+      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
+      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
+      s[10] = _mm256_unpackhi_epi16(src_89, src_910);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+        const __m256i src_1011a = _mm256_permute2x128_si256(
+            src10,
+            _mm256_castsi128_si256(
+                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
+            0x20);
+
+        src10 = _mm256_castsi128_si256(
+            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));
+
+        const __m256i src_1112a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
+            src10, 0x20);
+
+        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
+        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);
+
+        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
+        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);
+
+        const __m256i res_lo = convolve_12taps(s, coeffs);
+
+        const __m256i res_32b_lo = _mm256_sra_epi32(
+            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
+        // 8 bit conversion and saturation to uint8
+        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
+        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+        if (w - j > 4) {
+          const __m256i res_hi = convolve_12taps(s + 6, coeffs);
+
+          const __m256i res_32b_hi = _mm256_sra_epi32(
+              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
+          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);
+
+          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
+          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_1);
+        } else {
+          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+          if (w - j > 2) {
+            // Same helper as the other branches; no store through (int *).
+            xx_storel_32(&dst[i * dst_stride + j], res_0);
+            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+          } else {  // NOTE(review): 2-byte stores still use a (uint16_t *) cast
+            *(uint16_t *)&dst[i * dst_stride + j] =
+                (uint16_t)_mm_cvtsi128_si32(res_0);
+            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
+                (uint16_t)_mm_cvtsi128_si32(res_1);
+          }
+        }
+        s[0] = s[1];  // slide the row-pair window down by two rows
+        s[1] = s[2];
+        s[2] = s[3];
+        s[3] = s[4];
+        s[4] = s[5];
+
+        s[6] = s[7];
+        s[7] = s[8];
+        s[8] = s[9];
+        s[9] = s[10];
+        s[10] = s[11];
+      }
+    }
+  } else {  // every other tap count: handled by convolve_lowbd()
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+    for (int j = 0; j < w; j += 16) {
+      const uint8_t *data = &src_ptr[j];
+      __m256i src6;
+
+      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+      // Load lines a and b. Line a to lower 128, line b to upper 128
+      const __m256i src_01a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+      const __m256i src_12a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+      const __m256i src_23a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+      const __m256i src_34a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+      const __m256i src_45a = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+      src6 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+      const __m256i src_56a =
+          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
+
+      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+        const __m256i src_67a = _mm256_permute2x128_si256(
+            src6,
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            0x20);
+
+        src6 = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+        const __m256i src_78a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            src6, 0x20);
+
+        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+        const __m256i res_lo = convolve_lowbd(s, coeffs);
+
+        /* rounding code */
+        // shift by F - 1
+        const __m256i res_16b_lo = _mm256_sra_epi16(
+            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+        if (w - j > 8) {
+          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+          /* rounding code */
+          // shift by F - 1
+          const __m256i res_16b_hi = _mm256_sra_epi16(
+              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+          const __m128i res_0 = _mm256_castsi256_si128(res_a);
+          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_1);
+        } else {
+          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+          if (w - j > 4) {
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_1);
+          } else if (w - j > 2) {
+            xx_storel_32(&dst[i * dst_stride + j], res_0);
+            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+          } else {
+            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+            __m128i *const p_1 =
+                (__m128i *)&dst[i * dst_stride + j + dst_stride];
+            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+          }
+        }
+        s[0] = s[1];  // slide the row-pair window down by two rows
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+    }
+  }
+}
+
+void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t w,
+                            int32_t h,
+                            const InterpFilterParams *filter_params_y,
+                            const int32_t subpel_y_q4) {
+  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
+
+  if (vert_tap == 12) {
+    av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_y, subpel_y_q4);
+  } else {
+    av1_convolve_y_sr_specialized_avx2(src, src_stride, dst,
dst_stride, w, h, + filter_params_y, subpel_y_q4); + } +} + +static AOM_INLINE void av1_convolve_x_sr_general_avx2( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_shift = _mm_cvtsi32_si128(bits); + __m256i round_0_const = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); + __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); + __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); + int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + assert(conv_params->round_0 > 0); + + __m256i coeffs[6], filt[4]; + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + if (horiz_tap == 6) + prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs); + else if (horiz_tap == 12) { + prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs); + } else { + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); + } + + // horz_filt as 4 tap + if (horiz_tap == 4) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + 
// 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } + } + } else if (horiz_tap == 6) { + const int fo_horiz = horiz_tap / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const 
*)(filt_global_avx2 + 32 * 3)); + + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + 
__m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } + } + } else if (horiz_tap == 12) { // horiz_tap == 12 + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + const __m256i v_zero = _mm256_setzero_si256(); + round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1); + round_const = _mm256_set1_epi32((1 << bits) >> 1); + round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); + __m256i s[6]; + + if (w <= 4) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + // row0 0..7 row1 0..7 + const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); + // row0 8..F row1 8..F + const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); + + // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03 + const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); + // row0 04 04 .. 07 07 row1 04 04 .. 07 07 + const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); + + // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B + const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); + // row0 0C 0C .. 0F 0F row1 0C 0C .. 
0F 0F + const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); + + // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14 + s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); + // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 + s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); + // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18 + s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); + // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A + s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); + // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C + s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); + // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E + s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); + + const __m256i res_lo = convolve_12taps(s, coeffs); + + __m256i res_32b_lo = _mm256_sra_epi32( + _mm256_add_epi32(res_lo, round_0_const), round_0_shift); + + // 00 01 02 03 10 12 13 14 + res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const), + round_shift); + // 8 bit conversion and saturation to uint8 + // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13 + __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); + // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 + // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 + const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); + // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w > 2) { + // 00 01 02 03 + *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0); + // 10 11 12 13 + *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1); + } else { + // 00 01 + *(uint16_t *)&dst[i * dst_stride] = + (uint16_t)_mm_cvtsi128_si32(res_0); + // 10 11 + *(uint16_t *)&dst[i * dst_stride + dst_stride] = + (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; i++) { + for (int j = 0; j < w; j += 8) { + 
const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + j + 4]))), + 0x20); + // row0 0..7 4..B + const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); + // row0 8..F C..13 + const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); + + // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07 + const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); + // row0 04 04 .. 07 07 08 08 .. 0B 0B + const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); + + // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F + const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); + // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13 + const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); + + s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); + s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); + s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); + s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); + s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); + s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); + + const __m256i res_lo = convolve_12taps(s, coeffs); + + __m256i res_32b_lo = _mm256_sra_epi32( + _mm256_add_epi32(res_lo, round_0_const), round_0_shift); + + res_32b_lo = _mm256_sra_epi32( + _mm256_add_epi32(res_32b_lo, round_const), round_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0); + *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1); + } + } + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + filt[2] = 
_mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = 
_mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } + } + } +} + +void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t w, + int32_t h, + const InterpFilterParams *filter_params_x, + const int32_t subpel_x_q4, + ConvolveParams *conv_params) { + const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4); + + if (horz_tap == 12) { + av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_q4, conv_params); + } else { + av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_q4, + conv_params); + } +} diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c new file mode 100644 index 0000000000..012e75c1ae --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_sse2.c @@ -0,0 +1,500 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "av1/common/convolve.h" + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m128i *const coeffs /* [4] */) { + const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1 + coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 +} + +static INLINE __m128i convolve(const __m128i *const s, + const __m128i *const coeffs) { + const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]); + const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3)); + return d; +} + +static INLINE __m128i convolve_lo_x(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[2],
_mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_hi_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + int subpel_y_qn) { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_vert * src_stride; + const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS); + __m128i coeffs[6]; + + prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs); + + int j = 0; + do { + __m128i s[12], src10, res_lo, res_hi; + __m128i res_lo_round, res_hi_round, res16, res; + const uint8_t *data = &src_ptr[j]; + + src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)); + s[0] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = + 
_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 6 * src_stride))); + s[6] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + s[7] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 8 * src_stride))); + s[8] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 9 * src_stride))); + s[9] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10); + + int i = 0; + do { + data = &src_ptr[i * src_stride + j]; + s[10] = _mm_unpacklo_epi8( + src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))); + src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)); + s[11] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10); + + res_lo = convolve_lo_y_12tap(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y_12tap(s, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + res_lo = convolve_lo_y_12tap(s + 1, coeffs); // Filter low index pixels + res_hi = convolve_hi_y_12tap(s + 1, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = 
s[7]; + s[6] = s[8]; + s[7] = s[9]; + s[8] = s[10]; + s[9] = s[11]; + } while (i < h); + j += 8; + } while (j < w); +} +/* Vertical convolve into an 8-bit dst, rounded by FILTER_BITS. Filters with more than 8 taps are forwarded to av1_convolve_y_sr_c() when w < 8 and to av1_convolve_y_sr_12tap_sse2() otherwise; all other filters are handled here with the four coefficient pairs from prepare_coeffs(), in a w <= 4 path and a w % 8 == 0 path. */ +void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + if (filter_params_y->taps > 8) { + if (w < 8) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); + } else { + av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_vert * src_stride; + const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS); + __m128i coeffs[4]; + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); + + if (w <= 4) { + __m128i s[8], src6, res, res_round, res16; + int res_int; /* NOTE(review): this branch reads src_ptr through *(int *) and writes dst through *(int *) / *(uint16_t *); both are uint8_t buffers and nothing visible here guarantees 4- or 2-byte alignment. av1_convolve_x_sr_12tap_sse2() in this file uses memcpy for the equivalent 4-byte store - confirm whether these accesses should do the same. */ + src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6); + + do { + s[6] = _mm_unpacklo_epi8( + src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride))); + src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 *
src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6); + + res = convolve_lo_y(s + 0, coeffs); + res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); + res16 = _mm_packs_epi32(res_round, res_round); + res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); + + if (w == 2) + *(uint16_t *)dst = (uint16_t)res_int; + else + *(int *)dst = res_int; + + src_ptr += src_stride; + dst += dst_stride; + + res = convolve_lo_y(s + 1, coeffs); + res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); + res16 = _mm_packs_epi32(res_round, res_round); + res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); + + if (w == 2) + *(uint16_t *)dst = (uint16_t)res_int; + else + *(int *)dst = res_int; + + src_ptr += src_stride; + dst += dst_stride; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + h -= 2; /* two rows are written per iteration and the loop ends only when h reaches exactly 0, so h must be even and positive on entry */ + } while (h); + } else { + assert(!(w % 8)); + int j = 0; /* left edge of the current 8-pixel-wide column strip */ + do { + __m128i s[8], src6, res_lo, res_hi; + __m128i res_lo_round, res_hi_round, res16, res; + const uint8_t *data = &src_ptr[j]; + + src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); + + int i = 0; + do { +
data = &src_ptr[i * src_stride + j]; + s[6] = _mm_unpacklo_epi8( + src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); + + res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + } while (i < h); + j += 8; + } while (j < w); + } + } +} + +void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + int subpel_x_qn, + ConvolveParams *conv_params) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_0_const = + _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift =
_mm_cvtsi32_si128(bits); + const __m128i zero = _mm_setzero_si128(); + __m128i coeffs[6]; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs); + + int i = 0; + do { + int j = 0; + do { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + __m128i s[4]; + + s[0] = _mm_unpacklo_epi16(data, _mm_srli_si128(data, 1)); + s[1] = + _mm_unpacklo_epi16(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); + s[2] = + _mm_unpacklo_epi16(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); + s[3] = + _mm_unpacklo_epi16(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); + + const __m128i res32 = convolve_lo_x_12tap(s, coeffs, zero); + + __m128i res32_round = + _mm_sra_epi32(_mm_add_epi32(res32, round_0_const), round_0_shift); + res32_round = + _mm_sra_epi32(_mm_add_epi32(res32_round, round_const), round_shift); + + const __m128i res16 = _mm_packs_epi32(res32_round, zero); + const __m128i res = _mm_packus_epi16(res16, zero); + + const int val = _mm_cvtsi128_si32(res); + memcpy((dst + i * dst_stride + j), &val, sizeof(val)); + j += 4; + } while (j < w); + } while (++i < h); +} + +void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + if (filter_params_x->taps > 8) { + if (w < 4) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params); + } else { + av1_convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params); + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_0_const = + _mm_set1_epi32((1 << 
conv_params->round_0) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + __m128i coeffs[4]; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); + + if (w <= 4) { + do { + const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); + __m128i s[4]; + + s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); + s[1] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); + s[2] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); + s[3] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); + const __m128i res_lo = convolve_lo_x(s, coeffs); + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + int r = _mm_cvtsi128_si32(res); + if (w == 2) + *(uint16_t *)dst = (uint16_t)r; + else + *(int *)dst = r; + + src_ptr += src_stride; + dst += dst_stride; + } while (--h); + } else { + assert(!(w % 8)); + int i = 0; + do { + int j = 0; + do { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + __m128i s[4]; + + // Filter even-index pixels + s[0] = data; + s[1] = _mm_srli_si128(data, 2); + s[2] = _mm_srli_si128(data, 4); + s[3] = _mm_srli_si128(data, 6); + const __m128i res_even = convolve_lo_x(s, coeffs); + + // Filter odd-index pixels + s[0] = _mm_srli_si128(data, 1); + s[1] = _mm_srli_si128(data, 3); + s[2] = _mm_srli_si128(data, 5); + s[3] = _mm_srli_si128(data, 7); + const __m128i res_odd = convolve_lo_x(s, coeffs); + + // Rearrange pixels 
back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + __m128i res_lo_round = _mm_sra_epi32( + _mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + __m128i res_hi_round = _mm_sra_epi32( + _mm_add_epi32(res_hi, round_0_const), round_0_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + j += 8; + } while (j < w); + } while (++i < h); + } + } +} diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c new file mode 100644 index 0000000000..d05bb0e15f --- /dev/null +++ b/third_party/aom/av1/common/x86/filterintra_sse4.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <smmintrin.h> +#include <string.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/enums.h" +#include "av1/common/reconintra.h" + +//------------------------------------------------------------------------------ +// filter_intra_predictor_sse4_1 + +// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which +// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes. +#define DUPLICATE_FIRST_HALF 0x44 + +// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th +// at zero to preserve the sum. +static INLINE void filter_4x2_sse4_1(uint8_t *dst, const ptrdiff_t stride, + const __m128i *pixels, + const __m128i *taps_0_1, + const __m128i *taps_2_3, + const __m128i *taps_4_5, + const __m128i *taps_6_7) { + const __m128i mul_0_01 = _mm_maddubs_epi16(*pixels, *taps_0_1); + const __m128i mul_0_23 = _mm_maddubs_epi16(*pixels, *taps_2_3); + // |output_half| contains 8 partial sums. + __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23); + __m128i output = _mm_hadd_epi16(output_half, output_half); + const __m128i output_row0 = + _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4), + /* arbitrary pack arg */ output); + xx_storel_32(dst, output_row0); + const __m128i mul_1_01 = _mm_maddubs_epi16(*pixels, *taps_4_5); + const __m128i mul_1_23 = _mm_maddubs_epi16(*pixels, *taps_6_7); + output_half = _mm_hadd_epi16(mul_1_01, mul_1_23); + output = _mm_hadd_epi16(output_half, output_half); + const __m128i output_row1 = + _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4), + /* arbitrary pack arg */ output); + xx_storel_32(dst + stride, output_row1); +} + +// 4xH transform sizes are given special treatment because xx_loadl_64 goes out +// of bounds and every block involves the left column.
This implementation +// loads TL from the top row for the first block, so it is not read from the left column until the second 4x2 block. +static INLINE void filter_4xh(uint8_t *dest, ptrdiff_t stride, + const uint8_t *const top_ptr, + const uint8_t *const left_ptr, int mode, + const int height) { + const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]); + const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]); + const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]); + const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]); + __m128i top = xx_loadl_32(top_ptr - 1); + __m128i pixels = _mm_insert_epi8(top, (int8_t)top_ptr[3], 4); + __m128i left = (height == 4 ? xx_loadl_32(left_ptr) : xx_loadl_64(left_ptr)); + left = _mm_slli_si128(left, 5); + + // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1], + // left[2], left[3], left[4], left[5], left[6], left[7] + pixels = _mm_or_si128(left, pixels); + + // Duplicate first 8 bytes. + pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + dest += stride; // Move to y = 1. + pixels = xx_loadl_32(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1], + // left[0], left[1], ... + pixels = _mm_or_si128(left, pixels); + + // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last + // byte is an unused value, which shall be multiplied by 0 when we apply the + // filter. + const int64_t kInsertTopLeftFirstMask = 0x0F08070302010006; + + // Insert left[-1] in front as TL and put left[0] and left[1] at the end. + const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 2. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + dest += stride; // Move to y = 3.
+ + // Compute the middle 8 rows before using common code for the final 4 rows. + // Because the common code below this block assumes that |left| has the next TL at position 8, this block leaves |left| in that layout. + if (height == 16) { + // This shift allows us to use pixel_order2 twice after shifting by 2 later. + left = _mm_slli_si128(left, 1); + pixels = xx_loadl_32(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4], + // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3] + pixels = _mm_or_si128(left, pixels); + + // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The + // last byte is an unused value, as above. The top-left was shifted to + // position nine to keep two empty spaces after the top pixels. + const int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009; + + // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at + // the end. + const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + dest += stride; // Move to y = 4. + + // First 4x2 in the if body. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + + // Clear all but final pixel in the first 8 of left column. + __m128i keep_top_left = _mm_srli_si128(left, 13); + dest += stride; // Move to y = 5. + pixels = xx_loadl_32(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-6], + // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1] + pixels = _mm_or_si128(left, pixels); + left = xx_loadl_64(left_ptr + 8); + + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + dest += stride; // Move to y = 6. + + // Second 4x2 in the if body. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + + // Position TL value so we can use pixel_order1. + keep_top_left = _mm_slli_si128(keep_top_left, 6); + dest += stride; // Move to y = 7.
+ pixels = xx_loadl_32(dest); + left = _mm_slli_si128(left, 7); + left = _mm_or_si128(left, keep_top_left); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, + // left[-1], left[0], left[1], left[2], left[3], ... + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 8. + + // Third 4x2 in the if body. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + dest += stride; // Move to y = 9. + + // Prepare final inputs. + pixels = xx_loadl_32(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] + // left[-1], left[0], left[1], left[2], left[3], ... + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 10. + + // Fourth 4x2 in the if body. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + dest += stride; // Move to y = 11. + } + + // In both the 8 and 16 case, we assume that the left vector has the next TL + // at position 8. + if (height > 4) { + // Erase prior left pixels by shifting TL to position 0. + left = _mm_srli_si128(left, 8); + left = _mm_slli_si128(left, 6); + pixels = xx_loadl_32(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, + // left[-1], left[0], left[1], left[2], left[3], ... + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 12 or 4. + + // First of final two 4x2 blocks. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + dest += stride; // Move to y = 13 or 5. + pixels = xx_loadl_32(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] + // left[-1], left[0], left[1], left[2], left[3], ...
+ pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 14 or 6. + + // Last of final two 4x2 blocks. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + } +} + +static INLINE void filter_intra_predictor_sse4_1(void *const dest, + ptrdiff_t stride, + const void *const top_row, + const void *const left_column, + int mode, const int width, + const int height) { + const uint8_t *const top_ptr = (const uint8_t *)top_row; + const uint8_t *const left_ptr = (const uint8_t *)left_column; + uint8_t *dst = (uint8_t *)dest; + if (width == 4) { + filter_4xh(dst, stride, top_ptr, left_ptr, mode, height); + return; + } + + // There is one set of 7 taps for each of the 4x2 output pixels. + const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]); + const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]); + const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]); + const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]); + + // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at + // the end is an unused value, which shall be multiplied by 0 when we apply + // the filter. + const int64_t kCondenseLeftMask = 0x0F09080403020100; + + // Takes the "left section" and puts it right after p0-p4. + const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask); + + // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last + // byte is unused as above. + const int64_t kInsertTopLeftMask = 0x0F0A090302010008; + + // Shuffles the "top left" from the left section, to the front. Used when + // grabbing data from left_column and not top_row. + const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask); + + // This first pass takes care of the cases where the top left pixel comes from + // top_row.
+ __m128i pixels = xx_loadl_64(top_ptr - 1); + __m128i left = _mm_slli_si128(xx_loadl_32(left_column), 8); + pixels = _mm_or_si128(pixels, left); + + // Two sets of the same pixels to multiply with two sets of taps. + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + left = _mm_srli_si128(left, 1); + + // Load + pixels = xx_loadl_32(dst + stride); + + // Because of the above shift, this OR 'invades' the final of the first 8 + // bytes of |pixels|. This is acceptable because the 8th filter tap is always + // a padded 0. + pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + const ptrdiff_t stride2 = stride << 1; + const ptrdiff_t stride4 = stride << 2; + filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, + &taps_4_5, &taps_6_7); + dst += 4; + for (int x = 3; x < width - 4; x += 4) { + pixels = xx_loadl_32(top_ptr + x); + pixels = _mm_insert_epi8(pixels, (int8_t)top_ptr[x + 4], 4); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); + filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + pixels = xx_loadl_32(dst + stride - 1); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + stride2 - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); + filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, + &taps_4_5, &taps_6_7); + dst += 4; + } + + // Now we handle heights that reference previous blocks rather than top_row. + for (int y = 4; y < height; y += 4) { + // Leftmost 4x4 block for this height. 
+ dst -= width; + dst += stride4; + + // Top Left is not available by offset in these leftmost blocks. + pixels = xx_loadl_32(dst - stride); + left = _mm_slli_si128(xx_loadl_32(left_ptr + y - 1), 8); + left = _mm_insert_epi8(left, (int8_t)left_ptr[y + 3], 12); + pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + + // The bytes shifted into positions 6 and 7 will be ignored by the shuffle. + left = _mm_srli_si128(left, 2); + pixels = xx_loadl_32(dst + stride); + pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, + &taps_4_5, &taps_6_7); + + dst += 4; + + // Remaining 4x4 blocks for this height. + for (int x = 4; x < width; x += 4) { + pixels = xx_loadl_32(dst - stride - 1); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[-stride + 3], 4); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); + filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + pixels = xx_loadl_32(dst + stride - 1); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 + stride - 1], 6); + + // Duplicate bottom half into upper half. 
+ pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); + filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, + &taps_4_5, &taps_6_7); + dst += 4; + } + } +} + +void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + filter_intra_predictor_sse4_1(dst, stride, above, left, mode, bw, bh); +} diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c new file mode 100644 index 0000000000..d65318ccfa --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+// NOTE(review): the two #include directives below carry no header name in
+// this copy of the patch (the <...> part appears to have been stripped).
+// The code uses AVX2 intrinsics and assert(), so presumably <immintrin.h>
+// and <assert.h> - confirm against the upstream file.
+#include
+#include
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+// Prototype of the SSSE3 implementation, used below as the fallback for
+// 12-tap horizontal filters.
+void av1_highbd_convolve_2d_sr_ssse3(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd);
+
+// 2-D separable sub-pixel convolution on 16-bit pixels, writing the clamped
+// result to |dst|.
+//
+// A 12-tap horizontal filter is handed off unchanged to
+// av1_highbd_convolve_2d_sr_ssse3(). Otherwise the block is processed in
+// strips of 8 columns: a horizontal pass writes im_h = h + taps_y - 1
+// intermediate rows (8 int16 values each) into |im_block|, then a vertical
+// pass turns them into output rows, two rows per iteration.
+//
+// prepare_coeffs() / convolve() are given four coefficient registers per
+// direction (presumably four tap pairs, i.e. 8 taps - confirm in
+// convolve_avx2.h).
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    const InterpFilterParams *filter_params_x,
+                                    const InterpFilterParams *filter_params_y,
+                                    const int subpel_x_qn,
+                                    const int subpel_y_qn,
+                                    ConvolveParams *conv_params, int bd) {
+  if (filter_params_x->taps == 12) {
+    av1_highbd_convolve_2d_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+                                    filter_params_x, filter_params_y,
+                                    subpel_x_qn, subpel_y_qn, conv_params, bd);
+    return;
+  }
+
+  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = 8;
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  // Back up so that the filter windows are centered on the output pixel.
+  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  __m256i s[8], coeffs_y[4], coeffs_x[4];
+
+  // Horizontal rounding: half of the shift step plus an offset of
+  // 1 << (bd + FILTER_BITS - 1).
+  const __m256i round_const_x = _mm256_set1_epi32(
+      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+  // Vertical rounding: half of the shift step minus
+  // 1 << (bd + 2 * FILTER_BITS - round_0 - 1). Presumably this cancels the
+  // horizontal offset after the vertical filter has scaled it - confirm.
+  const __m256i round_const_y = _mm256_set1_epi32(
+      ((1 << conv_params->round_1) >> 1) -
+      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+  // Shift still owed after the two rounding stages above.
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+  // Largest pixel value: 1023 for bd == 10, 4095 for bd == 12, else 255.
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m256i zero = _mm256_setzero_si256();
+
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    {
+      for (i = 0; i < im_h; i += 2) {
+        const __m256i row0 =
+            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+        // On the last iteration of an odd im_h there is no row i + 1; zeros
+        // are used instead of loading it.
+        __m256i row1 = _mm256_setzero_si256();
+        if (i + 1 < im_h)
+          row1 =
+              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+        // r0 = low 128 bits of both rows, r1 = high 128 bits of both rows,
+        // so each 128-bit lane below works on one source row.
+        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+        // even pixels
+        s[0] = _mm256_alignr_epi8(r1, r0, 0);
+        s[1] = _mm256_alignr_epi8(r1, r0, 4);
+        s[2] = _mm256_alignr_epi8(r1, r0, 8);
+        s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+        __m256i res_even = convolve(s, coeffs_x);
+        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+                                    round_shift_x);
+
+        // odd pixels
+        s[0] = _mm256_alignr_epi8(r1, r0, 2);
+        s[1] = _mm256_alignr_epi8(r1, r0, 6);
+        s[2] = _mm256_alignr_epi8(r1, r0, 10);
+        s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+        __m256i res_odd = convolve(s, coeffs_x);
+        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+                                   round_shift_x);
+
+        // Narrow to 16 bits and interleave even/odd results back into pixel
+        // order within each lane.
+        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+
+        // One 256-bit store writes two consecutive 8-wide intermediate rows
+        // (row i from the low lane, row i + 1 from the high lane).
+        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+      }
+    }
+
+    /* Vertical filter */
+    {
+      // Each 256-bit load picks up two adjacent intermediate rows, so the
+      // low lane works on output row i and the high lane on output row i + 1.
+      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+      // s[0..3] hold the row pairs for columns 0-3, s[4..7] for columns 4-7.
+      s[0] = _mm256_unpacklo_epi16(s0, s1);
+      s[1] = _mm256_unpacklo_epi16(s2, s3);
+      s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm256_unpackhi_epi16(s0, s1);
+      s[5] = _mm256_unpackhi_epi16(s2, s3);
+      s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+      for (i = 0; i < h; i += 2) {
+        const int16_t *data = &im_block[i * im_stride];
+
+        const __m256i s6 =
+            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+        const __m256i s7 =
+            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+        s[3] = _mm256_unpacklo_epi16(s6, s7);
+        s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+        const __m256i res_a = convolve(s, coeffs_y);
+        __m256i res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+        res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);
+
+        if (w - j > 4) {
+          // More than four columns remain in this strip: also filter
+          // columns 4-7 and store 8 pixels per row.
+          const __m256i res_b = convolve(s + 4, coeffs_y);
+          __m256i res_b_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+          res_b_round =
+              _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
+                               round_shift_bits);
+
+          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+          res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                           _mm256_castsi256_si128(res_16bit));
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           _mm256_extracti128_si256(res_16bit, 1));
+        } else if (w == 4) {
+          // Four-pixel-wide block: store 64 bits (4 pixels) per row.
+          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+          res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+                           _mm256_castsi256_si128(res_a_round));
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           _mm256_extracti128_si256(res_a_round, 1));
+        } else {
+          // Narrower remainder: xx_storel_32 presumably writes 32 bits
+          // (2 pixels) per row - confirm in synonyms.h.
+          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+          res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+          xx_storel_32(&dst[i * dst_stride + j],
+                       _mm256_castsi256_si128(res_a_round));
+          xx_storel_32(&dst[i * dst_stride + j + dst_stride],
+                       _mm256_extracti128_si256(res_a_round, 1));
+        }
+
+        // Slide the row-pair window down by two intermediate rows.
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
new file mode 100644
index 0000000000..89d7199f48
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// NOTE(review): the three #include directives below carry no header name in
+// this copy of the patch (the <...> part appears to have been stripped).
+// The code uses SSSE3 and SSE4.1 intrinsics and assert(), so presumably
+// <tmmintrin.h>, <smmintrin.h> and <assert.h> - confirm against the upstream
+// file.
+#include
+#include
+#include
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "av1/common/convolve.h"
+
+// Unfiltered ("copy") path of the 16-bit distance-weighted compound convolve.
+// Each source pixel is left-shifted by
+// bits = 2 * FILTER_BITS - round_0 - round_1 and biased by |offset|.
+//  - do_average == 0: the biased value is stored to conv_params->dst.
+//  - do_average != 0: it is combined with the value already in
+//    conv_params->dst by highbd_comp_avg_sse4_1(), passed through
+//    highbd_convolve_rounding_sse2(), limited to the bit-depth maximum and
+//    stored to |dst0|.
+// Widths that are a multiple of 8 are handled 8 pixels at a time; other
+// multiples of 4 are handled as 4x2 tiles. No other width produces output.
+//
+// NOTE(review): the 8-wide path stores with _mm_store_si128, an aligned
+// store; this assumes both destination rows are 16-byte aligned - confirm
+// that callers guarantee it.
+void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(const uint16_t *src,
+                                                 int src_stride, uint16_t *dst0,
+                                                 int dst_stride0, int w, int h,
+                                                 ConvolveParams *conv_params,
+                                                 int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+  const __m128i zero = _mm_setzero_si128();
+  int i, j;
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi32(offset);
+  const __m128i offset_const_16b = _mm_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+  // Largest pixel value: 1023 for bd == 10, 4095 for bd == 12, else 255.
+  const __m128i clip_pixel_to_bd =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+  assert(bits <= 4);
+
+  if (!(w % 8)) {
+    // One row at a time, 8 pixels per iteration.
+    for (i = 0; i < h; i += 1) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i src_16bit =
+            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+        const __m128i res = _mm_sll_epi16(src_16bit, left_shift);
+        if (do_average) {
+          const __m128i data_0 =
+              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+          // Zero-extend the stored values and the new result to 32 bits.
+          const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+          const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+          const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero);
+          const __m128i res_unsigned_lo =
+              _mm_add_epi32(res_32b_lo, offset_const);
+
+          const __m128i comp_avg_res_lo =
+              highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+                                     &wt1, use_dist_wtd_comp_avg);
+
+          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+          const __m128i res_unsigned_hi =
+              _mm_add_epi32(res_32b_hi, offset_const);
+
+          const __m128i comp_avg_res_hi =
+              highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+                                     &wt1, use_dist_wtd_comp_avg);
+
+          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_16b =
+              _mm_packus_epi32(round_result_lo, round_result_hi);
+          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+        } else {
+          // First prediction: keep the biased value (saturating unsigned
+          // add) in the compound buffer.
+          const __m128i res_unsigned_16b =
+              _mm_adds_epu16(res, offset_const_16b);
+
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]),
+                          res_unsigned_16b);
+        }
+      }
+    }
+  } else if (!(w % 4)) {
+    // Two rows at a time, 4 pixels per row: row i sits in the low 64 bits
+    // and row i + 1 in the high 64 bits of each register.
+    for (i = 0; i < h; i += 2) {
+      for (j = 0; j < w; j += 4) {
+        const __m128i src_row_0 =
+            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
+        const __m128i src_row_1 =
+            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
+        const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1);
+
+        const __m128i res = _mm_sll_epi16(src_10, left_shift);
+
+        if (do_average) {
+          const __m128i data_0 =
+              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+          const __m128i data_1 = _mm_loadl_epi64(
+              (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+          const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+          const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+          const __m128i res_32b = _mm_unpacklo_epi16(res, zero);
+          const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const);
+
+          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+          const __m128i res_unsigned_hi =
+              _mm_add_epi32(res_32b_hi, offset_const);
+
+          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
+          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+              &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg);
+
+          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_16b =
+              _mm_packus_epi32(round_result_lo, round_result_hi);
+          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+          // Move row i + 1 down into the low 64 bits for its own store.
+          const __m128i res_1 = _mm_srli_si128(res_clip, 8);
+
+          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+          _mm_storel_epi64(
+              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+        } else {
+          const __m128i res_unsigned_16b =
+              _mm_adds_epu16(res, offset_const_16b);
+
+          const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8);
+
+          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]),
+                           res_unsigned_16b);
+          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                           res_1);
+        }
+      }
+    }
+  }
+}
+
+// 16-bit distance-weighted compound 2-D convolve.
+//
+// A horizontal pass filters im_h = h + taps_y - 1 source rows into |im_block|
+// (row stride MAX_SB_SIZE); a vertical pass then filters those rows into
+// output pixels. Each pass loads exactly one 128-bit register of
+// coefficients, i.e. eight int16 taps.
+//  - do_average == 0: the result, with |offset| added, is stored to
+//    conv_params->dst.
+//  - do_average != 0: it is combined with the value already in
+//    conv_params->dst by highbd_comp_avg_sse4_1(), passed through
+//    highbd_convolve_rounding_sse2(), limited to the bit-depth maximum and
+//    stored to |dst0|.
+void av1_highbd_dist_wtd_convolve_2d_sse4_1(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = MAX_SB_SIZE;
+  int i, j;
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  // Back up so that the filter windows are centered on the output pixel.
+  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi32(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+  // Largest pixel value: 1023 for bd == 10, 4095 for bd == 12, else 255.
+  const __m128i clip_pixel_to_bd =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  /* Horizontal filter */
+  {
+    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+        filter_params_x, subpel_x_qn & SUBPEL_MASK);
+    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    // Half of the shift step plus an offset of 1 << (bd + FILTER_BITS - 1).
+    const __m128i round_const = _mm_set1_epi32(
+        ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+    for (i = 0; i < im_h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        // |data2| holds the next 8 pixels to the right; _mm_alignr_epi8
+        // combines it with |data| to form the shifted filter windows.
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        const __m128i data2 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+        // Filter even-index pixels
+        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+        const __m128i res_2 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+        const __m128i res_4 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+        const __m128i res_6 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                         _mm_add_epi32(res_2, res_6));
+        res_even =
+            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+        // Filter odd-index pixels
+        const __m128i res_1 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+        const __m128i res_3 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+        const __m128i res_5 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+        const __m128i res_7 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                        _mm_add_epi32(res_3, res_7));
+        res_odd =
+            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+        __m128i res = _mm_packs_epi32(res_even, res_odd);
+        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+      }
+    }
+  }
+
+  /* Vertical filter */
+  {
+    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+        filter_params_y, subpel_y_qn & SUBPEL_MASK);
+    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    // Half of the shift step minus 1 << (bd + 2 * FILTER_BITS - round_0 - 1).
+    // Presumably this cancels the horizontal offset after the vertical
+    // filter has scaled it - confirm.
+    const __m128i round_const = _mm_set1_epi32(
+        ((1 << conv_params->round_1) >> 1) -
+        (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        // Filter even-index pixels
+        const int16_t *data = &im_block[i * im_stride + j];
+        const __m128i src_0 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+                               *(__m128i *)(data + 1 * im_stride));
+        const __m128i src_2 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+                               *(__m128i *)(data + 3 * im_stride));
+        const __m128i src_4 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+                               *(__m128i *)(data + 5 * im_stride));
+        const __m128i src_6 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+                               *(__m128i *)(data + 7 * im_stride));
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                               _mm_add_epi32(res_4, res_6));
+
+        // Filter odd-index pixels
+        const __m128i src_1 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+                               *(__m128i *)(data + 1 * im_stride));
+        const __m128i src_3 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+                               *(__m128i *)(data + 3 * im_stride));
+        const __m128i src_5 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+                               *(__m128i *)(data + 5 * im_stride));
+        const __m128i src_7 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+                               *(__m128i *)(data + 7 * im_stride));
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                              _mm_add_epi32(res_5, res_7));
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        const __m128i res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+
+        const __m128i res_unsigned_lo =
+            _mm_add_epi32(res_lo_round, offset_const);
+
+        if (w < 8) {
+          // Narrow block: only the low four results are used and 64 bits
+          // (4 pixels) are stored.
+          if (do_average) {
+            const __m128i data_0 =
+                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+
+            const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);
+
+            const __m128i comp_avg_res =
+                highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0,
+                                       &wt1, use_dist_wtd_comp_avg);
+
+            const __m128i round_result = highbd_convolve_rounding_sse2(
+                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+            const __m128i res_16b =
+                _mm_packus_epi32(round_result, round_result);
+            const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+          } else {
+            const __m128i res_16b =
+                _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+          }
+        } else {
+          // Full 8-pixel group: round and bias the high four results too.
+          const __m128i res_hi_round =
+              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+          const __m128i res_unsigned_hi =
+              _mm_add_epi32(res_hi_round, offset_const);
+
+          if (do_average) {
+            const __m128i data_lo =
+                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+            const __m128i data_hi =
+                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4]));
+
+            const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
+            const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);
+
+            const __m128i comp_avg_res_lo =
+                highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+                                       &wt1, use_dist_wtd_comp_avg);
+            const __m128i comp_avg_res_hi =
+                highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+                                       &wt1, use_dist_wtd_comp_avg);
+
+            const __m128i round_result_lo =
+                highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
+                                              &rounding_const, rounding_shift);
+            const __m128i round_result_hi =
+                highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const,
+                                              &rounding_const, rounding_shift);
+
+            const __m128i res_16b =
+                _mm_packus_epi32(round_result_lo, round_result_hi);
+            const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+          } else {
+            const __m128i res_16b =
+                _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
new file mode 100644
index 0000000000..88974ba260
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "av1/common/convolve.h" +#include "aom_dsp/x86/convolve_common_intrin.h" + +void av1_highbd_convolve_2d_sr_ssse3( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + const __m128i round_const_x = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m128i round_const_y = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + if (filter_params_x->taps == 12) { + __m128i coeffs_x[6], coeffs_y[6], s[24]; + prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + const __m128i row02 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + s[4] = _mm_alignr_epi8(row02, row01, 0); + s[5] = _mm_alignr_epi8(row02, row01, 4); + + __m128i res_even = convolve_12tap(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + s[4] = _mm_alignr_epi8(row02, row01, 2); + s[5] = _mm_alignr_epi8(row02, row01, 6); + + __m128i res_odd = convolve_12tap(s, coeffs_x); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + _mm_store_si128((__m128i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 
3 * im_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride)); + __m128i s7 = _mm_loadu_si128((__m128i *)(im_block + 7 * im_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(im_block + 8 * im_stride)); + __m128i s9 = _mm_loadu_si128((__m128i *)(im_block + 9 * im_stride)); + __m128i s10 = _mm_loadu_si128((__m128i *)(im_block + 10 * im_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + s[3] = _mm_unpacklo_epi16(s6, s7); + s[4] = _mm_unpacklo_epi16(s8, s9); + + s[6] = _mm_unpackhi_epi16(s0, s1); + s[7] = _mm_unpackhi_epi16(s2, s3); + s[8] = _mm_unpackhi_epi16(s4, s5); + s[9] = _mm_unpackhi_epi16(s6, s7); + s[10] = _mm_unpackhi_epi16(s8, s9); + + s[12] = _mm_unpacklo_epi16(s1, s2); + s[13] = _mm_unpacklo_epi16(s3, s4); + s[14] = _mm_unpacklo_epi16(s5, s6); + s[15] = _mm_unpacklo_epi16(s7, s8); + s[16] = _mm_unpacklo_epi16(s9, s10); + + s[18] = _mm_unpackhi_epi16(s1, s2); + s[19] = _mm_unpackhi_epi16(s3, s4); + s[20] = _mm_unpackhi_epi16(s5, s6); + s[21] = _mm_unpackhi_epi16(s7, s8); + s[22] = _mm_unpackhi_epi16(s9, s10); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * im_stride)); + __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * im_stride)); + + s[5] = _mm_unpacklo_epi16(s10, s11); + s[11] = _mm_unpackhi_epi16(s10, s11); + + s[17] = _mm_unpacklo_epi16(s11, s12); + s[23] = _mm_unpackhi_epi16(s11, s12); + + const __m128i res_a0 = convolve_12tap(s, coeffs_y); + __m128i res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a0, round_const_y), round_shift_y); + res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y); + __m128i res_a_round1 
= _mm_sra_epi32( + _mm_add_epi32(res_a1, round_const_y), round_shift_y); + res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_y), round_shift_y); + res_b_round0 = + _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits), + round_shift_bits); + + const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_y), round_shift_y); + res_b_round1 = + _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits), + round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = 
_mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((int *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((int *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + s[3] = s[4]; + s[4] = s[5]; + + s[6] = s[7]; + s[7] = s[8]; + s[8] = s[9]; + s[9] = s[10]; + s[10] = s[11]; + + s[12] = s[13]; + s[13] = s[14]; + s[14] = s[15]; + s[15] = s[16]; + s[16] = s[17]; + + s[18] = s[19]; + s[19] = s[20]; + s[20] = s[21]; + s[21] = s[22]; + s[22] = s[23]; + + s10 = s12; + } + } + } + } else { + __m128i coeffs_x[4], coeffs_y[4], s[16]; + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + _mm_store_si128((__m128i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m128i s0 = 
_mm_loadu_si128((__m128i *)(im_block + 0 * im_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a0, round_const_y), round_shift_y); + res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a1, round_const_y), round_shift_y); + res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_y), 
round_shift_y); + res_b_round0 = + _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits), + round_shift_bits); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_y), round_shift_y); + res_b_round1 = + _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits), + round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((int *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((int *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = 
s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c new file mode 100644 index 0000000000..cbfe5614c3 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c @@ -0,0 +1,4239 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +// Note: +// Total 32x4 registers to represent 32x32 block coefficients. +// For high bit depth, each coefficient is 4-byte. +// Each __m256i register holds 8 coefficients. +// So each "row" we needs 4 register. Totally 32 rows +// Register layout: +// v0, v1, v2, v3, +// v4, v5, v6, v7, +// ... ... 
+// v124, v125, v126, v127 + +static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); + __m256i clamped, mask; + + mask = _mm256_cmpgt_epi16(u, max); + clamped = _mm256_andnot_si256(mask, u); + mask = _mm256_and_si256(mask, max); + clamped = _mm256_or_si256(mask, clamped); + mask = _mm256_cmpgt_epi16(clamped, zero); + clamped = _mm256_and_si256(clamped, mask); + + return clamped; +} + +static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) { + if (shift != 0) { + __m256i rnding = _mm256_set1_epi32(1 << (shift - 1)); + in[0] = _mm256_add_epi32(in[0], rnding); + in[1] = _mm256_add_epi32(in[1], rnding); + in[2] = _mm256_add_epi32(in[2], rnding); + in[3] = _mm256_add_epi32(in[3], rnding); + + in[0] = _mm256_srai_epi32(in[0], shift); + in[1] = _mm256_srai_epi32(in[1], shift); + in[2] = _mm256_srai_epi32(in[2], shift); + in[3] = _mm256_srai_epi32(in[3], shift); + } +} + +static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) { + round_shift_4x4_avx2(in, shift); + round_shift_4x4_avx2(in + 4, shift); + round_shift_4x4_avx2(in + 8, shift); + round_shift_4x4_avx2(in + 12, shift); +} + +static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out, + const __m256i *clamp_lo, + const __m256i *clamp_hi, int size) { + __m256i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm256_max_epi32(in[i], *clamp_lo); + out[i] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm256_min_epi32(a1, *clamp_hi); + + a0 = _mm256_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm256_min_epi32(a1, *clamp_hi); + } +} + +static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred, + __m256i res0, __m256i res1, + const int bd) { + __m256i x0 = 
_mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred)); + __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1)); + + x0 = _mm256_add_epi32(res0, x0); + x1 = _mm256_add_epi32(res1, x1); + x0 = _mm256_packus_epi32(x0, x1); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = highbd_clamp_epi16_avx2(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride)); + __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd); + + _mm256_storeu_si256((__m256i *)(output + i * stride), u); + } +} +static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res, + const int bd) { + __m256i x0 = pred; + x0 = _mm256_add_epi32(res, x0); + x0 = _mm256_packus_epi32(x0, x0); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = highbd_clamp_epi16_avx2(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + __m128i temp; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m256i v = _mm256_cvtepi16_epi32(temp); + __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd); + __m128i u1 = _mm256_castsi256_si128(u); + _mm_storeu_si128((__m128i *)(output + i * stride), u1); + } +} +static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0, + __m256i *out1, const __m256i *clamp_lo, + const __m256i *clamp_hi, int shift) { + __m256i offset = _mm256_set1_epi32((1 << shift) >> 1); + __m256i a0 = _mm256_add_epi32(offset, in0); + __m256i a1 = _mm256_sub_epi32(offset, in1); + + a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[0], in[1]); + u1 = _mm256_unpackhi_epi32(in[0], in[1]); + + u2 = _mm256_unpacklo_epi32(in[2], in[3]); + u3 = _mm256_unpackhi_epi32(in[2], in[3]); + + u4 = _mm256_unpacklo_epi32(in[4], in[5]); + u5 = _mm256_unpackhi_epi32(in[4], in[5]); + + u6 = _mm256_unpacklo_epi32(in[6], in[7]); + u7 = _mm256_unpackhi_epi32(in[6], in[7]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = 
_mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); +} + +static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[7], in[6]); + u1 = _mm256_unpackhi_epi32(in[7], in[6]); + + u2 = _mm256_unpacklo_epi32(in[5], in[4]); + u3 = _mm256_unpackhi_epi32(in[5], in[4]); + + u4 = _mm256_unpacklo_epi32(in[3], in[2]); + u5 = _mm256_unpackhi_epi32(in[3], in[2]); + + u6 = _mm256_unpacklo_epi32(in[1], in[0]); + u7 = _mm256_unpackhi_epi32(in[1], in[0]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); +} + +static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, + __m256i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride)); + } +} + +static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *rounding, int bit) { + __m256i x; + x = _mm256_mullo_epi32(*w0, *n0); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} + +static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *w1, const __m256i *n1, + const 
__m256i *rounding, int bit) { + __m256i x, y; + + x = _mm256_mullo_epi32(*w0, *n0); + y = _mm256_mullo_epi32(*w1, *n1); + x = _mm256_add_epi32(x, y); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} + +static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0, + __m256i *out1, const __m256i *clamp_lo, + const __m256i *clamp_hi) { + __m256i a0 = _mm256_add_epi32(in0, in1); + __m256i a1 = _mm256_sub_epi32(in0, in1); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static INLINE void idct32_stage4_avx2( + __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56, + const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40, + const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_avx2( + __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48, + const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, + const __m256i *clamp_hi, const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = 
half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_avx2( + __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, + const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, + const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = 
half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[10] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i 
*cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out, + const int do_cols, const int bd, + const int out_shift, + const __m256i *clamp_lo, + const __m256i *clamp_hi) { + addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, 
clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i x; + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + x = _mm256_mullo_epi32(in[0], cospi32); + x = _mm256_add_epi32(x, rounding); + x = _mm256_srai_epi32(x, bit); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + x = _mm256_add_epi32(offset, x); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; + out[8] = x; + out[9] = x; + out[10] = x; + out[11] = x; + out[12] = x; + out[13] = x; + out[14] = x; + out[15] = x; + out[16] = x; + out[17] = x; + out[18] = x; + out[19] = x; + out[20] = x; + out[21] = x; + out[22] = x; + out[23] = x; + out[24] = x; + out[25] = x; + out[26] = x; + out[27] = x; + out[28] = x; + out[29] = x; + out[30] = x; + out[31] = x; +} + +static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi12 = 
_mm256_set1_epi32(cospi[12]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; + + 
idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +/* 32-point inverse DCT variant that reads only in[0]..in[15]; the remaining 16 inputs are never touched (presumably known to be zero - confirm at the call site). bit selects the cosine table via cospi_arr() and is passed, together with the rounding constant 1 << (bit - 1), to the butterfly helpers. clamp_lo and clamp_hi bound a signed range of max(16, bd plus (do_cols ? 6 : 8)) bits and are handed to addsub_avx2 and the stage helpers. do_cols, bd and out_shift are also forwarded to idct32_stage9_avx2(), which receives out. */ static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const
__m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + /* width in bits of the signed clamp range for intermediate values: never below 16, otherwise bd plus 6 when do_cols is set and bd plus 8 when it is not */ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + /* only the 16 even-indexed bf1 slots are loaded; each odd-indexed slot is written by a later stage before that stage reads it */ bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + /* in every pair below the odd slot is produced first, because the second call overwrites the slot both calls read from */ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54,
&bf1[20], &rounding, bit); + bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit); + bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + + /* NOTE(review): the output pointers alias the two operands in these calls; this presumably relies on addsub_avx2 receiving the operands by value - confirm against its definition */ addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit); + + addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[12], bf1[13],
bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + /* bf1[1] was never loaded from in (in[16] is skipped), so it is simply a copy of the cospi32-scaled bf1[0] */ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit); + + addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +/* Full 32-point inverse DCT: reads in[0]..in[31], runs stages 1 to 9 alternating between the scratch arrays bf1 and bf0, and writes out[0]..out[31]. bit selects the cosine table via cospi_arr() and is the shift passed to half_btf_avx2 together with the rounding constant 1 << (bit - 1). clamp_lo and clamp_hi, handed to every addsub_avx2 call, bound a signed range of max(16, bd plus (do_cols ? 6 : 8)) bits. When do_cols is 0 the outputs additionally go through round_shift_8x8_avx2 with out_shift and highbd_clamp_epi32_avx2 with bounds for max(16, bd plus 6) bits. */ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi26 =
_mm256_set1_epi32(cospi[26]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = 
_mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32], bf0[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; + + // stage 2 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = bf1[4]; + bf0[5] = bf1[5]; + bf0[6] = bf1[6]; + bf0[7] = bf1[7]; + bf0[8] = bf1[8]; + bf0[9] = bf1[9]; + bf0[10] = bf1[10]; + bf0[11] = bf1[11]; + bf0[12] = bf1[12]; + bf0[13] = bf1[13]; + bf0[14] = bf1[14]; + bf0[15] = bf1[15]; + bf0[16] = + half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); + bf0[17] = + half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_avx2(&cospi54, &bf1[20], &cospim10, 
&bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); + + // stage 3 + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = + half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); + + addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[20], bf0[21], bf1 + 20, 
bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = + half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); + bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); + + addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = + half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = + half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = + half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = + half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); + bf0[31] = bf1[31]; + + // stage 5 + 
bf1[0] = + half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); + bf1[1] = + half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); + bf1[2] = + half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); + bf1[3] = + half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); + addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = + half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = + half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); + bf1[15] = bf0[15]; + addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); + + // stage 6 + addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); + bf0[4] = bf1[4]; + bf0[5] = + half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[7] = bf1[7]; + addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); 
+ addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = + half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); + bf0[22] = bf1[22]; + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = bf1[25]; + bf0[26] = + half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 7 + addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = + half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 
20, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); + + // stage 8 + addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = bf1[18]; + bf0[19] = bf1[19]; + bf0[20] = + half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[28] = bf1[28]; + bf0[29] = bf1[29]; + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 9 + addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, 
&clamp_hi); + addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + /* do_cols == 0 only: apply the out_shift rounding shift to both halves of out, then clamp all 32 outputs to a signed range of max(16, bd plus 6) bits */ if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } + } +} +/* 16-point inverse DCT variant that reads only in[0]. The single input vector is multiplied by cospi[32], rounded with 1 << (bit - 1) and arithmetically shifted right by bit. When do_cols is 0 it is additionally rounded and shifted right by out_shift and the clamp bounds switch to max(16, bd plus 6) bits; otherwise the bounds are max(16, bd plus 6) bits as well, since do_cols selects the 6-bit headroom. The clamped vector is stored to out[0]..out[15]. NOTE: in[0] is overwritten with the result. */ static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ?
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + { + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + /* the only arithmetic in this variant: scale in[0] by cospi32, add the rounding constant, shift right by bit (all done in place in in[0]) */ in[0] = _mm256_mullo_epi32(in[0], cospi32); + in[0] = _mm256_add_epi32(in[0], rnding); + in[0] = _mm256_srai_epi32(in[0], bit); + + // stage 5 + // stage 6 + // stage 7 + /* do_cols == 0: replace the clamp bounds with the max(16, bd plus 6)-bit output range and apply a rounding arithmetic right shift by out_shift (offset is half of 1 << out_shift) */ if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + in[0] = _mm256_add_epi32(in[0], offset); + in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); + } + in[0] = _mm256_max_epi32(in[0], clamp_lo); + in[0] = _mm256_min_epi32(in[0], clamp_hi); + /* the same clamped vector is written to all 16 outputs */ out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; + } +} + +/* 16-point inverse DCT variant that reads only in[0]..in[7]; the upper eight inputs are never touched (presumably known to be zero - confirm at the call site). It works in place in the scratch array u and writes out[0]..out[15]. bit selects the cosine table via cospi_arr() and is the butterfly shift, used with the rounding constant 1 << (bit - 1). clamp_lo and clamp_hi bound a signed range of max(16, bd plus (do_cols ? 6 : 8)) bits. When do_cols is 0 the outputs also go through round_shift_8x8_avx2 with out_shift and are clamped to max(16, bd plus 6) bits. */ static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 =
_mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + + u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); + + u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); + + u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + + // stage 3 + u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit); + + addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm256_mullo_epi32(u[0], cospi32); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + u[1] = u[0]; + + u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); + + addsub_avx2(u[4], 
u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + + x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; + + // stage 5 + addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = _mm256_mullo_epi32(u[5], cospi32); + y = _mm256_mullo_epi32(u[6], cospi32); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[13], cospi32); + u[10] = _mm256_sub_epi32(y, x); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[13] = _mm256_add_epi32(x, y); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[11], cospi32); + y = _mm256_mullo_epi32(u[12], cospi32); + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + u[12] = _mm256_add_epi32(x, y); + u[12] = 
_mm256_add_epi32(u[12], rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + // stage 7 + addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } + } +} + +static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i 
cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = 
half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); + addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm256_mullo_epi32(u[0], cospi32); + y = _mm256_mullo_epi32(u[1], cospi32); + v[0] = _mm256_add_epi32(x, y); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_sub_epi32(x, y); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); + addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + v[15] = u[15]; + + // stage 5 + addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = _mm256_mullo_epi32(v[5], cospi32); + y = _mm256_mullo_epi32(v[6], cospi32); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = v[7]; + addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], 
v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[13], cospi32); + v[10] = _mm256_sub_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_add_epi32(x, y); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + x = _mm256_mullo_epi32(u[11], cospi32); + y = _mm256_mullo_epi32(u[12], cospi32); + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_add_epi32(x, y); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 
16); + } + } +} + +static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + __m256i v[16], x, y, temp1, temp2; + + // Calculate the column 0, 1, 2, 3 + { + // stage 0 + // stage 1 + // stage 2 + x = _mm256_mullo_epi32(in[0], cospi62); + v[0] = _mm256_add_epi32(x, rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi2); + v[1] = _mm256_sub_epi32(zero, x); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + // stage 3 + v[8] = v[0]; + v[9] = v[1]; + + // stage 4 + temp1 = _mm256_mullo_epi32(v[8], cospi8); + x = _mm256_mullo_epi32(v[9], cospi56); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[8], cospi56); + x = _mm256_mullo_epi32(v[9], cospi8); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 = _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + temp1 = _mm256_mullo_epi32(v[4], cospi16); + x = _mm256_mullo_epi32(v[5], cospi48); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[4], cospi48); + x = _mm256_mullo_epi32(v[5], cospi16); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 
= _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm256_mullo_epi32(v[12], cospi16); + x = _mm256_mullo_epi32(v[13], cospi48); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[12], cospi48); + x = _mm256_mullo_epi32(v[13], cospi16); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 = _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; + + // stage 7 + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; + + // stage 8 + y = _mm256_mullo_epi32(v[2], cospi32); + x = _mm256_mullo_epi32(v[3], cospi32); + v[2] = _mm256_add_epi32(y, x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(y, x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + y = _mm256_mullo_epi32(v[6], cospi32); + x = _mm256_mullo_epi32(v[7], cospi32); + v[6] = _mm256_add_epi32(y, x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(y, x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + y = _mm256_mullo_epi32(v[10], cospi32); + x = _mm256_mullo_epi32(v[11], cospi32); + v[10] = _mm256_add_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + y = _mm256_mullo_epi32(v[14], cospi32); + x = _mm256_mullo_epi32(v[15], cospi32); + v[14] = _mm256_add_epi32(y, x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(y, x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // 
stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]); + out[2] = v[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]); + out[4] = v[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]); + out[6] = v[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]); + out[8] = v[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]); + out[10] = v[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]); + out[12] = v[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]); + out[14] = v[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + 
const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], x, y; + + { + // stage 0 + // stage 1 + // stage 2 + __m256i zero = _mm256_setzero_si256(); + x = _mm256_mullo_epi32(in[0], cospi62); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi2); + u[1] = _mm256_sub_epi32(zero, x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + x = _mm256_mullo_epi32(in[2], cospi54); + u[2] = _mm256_add_epi32(x, rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + x = _mm256_mullo_epi32(in[2], cospi10); + u[3] = _mm256_sub_epi32(zero, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + x = _mm256_mullo_epi32(in[4], cospi46); + u[4] = _mm256_add_epi32(x, rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + x = _mm256_mullo_epi32(in[4], cospi18); + u[5] = _mm256_sub_epi32(zero, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + x = _mm256_mullo_epi32(in[6], cospi38); + u[6] = _mm256_add_epi32(x, rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + x = _mm256_mullo_epi32(in[6], cospi26); + u[7] = _mm256_sub_epi32(zero, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + u[8] = _mm256_mullo_epi32(in[7], cospi34); + u[8] = _mm256_add_epi32(u[8], rnding); + u[8] = _mm256_srai_epi32(u[8], bit); + + u[9] = _mm256_mullo_epi32(in[7], cospi30); + u[9] = _mm256_add_epi32(u[9], rnding); + u[9] = _mm256_srai_epi32(u[9], bit); + + u[10] = _mm256_mullo_epi32(in[5], cospi42); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[11] = _mm256_mullo_epi32(in[5], cospi22); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + u[12] = _mm256_mullo_epi32(in[3], cospi50); + u[12] = _mm256_add_epi32(u[12], rnding); + 
u[12] = _mm256_srai_epi32(u[12], bit); + + u[13] = _mm256_mullo_epi32(in[3], cospi14); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + u[14] = _mm256_mullo_epi32(in[1], cospi58); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + u[15] = _mm256_mullo_epi32(in[1], cospi6); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 3 + addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + y = _mm256_mullo_epi32(u[8], cospi56); + x = _mm256_mullo_epi32(u[9], cospi56); + u[8] = _mm256_mullo_epi32(u[8], cospi8); + u[8] = _mm256_add_epi32(u[8], x); + u[8] = _mm256_add_epi32(u[8], rnding); + u[8] = _mm256_srai_epi32(u[8], bit); + + x = _mm256_mullo_epi32(u[9], cospi8); + u[9] = _mm256_sub_epi32(y, x); + u[9] = _mm256_add_epi32(u[9], rnding); + u[9] = _mm256_srai_epi32(u[9], bit); + + x = _mm256_mullo_epi32(u[11], cospi24); + y = _mm256_mullo_epi32(u[10], cospi24); + u[10] = _mm256_mullo_epi32(u[10], cospi40); + u[10] = _mm256_add_epi32(u[10], x); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + x = _mm256_mullo_epi32(u[11], cospi40); + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + x = _mm256_mullo_epi32(u[13], cospi8); + y = _mm256_mullo_epi32(u[12], cospi8); + u[12] = _mm256_mullo_epi32(u[12], cospim56); + u[12] = _mm256_add_epi32(u[12], x); + u[12] = _mm256_add_epi32(u[12], 
rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + + x = _mm256_mullo_epi32(u[13], cospim56); + u[13] = _mm256_sub_epi32(y, x); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[15], cospi40); + y = _mm256_mullo_epi32(u[14], cospi40); + u[14] = _mm256_mullo_epi32(u[14], cospim24); + u[14] = _mm256_add_epi32(u[14], x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + x = _mm256_mullo_epi32(u[15], cospim24); + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 5 + addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + x = _mm256_mullo_epi32(u[5], cospi48); + y = _mm256_mullo_epi32(u[4], cospi48); + u[4] = _mm256_mullo_epi32(u[4], cospi16); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + x = _mm256_mullo_epi32(u[5], cospi16); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + x = _mm256_mullo_epi32(u[7], cospi16); + y = _mm256_mullo_epi32(u[6], cospi16); + u[6] = _mm256_mullo_epi32(u[6], cospim48); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + x = _mm256_mullo_epi32(u[7], cospim48); + u[7] = _mm256_sub_epi32(y, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + x = 
_mm256_mullo_epi32(u[13], cospi48); + y = _mm256_mullo_epi32(u[12], cospi48); + u[12] = _mm256_mullo_epi32(u[12], cospi16); + u[12] = _mm256_add_epi32(u[12], x); + u[12] = _mm256_add_epi32(u[12], rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + + x = _mm256_mullo_epi32(u[13], cospi16); + u[13] = _mm256_sub_epi32(y, x); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[15], cospi16); + y = _mm256_mullo_epi32(u[14], cospi16); + u[14] = _mm256_mullo_epi32(u[14], cospim48); + u[14] = _mm256_add_epi32(u[14], x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + x = _mm256_mullo_epi32(u[15], cospim48); + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 7 + addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = _mm256_mullo_epi32(u[2], cospi32); + x = _mm256_mullo_epi32(u[3], cospi32); + u[2] = _mm256_add_epi32(y, x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(y, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + y = _mm256_mullo_epi32(u[6], cospi32); + x = _mm256_mullo_epi32(u[7], cospi32); + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(y, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + y = 
_mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[11], cospi32); + u[10] = _mm256_add_epi32(y, x); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + y = _mm256_mullo_epi32(u[14], cospi32); + x = _mm256_mullo_epi32(u[15], cospi32); + u[14] = _mm256_add_epi32(y, x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]); + out[2] = u[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]); + out[4] = u[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]); + out[6] = u[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]); + out[8] = u[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]); + out[10] = u[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]); + out[12] = u[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]); + out[14] = u[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + 
neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols 
? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm256_mullo_epi32(in[15], cospi2); + x = _mm256_mullo_epi32(in[0], cospi62); + v[0] = _mm256_add_epi32(v[0], x); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_mullo_epi32(in[15], cospi62); + x = _mm256_mullo_epi32(in[0], cospi2); + v[1] = _mm256_sub_epi32(v[1], x); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = _mm256_mullo_epi32(in[13], cospi10); + x = _mm256_mullo_epi32(in[2], cospi54); + v[2] = _mm256_add_epi32(v[2], x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_mullo_epi32(in[13], cospi54); + x = _mm256_mullo_epi32(in[2], cospi10); + v[3] = _mm256_sub_epi32(v[3], x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = _mm256_mullo_epi32(in[11], cospi18); + x = _mm256_mullo_epi32(in[4], cospi46); + v[4] = _mm256_add_epi32(v[4], x); + v[4] = _mm256_add_epi32(v[4], rnding); + v[4] = _mm256_srai_epi32(v[4], bit); + + v[5] = _mm256_mullo_epi32(in[11], cospi46); + x = _mm256_mullo_epi32(in[4], cospi18); + v[5] = _mm256_sub_epi32(v[5], x); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + v[6] = _mm256_mullo_epi32(in[9], cospi26); + x = _mm256_mullo_epi32(in[6], cospi38); + v[6] = _mm256_add_epi32(v[6], x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_mullo_epi32(in[9], cospi38); + x = _mm256_mullo_epi32(in[6], cospi26); + v[7] = _mm256_sub_epi32(v[7], x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = _mm256_mullo_epi32(in[7], cospi34); + x = _mm256_mullo_epi32(in[8], cospi30); + v[8] = _mm256_add_epi32(v[8], x); + 
v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[9] = _mm256_mullo_epi32(in[7], cospi30); + x = _mm256_mullo_epi32(in[8], cospi34); + v[9] = _mm256_sub_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[10] = _mm256_mullo_epi32(in[5], cospi42); + x = _mm256_mullo_epi32(in[10], cospi22); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_mullo_epi32(in[5], cospi22); + x = _mm256_mullo_epi32(in[10], cospi42); + v[11] = _mm256_sub_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(in[3], cospi50); + x = _mm256_mullo_epi32(in[12], cospi14); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(in[3], cospi14); + x = _mm256_mullo_epi32(in[12], cospi50); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(in[1], cospi58); + x = _mm256_mullo_epi32(in[14], cospi6); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(in[1], cospi6); + x = _mm256_mullo_epi32(in[14], cospi58); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 3 + addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[6], v[14], &u[6], 
&u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm256_mullo_epi32(u[8], cospi8); + x = _mm256_mullo_epi32(u[9], cospi56); + v[8] = _mm256_add_epi32(v[8], x); + v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[9] = _mm256_mullo_epi32(u[8], cospi56); + x = _mm256_mullo_epi32(u[9], cospi8); + v[9] = _mm256_sub_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospi40); + x = _mm256_mullo_epi32(u[11], cospi24); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_mullo_epi32(u[10], cospi24); + x = _mm256_mullo_epi32(u[11], cospi40); + v[11] = _mm256_sub_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[12], cospim56); + x = _mm256_mullo_epi32(u[13], cospi8); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(u[12], cospi8); + x = _mm256_mullo_epi32(u[13], cospim56); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(u[14], cospim24); + x = _mm256_mullo_epi32(u[15], cospi40); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(u[14], cospi40); + x = _mm256_mullo_epi32(u[15], cospim24); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 5 + addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + 
addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm256_mullo_epi32(u[4], cospi16); + x = _mm256_mullo_epi32(u[5], cospi48); + v[4] = _mm256_add_epi32(v[4], x); + v[4] = _mm256_add_epi32(v[4], rnding); + v[4] = _mm256_srai_epi32(v[4], bit); + + v[5] = _mm256_mullo_epi32(u[4], cospi48); + x = _mm256_mullo_epi32(u[5], cospi16); + v[5] = _mm256_sub_epi32(v[5], x); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + v[6] = _mm256_mullo_epi32(u[6], cospim48); + x = _mm256_mullo_epi32(u[7], cospi16); + v[6] = _mm256_add_epi32(v[6], x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_mullo_epi32(u[6], cospi16); + x = _mm256_mullo_epi32(u[7], cospim48); + v[7] = _mm256_sub_epi32(v[7], x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm256_mullo_epi32(u[12], cospi16); + x = _mm256_mullo_epi32(u[13], cospi48); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(u[12], cospi48); + x = _mm256_mullo_epi32(u[13], cospi16); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(u[14], cospim48); + x = _mm256_mullo_epi32(u[15], cospi16); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], 
rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(u[14], cospi16); + x = _mm256_mullo_epi32(u[15], cospim48); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 7 + addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm256_mullo_epi32(u[2], cospi32); + x = _mm256_mullo_epi32(u[3], cospi32); + v[2] = _mm256_add_epi32(y, x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(y, x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm256_mullo_epi32(u[6], cospi32); + x = _mm256_mullo_epi32(u[7], cospi32); + v[6] = _mm256_add_epi32(y, x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(y, x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[11], cospi32); + v[10] = _mm256_add_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm256_mullo_epi32(u[14], cospi32); + x = _mm256_mullo_epi32(u[15], cospi32); + v[14] = 
_mm256_add_epi32(y, x);  // completes the "v[14] =" assignment begun above
    v[14] = _mm256_add_epi32(v[14], rnding);
    v[14] = _mm256_srai_epi32(v[14], bit);

    v[15] = _mm256_sub_epi32(y, x);
    v[15] = _mm256_add_epi32(v[15], rnding);
    v[15] = _mm256_srai_epi32(v[15], bit);

    // stage 9
    if (do_cols) {
      out[0] = v[0];
      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
      out[2] = v[12];
      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
      out[4] = v[6];
      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
      out[6] = v[10];
      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
      out[8] = v[3];
      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
      out[10] = v[15];
      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
      out[12] = v[5];
      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
      out[14] = v[9];
      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m256i clamp_lo_out =
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      const __m256i clamp_hi_out =
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

      neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
      neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    }
  }
}

// 8-point inverse-DCT kernel (per its name) that reads only in[0].
//
// in[0] is multiplied lane-wise by cospi[32] and right-shifted by `bit` with
// rounding (1 << (bit - 1) is added first). On the row pass (!do_cols) the
// value is then rounded and shifted right by out_shift. The result is clamped
// and the same vector is stored to out[0..7]. All arithmetic is on the 32-bit
// lanes of a __m256i.
//
//   in        - input vectors; only in[0] is read
//   out       - receives eight identical vectors
//   bit       - shift used after the cosine multiply; also selects the table
//               returned by cospi_arr()
//   do_cols   - nonzero: clamp only; zero: apply out_shift, then clamp
//   bd        - only used to size the clamp range (presumably the bit depth)
//   out_shift - right shift applied when do_cols is zero
//
// NOTE(review): presumably chosen by the caller only when in[1..7] are zero;
// confirm against the dispatch code, which is not visible here.
static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  // Signed range of log_range bits. On the !do_cols path both bounds are
  // overwritten below before they are used.
  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i x;

  // stage 0
  // stage 1
  // stage 2
  // stage 3
  x = _mm256_mullo_epi32(in[0], cospi32);
  x = _mm256_add_epi32(x, rnding);
  x = _mm256_srai_epi32(x, bit);

  // stage 4
  // stage 5
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    // (1 << out_shift) >> 1 is the rounding offset; it is 0 when out_shift
    // is 0, which makes the add and shift below a no-op in that case.
    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
    x = _mm256_add_epi32(x, offset);
    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
  }
  x = _mm256_max_epi32(x, clamp_lo);
  x = _mm256_min_epi32(x, clamp_hi);
  out[0] = x;
  out[1] = x;
  out[2] = x;
  out[3] = x;
  out[4] = x;
  out[5] = x;
  out[6] = x;
  out[7] = x;
}

// 8-point inverse DCT (per its name) over in[0..7], written to out[0..7].
//
// Every "multiply by two cosines, add or subtract, add rnding, shift right by
// bit" group below is a rounded fixed-point rotation applied lane-wise to the
// 32-bit lanes. Sums and differences go through addsub_avx2() together with
// clamp_lo/clamp_hi, the bounds of a signed log_range-bit value.
// NOTE(review): addsub_avx2 is defined elsewhere; presumably it stores the
// clamped sum and difference of its first two arguments -- confirm.
//
//   in        - eight input vectors
//   out       - eight output vectors
//   bit       - shift for the cosine multiplies; also passed to cospi_arr()
//   do_cols   - nonzero: stop after stage 4; zero: also run stage 5
//   bd        - only used to size the clamp ranges (presumably the bit depth)
//   out_shift - forwarded to round_shift_4x4_avx2() when do_cols is zero
static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                         int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
  __m256i x, y;

  // stage 0
  // stage 1
  // stage 2
  // Even-indexed inputs are only reordered here; odd-indexed inputs are
  // rotated into u4..u7.
  u0 = in[0];
  u1 = in[4];
  u2 = in[2];
  u3 = in[6];

  x = _mm256_mullo_epi32(in[1], cospi56);
  y = _mm256_mullo_epi32(in[7], cospim8);
  u4 = _mm256_add_epi32(x, y);
  u4 = _mm256_add_epi32(u4, rnding);
  u4 = _mm256_srai_epi32(u4, bit);

  x = _mm256_mullo_epi32(in[1], cospi8);
  y = _mm256_mullo_epi32(in[7], cospi56);
  u7 = _mm256_add_epi32(x, y);
  u7 = _mm256_add_epi32(u7, rnding);
  u7 = _mm256_srai_epi32(u7, bit);

  x = _mm256_mullo_epi32(in[5], cospi24);
  y = _mm256_mullo_epi32(in[3], cospim40);
  u5 = _mm256_add_epi32(x, y);
  u5 = _mm256_add_epi32(u5, rnding);
  u5 = _mm256_srai_epi32(u5, bit);

  x = _mm256_mullo_epi32(in[5], cospi40);
  y = _mm256_mullo_epi32(in[3], cospi24);
  u6 = _mm256_add_epi32(x, y);
  u6 = _mm256_add_epi32(u6, rnding);
  u6 = _mm256_srai_epi32(u6, bit);

  // stage 3
  // x and y are shared by v0 (x + y) and v1 (x - y).
  x = _mm256_mullo_epi32(u0, cospi32);
  y = _mm256_mullo_epi32(u1, cospi32);
  v0 = _mm256_add_epi32(x, y);
  v0 = _mm256_add_epi32(v0, rnding);
  v0 = _mm256_srai_epi32(v0, bit);

  v1 = _mm256_sub_epi32(x, y);
  v1 = _mm256_add_epi32(v1, rnding);
  v1 = _mm256_srai_epi32(v1, bit);

  x = _mm256_mullo_epi32(u2, cospi48);
  y = _mm256_mullo_epi32(u3, cospim16);
  v2 = _mm256_add_epi32(x, y);
  v2 = _mm256_add_epi32(v2, rnding);
  v2 = _mm256_srai_epi32(v2, bit);

  x = _mm256_mullo_epi32(u2, cospi16);
  y = _mm256_mullo_epi32(u3, cospi48);
  v3 = _mm256_add_epi32(x, y);
  v3 = _mm256_add_epi32(v3, rnding);
  v3 = _mm256_srai_epi32(v3, bit);

  addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
  addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

  // stage 4
  addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
  addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
  u4 = v4;
  u7 = v7;

  // x and y are shared by u6 (y + x) and u5 (y - x).
  x = _mm256_mullo_epi32(v5, cospi32);
  y = _mm256_mullo_epi32(v6, cospi32);
  u6 = _mm256_add_epi32(y, x);
  u6 = _mm256_add_epi32(u6, rnding);
  u6 = _mm256_srai_epi32(u6, bit);

  u5 = _mm256_sub_epi32(y, x);
  u5 = _mm256_add_epi32(u5, rnding);
  u5 = _mm256_srai_epi32(u5, bit);

  // Final butterflies write straight into out[k] and out[7 - k].
  addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
  addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
  addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
  addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
  // stage 5
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    const __m256i clamp_hi_out =
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

    // NOTE(review): two calls four vectors apart; presumably each call
    // processes four vectors -- confirm against round_shift_4x4_avx2.
    round_shift_4x4_avx2(out, out_shift);
    round_shift_4x4_avx2(out + 4, out_shift);
    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
  }
}

// 8-point inverse-ADST kernel (per its name) that reads only in[0].
//
// Follows the stage layout of iadst8x8_avx2() with every term that would
// come from in[1..7] left out; no addsub_avx2() clamping is performed.
// The column pass stores u[] to out[] in a fixed permutation with the
// odd-numbered outputs negated. The row pass hands the same pairs to
// neg_shift_avx2() together with out_shift and the output clamp bounds.
// NOTE(review): neg_shift_avx2 is defined elsewhere; presumably it shifts and
// clamps both values and negates the second -- confirm.
//
//   in        - input vectors; only in[0] is read
//   out       - eight output vectors
//   bit       - shift for the cosine multiplies; also passed to cospi_arr()
//   do_cols   - nonzero: plain stores; zero: neg_shift_avx2() path
//   bd        - only used to size the output clamp (presumably the bit depth)
//   out_shift - forwarded to neg_shift_avx2() when do_cols is zero
static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const __m256i kZero = _mm256_setzero_si256();
  __m256i u[8], x;

  // stage 0
  // stage 1
  // stage 2

  x = _mm256_mullo_epi32(in[0], cospi60);
  u[0] = _mm256_add_epi32(x, rnding);
  u[0] = _mm256_srai_epi32(u[0], bit);

  // u[1] = round(-(in[0] * cospi4)); the negation happens before rounding.
  x = _mm256_mullo_epi32(in[0], cospi4);
  u[1] = _mm256_sub_epi32(kZero, x);
  u[1] = _mm256_add_epi32(u[1], rnding);
  u[1] = _mm256_srai_epi32(u[1], bit);

  // stage 3
  // stage 4
  __m256i temp1, temp2;
  temp1 = _mm256_mullo_epi32(u[0], cospi16);
  x = _mm256_mullo_epi32(u[1], cospi48);
  temp1 = _mm256_add_epi32(temp1, x);
  temp1 = _mm256_add_epi32(temp1, rnding);
  temp1 = _mm256_srai_epi32(temp1, bit);
  u[4] = temp1;

  temp2 = _mm256_mullo_epi32(u[0], cospi48);
  x = _mm256_mullo_epi32(u[1], cospi16);
  u[5] = _mm256_sub_epi32(temp2, x);
  u[5] = _mm256_add_epi32(u[5], rnding);
  u[5] = _mm256_srai_epi32(u[5], bit);

  // stage 5
  // stage 6
  // temp1 and x are shared by the sum (u[2]) and the difference (u[3]).
  temp1 = _mm256_mullo_epi32(u[0], cospi32);
  x = _mm256_mullo_epi32(u[1], cospi32);
  u[2] = _mm256_add_epi32(temp1, x);
  u[2] = _mm256_add_epi32(u[2], rnding);
  u[2] = _mm256_srai_epi32(u[2], bit);

  u[3] = _mm256_sub_epi32(temp1, x);
  u[3] = _mm256_add_epi32(u[3], rnding);
  u[3] = _mm256_srai_epi32(u[3], bit);

  temp1 = _mm256_mullo_epi32(u[4], cospi32);
  x = _mm256_mullo_epi32(u[5], cospi32);
  u[6] = _mm256_add_epi32(temp1, x);
  u[6] = _mm256_add_epi32(u[6], rnding);
  u[6] = _mm256_srai_epi32(u[6], bit);

  u[7] = _mm256_sub_epi32(temp1, x);
  u[7] = _mm256_add_epi32(u[7], rnding);
  u[7] = _mm256_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm256_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm256_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm256_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm256_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    const __m256i clamp_hi_out =
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

    // Same (kept, negated) pairing and output order as the do_cols branch.
    neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
  }
}

// 8-point inverse ADST (per its name) over in[0..7], written to out[0..7].
//
// Even-numbered stages are rounded fixed-point rotations (multiply, add or
// subtract, add rnding, shift right by bit); stages 3 and 5 are add/sub
// butterflies through addsub_avx2() with the log_range clamp bounds.
// Stage 7 stores the result in a fixed permutation with the odd-numbered
// outputs negated, either directly (do_cols) or through neg_shift_avx2().
//
//   in        - eight input vectors
//   out       - eight output vectors
//   bit       - shift for the cosine multiplies; also passed to cospi_arr()
//   do_cols   - nonzero: plain stores; zero: neg_shift_avx2() path
//   bd        - only used to size the clamp ranges (presumably the bit depth)
//   out_shift - forwarded to neg_shift_avx2() when do_cols is zero
static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                          int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const __m256i kZero = _mm256_setzero_si256();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i u[8], v[8], x;

  // stage 0
  // stage 1
  // stage 2

  // Inputs are consumed in the pairs (7,0), (5,2), (3,4), (1,6).
  u[0] = _mm256_mullo_epi32(in[7], cospi4);
  x = _mm256_mullo_epi32(in[0], cospi60);
  u[0] = _mm256_add_epi32(u[0], x);
  u[0] = _mm256_add_epi32(u[0], rnding);
  u[0] = _mm256_srai_epi32(u[0], bit);

  u[1] = _mm256_mullo_epi32(in[7], cospi60);
  x = _mm256_mullo_epi32(in[0], cospi4);
  u[1] = _mm256_sub_epi32(u[1], x);
  u[1] = _mm256_add_epi32(u[1], rnding);
  u[1] = _mm256_srai_epi32(u[1], bit);

  u[2] = _mm256_mullo_epi32(in[5], cospi20);
  x = _mm256_mullo_epi32(in[2], cospi44);
  u[2] = _mm256_add_epi32(u[2], x);
  u[2] = _mm256_add_epi32(u[2], rnding);
  u[2] = _mm256_srai_epi32(u[2], bit);

  u[3] = _mm256_mullo_epi32(in[5], cospi44);
  x = _mm256_mullo_epi32(in[2], cospi20);
  u[3] = _mm256_sub_epi32(u[3], x);
  u[3] = _mm256_add_epi32(u[3], rnding);
  u[3] = _mm256_srai_epi32(u[3], bit);

  u[4] = _mm256_mullo_epi32(in[3], cospi36);
  x = _mm256_mullo_epi32(in[4], cospi28);
  u[4] = _mm256_add_epi32(u[4], x);
  u[4] = _mm256_add_epi32(u[4], rnding);
  u[4] = _mm256_srai_epi32(u[4], bit);

  u[5] = _mm256_mullo_epi32(in[3], cospi28);
  x = _mm256_mullo_epi32(in[4], cospi36);
  u[5] = _mm256_sub_epi32(u[5], x);
  u[5] = _mm256_add_epi32(u[5], rnding);
  u[5] = _mm256_srai_epi32(u[5], bit);

  u[6] = _mm256_mullo_epi32(in[1], cospi52);
  x = _mm256_mullo_epi32(in[6], cospi12);
  u[6] = _mm256_add_epi32(u[6], x);
  u[6] = _mm256_add_epi32(u[6], rnding);
  u[6] = _mm256_srai_epi32(u[6], bit);

  u[7] = _mm256_mullo_epi32(in[1], cospi12);
  x = _mm256_mullo_epi32(in[6], cospi52);
  u[7] = _mm256_sub_epi32(u[7], x);
  u[7] = _mm256_add_epi32(u[7], rnding);
  u[7] = _mm256_srai_epi32(u[7], bit);

  // stage 3
  addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm256_mullo_epi32(v[4], cospi16);
  x = _mm256_mullo_epi32(v[5], cospi48);
  u[4] = _mm256_add_epi32(u[4], x);
  u[4] = _mm256_add_epi32(u[4], rnding);
  u[4] = _mm256_srai_epi32(u[4], bit);

  u[5] = _mm256_mullo_epi32(v[4], cospi48);
  x = _mm256_mullo_epi32(v[5], cospi16);
  u[5] = _mm256_sub_epi32(u[5], x);
  u[5] = _mm256_add_epi32(u[5], rnding);
  u[5] = _mm256_srai_epi32(u[5], bit);

  u[6] = _mm256_mullo_epi32(v[6], cospim48);
  x = _mm256_mullo_epi32(v[7], cospi16);
  u[6] = _mm256_add_epi32(u[6], x);
  u[6] = _mm256_add_epi32(u[6], rnding);
  u[6] = _mm256_srai_epi32(u[6], bit);

  u[7] = _mm256_mullo_epi32(v[6], cospi16);
  x = _mm256_mullo_epi32(v[7], cospim48);
  u[7] = _mm256_sub_epi32(u[7], x);
  u[7] = _mm256_add_epi32(u[7], rnding);
  u[7] = _mm256_srai_epi32(u[7], bit);

  // stage 5
  addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  // v[0] has already been copied to u[0], so it is reused as scratch for the
  // product shared by each sum/difference pair below.
  v[0] = _mm256_mullo_epi32(v[2], cospi32);
  x = _mm256_mullo_epi32(v[3], cospi32);
  u[2] = _mm256_add_epi32(v[0], x);
  u[2] = _mm256_add_epi32(u[2], rnding);
  u[2] = _mm256_srai_epi32(u[2], bit);

  u[3] = _mm256_sub_epi32(v[0], x);
  u[3] = _mm256_add_epi32(u[3], rnding);
  u[3] = _mm256_srai_epi32(u[3], bit);

  v[0] = _mm256_mullo_epi32(v[6], cospi32);
  x = _mm256_mullo_epi32(v[7], cospi32);
  u[6] = _mm256_add_epi32(v[0], x);
  u[6] = _mm256_add_epi32(u[6], rnding);
  u[6] = _mm256_srai_epi32(u[6], bit);

  u[7] = _mm256_sub_epi32(v[0], x);
  u[7] = _mm256_add_epi32(u[7], rnding);
  u[7] = _mm256_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm256_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm256_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm256_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm256_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    const __m256i clamp_hi_out =
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

    // Same (kept, negated) pairing and output order as the do_cols branch.
    neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
    neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                   out_shift);
  }
}

// Stage 8 of the 64-point inverse DCT, applied in place to u[0..63].
//
// Touches u[10..13], u[16..31], u[36..43] and u[52..59]; all other entries
// are left as they are. The cosine constants, clamp bounds and rounding
// vector are passed by pointer from the caller.
// NOTE(review): half_btf_avx2(w0, a, w1, b, rnding, bit) is defined elsewhere;
// presumably it returns (w0 * a + w1 * b + rnding) >> bit -- confirm.
static INLINE void idct64_stage8_avx2(
    __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
    const __m256i *rnding, int bit) {
  int i;
  __m256i temp1, temp2, temp3, temp4;
  // Each rotation reads both members of a pair twice, so the first result is
  // parked in a temporary until the second has been computed.
  temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
  u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
  u[10] = temp1;
  temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
  u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
  u[11] = temp2;

  // i ^ 7 pairs (16,23), (17,22), (18,21), (19,20);
  // (i ^ 15, i ^ 8) pairs (31,24), (30,25), (29,26), (28,27).
  for (i = 16; i < 20; ++i) {
    addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
    addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
  temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
  temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
  temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
  u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
  u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
  u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
  u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
  u[36] = temp1;
  u[37] = temp2;
  u[38] = temp3;
  u[39] = temp4;

  temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
  temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
  temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
  temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
  u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
  u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
  u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
  u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;
}

// Stage 9 of the 64-point inverse DCT, applied in place to u[0..63].
//
// Butterflies u[0..15] as (i, 15 - i), rotates u[20..27] by cospi32, and
// butterflies u[32..47] and u[48..63] within their groups of sixteen.
static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
                                      const __m256i *cospi32,
                                      const __m256i *clamp_lo,
                                      const __m256i *clamp_hi,
                                      const __m256i *rnding, int bit) {
  int i;
  __m256i temp1, temp2, temp3, temp4;
  for (i = 0; i < 8; ++i) {
    addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
  }

  // The temporaries keep the new u[20..23] out of the array until u[24..27]
  // have been computed from the old values.
  temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
  temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
  temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
  temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
  u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
  u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
  u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
  u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
  u[20] = temp1;
  u[21] = temp2;
  u[22] = temp3;
  u[23] = temp4;
  // i ^ 15 pairs (32,47), (33,46), ..., (39,40).
  for (i = 32; i < 40; i++) {
    addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
  }

  // Operands are swapped here: pairs (63,48), (62,49), ..., (56,55), with the
  // higher index passed first.
  for (i = 48; i < 56; i++) {
    addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
  }
}

// Stage 10 of the 64-point inverse DCT, applied in place to u[0..63].
//
// Butterflies u[0..31] as (i, 31 - i) and rotates u[40..55] by cospi32 in
// two groups of four pairs.
static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
                                       const __m256i *cospi32,
                                       const __m256i *clamp_lo,
                                       const __m256i *clamp_hi,
                                       const __m256i *rnding, int bit) {
  __m256i temp1, temp2, temp3, temp4;
  for (int i = 0; i < 16; i++) {
    addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
  }

  // Pairs (40,55), (41,54), (42,53), (43,52).
  temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
  temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
  temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
  temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
  u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
  u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
  u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
  u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;

  // Pairs (44,51), (45,50), (46,49), (47,48).
  temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
  temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
  temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
  temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
  u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
  u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
  u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
  u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
  u[44] = temp1;
  u[45] = temp2;
  u[46] = temp3;
  u[47] = temp4;
}

// Stage 11 (final stage) of the 64-point inverse DCT.
//
// Butterflies u[i] with u[63 - i] into out[i] and out[63 - i]. On the row
// pass (!do_cols) all 64 output vectors are then round-shifted by out_shift
// and clamped to a signed max(16, bd + 6)-bit range.
static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
                                       int bd, int out_shift,
                                       const __m256i *clamp_lo,
                                       const __m256i *clamp_hi) {
  for (int i = 0; i < 32; i++) {
    addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi);
  }

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    const __m256i clamp_hi_out =
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

    // NOTE(review): four calls sixteen vectors apart; presumably each call
    // processes sixteen vectors -- confirm against round_shift_8x8_avx2.
    round_shift_8x8_avx2(out, out_shift);
    round_shift_8x8_avx2(out + 16, out_shift);
    round_shift_8x8_avx2(out + 32, out_shift);
    round_shift_8x8_avx2(out + 48, out_shift);
    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
  }
}

// 64-point inverse-DCT kernel (per its name) that reads only in[0].
//
// in[0] goes through half_btf_0_avx2() with cospi[32]. On the row pass
// (!do_cols) the result is rounded and shifted right by out_shift when that
// is nonzero. The value is clamped and stored to all 64 out[] vectors.
// NOTE(review): half_btf_0_avx2(w, a, rnding, bit) is defined elsewhere;
// presumably it returns (w * a + rnding) >> bit -- confirm.
//
//   in        - input vectors; only in[0] is read
//   out       - receives 64 identical vectors
//   bit       - forwarded to cospi_arr() and half_btf_0_avx2()
//   do_cols   - nonzero: clamp only; zero: apply out_shift, then clamp
//   bd        - only used to size the clamp range (presumably the bit depth)
//   out_shift - right shift applied when do_cols is zero
static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  // On the !do_cols path both bounds are overwritten below before use.
  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);

  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);

  {
    __m256i x;

    // stage 1
    // stage 2
    // stage 3
    // stage 4
    // stage 5
    // stage 6
    x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);

    // stage 8
    // stage 9
    // stage 10
    // stage 11
    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
      if (out_shift != 0) {
        __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
        x = _mm256_add_epi32(x, offset);
        x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
      }
    }
    x = _mm256_max_epi32(x, clamp_lo);
    x = _mm256_min_epi32(x, clamp_hi);
    // Every output vector receives the same value.
    out[0] = x;
    out[1] = x;
    out[2] = x;
    out[3] = x;
    out[4] = x;
    out[5] = x;
    out[6] = x;
    out[7] = x;
    out[8] = x;
    out[9] = x;
    out[10] = x;
    out[11] = x;
    out[12] = x;
    out[13] = x;
    out[14] = x;
    out[15] = x;
    out[16] = x;
    out[17] = x;
    out[18] = x;
    out[19] = x;
    out[20] = x;
    out[21] = x;
    out[22] = x;
    out[23] = x;
    out[24] = x;
    out[25] = x;
    out[26] = x;
    out[27] = x;
    out[28] = x;
    out[29] = x;
    out[30] = x;
    out[31] = x;
    out[32] = x;
    out[33] = x;
    out[34] = x;
    out[35] = x;
    out[36] = x;
    out[37] = x;
    out[38] = x;
    out[39] = x;
    out[40] = x;
    out[41] = x;
    out[42] = x;
    out[43] = x;
    out[44] = x;
    out[45] = x;
    out[46] = x;
    out[47] = x;
    out[48] = x;
    out[49] = x;
    out[50] = x;
    out[51] = x;
    out[52] = x;
    out[53] = x;
    out[54] = x;
    out[55] = x;
    out[56] = x;
    out[57] = x;
    out[58] = x;
    out[59] = x;
    out[60] = x;
    out[61] = x;
    out[62] = x;
    out[63] = x;
  }
}

// 64-point inverse-DCT kernel (per its name) that reads only in[0..7].
//
// Stages 1-7 are written out here: each of the eight inputs seeds one slot
// of u[], single-input rotations (half_btf_0_avx2) and plain copies stand in
// where the second operand of a full butterfly is never loaded. Stages 8-11
// are delegated to the shared idct64_stage8..11_avx2() helpers, and stage 11
// writes out[0..63].
// NOTE(review): presumably chosen by the caller only when in[8..63] are zero;
// confirm against the dispatch code, which is not visible here.
//
//   in        - input vectors; only in[0..7] are read
//   out       - 64 output vectors
//   bit       - shift for the rotations; also passed to cospi_arr()
//   do_cols   - selects the clamp range here and is forwarded to stage 11
//   bd        - only used to size the clamp ranges (presumably the bit depth)
//   out_shift - forwarded to idct64_stage11_avx2()
static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);

  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
  const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
  const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
  const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
  const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
  const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
  const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
  const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);

  {
    // Only the slots written below are ever read; the rest of u[] is filled
    // in stage by stage before the shared stage helpers run.
    __m256i u[64];

    // stage 1
    u[0] = in[0];
    u[8] = in[4];
    u[16] = in[2];
    u[24] = in[6];
    u[32] = in[1];
    u[40] = in[5];
    u[48] = in[3];
    u[56] = in[7];

    // stage 2
    // In each pair the partner slot is written first because the second line
    // overwrites the source slot in place.
    u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
    u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
    u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
    u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
    u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
    u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
    u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
    u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);

    // stage 3
    u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
    u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
    u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
    u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
    u[33] = u[32];
    u[38] = u[39];
    u[41] = u[40];
    u[46] = u[47];
    u[49] = u[48];
    u[54] = u[55];
    u[57] = u[56];
    u[62] = u[63];

    // stage 4
    __m256i temp1, temp2;
    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
    u[17] = u[16];
    u[22] = u[23];
    u[25] = u[24];
    u[30] = u[31];

    // From here on, each two-input rotation parks its first result in
    // temp1/temp2 so the second result still sees both old values.
    temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    u[33] = temp1;

    temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    u[57] = temp2;

    temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    u[41] = temp1;

    temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    u[46] = temp2;

    // stage 5
    u[9] = u[8];
    u[14] = u[15];

    temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = temp1;

    temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[22] = temp2;

    u[35] = u[32];
    u[34] = u[33];
    u[36] = u[39];
    u[37] = u[38];
    u[43] = u[40];
    u[42] = u[41];
    u[44] = u[47];
    u[45] = u[46];
    u[51] = u[48];
    u[50] = u[49];
    u[52] = u[55];
    u[53] = u[54];
    u[59] = u[56];
    u[58] = u[57];
    u[60] = u[63];
    u[61] = u[62];

    // stage 6
    // u[0] and u[1] both receive the same rotation of the old u[0].
    temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
    u[0] = temp1;

    temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = temp2;
    u[19] = u[16];
    u[18] = u[17];
    u[20] = u[23];
    u[21] = u[22];
    u[27] = u[24];
    u[26] = u[25];
    u[28] = u[31];
    u[29] = u[30];

    temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = temp1;
    temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[35] = temp2;
    temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[36] = temp1;
    temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[37] = temp2;
    temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = temp1;
    temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[43] = temp2;
    temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[44] = temp1;
    temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[45] = temp2;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    u[11] = u[8];
    u[10] = u[9];
    u[12] = u[15];
    u[13] = u[14];

    temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = temp1;
    temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[19] = temp2;
    temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[20] = temp1;
    temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[21] = temp2;
    // For i = 32 and i = 48: j ^ 7 pairs (i, i+7) ... (i+3, i+4), and
    // (j ^ 15, j ^ 8) pairs (i+15, i+8) ... (i+12, i+11).
    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                    &clamp_hi);
      }
    }

    // stage 8
    u[7] = u[0];
    u[6] = u[1];
    u[5] = u[2];
    u[4] = u[3];

    idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                       bit);

    // stage 10
    idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                        bit);

    // stage 11
    idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  }
}
static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);

  const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
  const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
  const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
  const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
  const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
  const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi44 =
_mm256_set1_epi32(cospi[44]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64]; + __m256i tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + 
u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); 
+ u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); + tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], 
&rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = tmp1; + tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] 
= half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); + u[5] = tmp1; + addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &u[19], 
&cospi16, &u[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} /* idct64_avx2: 64-point inverse DCT on __m256i vectors. Stage 1 reads in[0]..in[31] only; stages 2-10 alternate between two local 64-entry arrays u and v, and stage 11 writes out[0]..out[63] through addsub_avx2. When do_cols is zero, out is additionally passed through round_shift_8x8_avx2 with out_shift and then through highbd_clamp_epi32_avx2 with a signed max(16, bd+6)-bit range. bit selects the cosine table through cospi_arr(bit) and fixes the rounding offset. */ +static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); /* rounding offset: half of 1 << bit */ + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi17 = _mm256_set1_epi32(cospi[17]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi19 = _mm256_set1_epi32(cospi[19]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi21 = _mm256_set1_epi32(cospi[21]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi23 = _mm256_set1_epi32(cospi[23]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi25 = _mm256_set1_epi32(cospi[25]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi27 = _mm256_set1_epi32(cospi[27]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi29 = _mm256_set1_epi32(cospi[29]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi31 = _mm256_set1_epi32(cospi[31]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi35 = _mm256_set1_epi32(cospi[35]); + const 
__m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi39 = _mm256_set1_epi32(cospi[39]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi43 = _mm256_set1_epi32(cospi[43]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi47 = _mm256_set1_epi32(cospi[47]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]); + const __m256i 
cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit); + v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + v[45] = 
half_btf_0_avx2(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + + // stage 3 + u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit); + u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit); + u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit); + u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit); + u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit); + u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit); + u[30] = half_btf_0_avx2(&cospi30, &v[30], 
&rnding, bit); + u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit); + + for (i = 32; i < 64; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); + v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); + v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); + v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); + v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + + for (i = 16; i < 32; i += 4) { + addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_avx2(&cospim36, 
&u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_avx2(&cospi16, &u[2], 
&rnding, bit); + + addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], 
&rnding, bit); + v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = 
half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); + v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i 
+ 12] = v[i + 12]; + } + + u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + 
v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + for (i = 0; i < 32; i++) { + addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + } + if (!do_cols) { /* taken when do_cols is 0, which is how highbd_inv_txfm2d_add_no_identity_avx2 invokes its row transform */ + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + round_shift_8x8_avx2(out + 32, out_shift); + round_shift_8x8_avx2(out + 48, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); + } + } +} /* Signature shared by every 1-D inverse-transform kernel stored in highbd_txfm_all_1d_zeros_w8_arr. */ +typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit, + int do_cols, int bd, int out_shift); + /* Dispatch table of 1-D kernels, indexed [size index][1-D transform type][variant]. Size index: the value highbd_inv_txfm2d_add_no_identity_avx2 obtains from get_txw_idx or get_txh_idx; the first row is all NULL and the next four hold the 8-, 16-, 32- and 64-point kernels. Transform type: looked up by that caller through hitx_1d_tab or vitx_1d_tab; row 0 holds idct kernels, row 1 iadst kernels, row 2 is always NULL here (presumably identity, handled elsewhere -- confirm). Variant: looked up by that caller through lowbd_txfm_all_1d_zeros_idx from eobx or eoby; entries run from the _low1 kernel up to the full kernel. NULL marks a combination with no kernel; the caller asserts that the selected entries are non-NULL. */ +static const transform_1d_avx2 + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { + { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL }, + { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { + { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL }, + { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + + { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m256i buf1[64 * 8]; + int eobx, eoby; 
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_avx2 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: row transform + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + __m256i buf0[64]; + load_buffer_32bit_input(input + i * 8, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m256i *_buf1 = buf1 + i * 8; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_flip_avx2( + &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); + } + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_avx2(buf1 + i * txfm_size_row, 
+ buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2, + output + 16 * i, stride, ud_flip, + txfm_size_row, bd); + } + } else if (txfm_size_col == 8) { + highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row, + bd); + } +} + +void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; + case IDTX: + case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type, + tx_size, eob, bd); + break; + default: assert(0); break; + } +} +void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); + break; + default: + av1_highbd_inv_txfm2d_add_universe_avx2( + input, dest, stride, txfm_param->tx_type, 
txfm_param->tx_size, + txfm_param->eob, txfm_param->bd); + break; + } +} diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c new file mode 100644 index 0000000000..4ff6a90f95 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c @@ -0,0 +1,5830 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include /* SSE4.1 */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/av1_txfm_sse4.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" + +static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i clamped, mask; + + mask = _mm_cmpgt_epi16(u, max); + clamped = _mm_andnot_si128(mask, u); + mask = _mm_and_si128(mask, max); + clamped = _mm_or_si128(mask, clamped); + mask = _mm_cmpgt_epi16(clamped, zero); + clamped = _mm_and_si128(clamped, mask); + + return clamped; +} + +static INLINE void round_shift_4x4(__m128i *in, int shift) { + if (shift != 0) { + __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); + in[0] = _mm_add_epi32(in[0], rnding); + in[1] = _mm_add_epi32(in[1], rnding); + in[2] = _mm_add_epi32(in[2], rnding); + 
in[3] = _mm_add_epi32(in[3], rnding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + } +} + +static void round_shift_8x8(__m128i *in, int shift) { + round_shift_4x4(&in[0], shift); + round_shift_4x4(&in[4], shift); + round_shift_4x4(&in[8], shift); + round_shift_4x4(&in[12], shift); +} + +static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int size) { + __m128i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm_max_epi32(in[i], *clamp_lo); + out[i] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm_min_epi32(a1, *clamp_hi); + + a0 = _mm_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm_min_epi32(a1, *clamp_hi); + } +} + +static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, + __m128i res0, __m128i res1, + const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8)); + __m128i min_clip_val = _mm_setzero_si128(); + __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1); + x0 = _mm_add_epi32(res0, x0); + x1 = _mm_add_epi32(res1, x1); + x0 = _mm_max_epi32(x0, min_clip_val); + x0 = _mm_min_epi32(x0, max_clip_val); + x1 = _mm_max_epi32(x1, min_clip_val); + x1 = _mm_min_epi32(x1, max_clip_val); + x0 = _mm_packus_epi32(x0, x1); + return x0; +} + +static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred, + __m128i res0, const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + + x0 = _mm_add_epi32(res0, x0); + x0 = _mm_packus_epi32(x0, x0); + x0 = highbd_clamp_epi16(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? 
(height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd); + + _mm_storel_epi64((__m128i *)(output + i * stride), u); + } +} + +static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd); + + _mm_storeu_si128((__m128i *)(output + i * stride), u); + } +} + +static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); + in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); + in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); + in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); +} + +void av1_highbd_iwht4x4_16_add_sse4_1(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + __m128i op[4]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + load_buffer_4x4(input, op); + + // Shift before-hand. 
+ op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT); + op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT); + op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT); + op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT); + + for (int i = 0; i < 2; ++i) { + __m128i a1 = op[0]; + __m128i c1 = op[1]; + __m128i d1 = op[2]; + __m128i b1 = op[3]; + a1 = _mm_add_epi32(a1, c1); // a1 += c1 + d1 = _mm_sub_epi32(d1, b1); // d1 -= b1 + __m128i e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1 + e1 = _mm_srai_epi32(e1, 1); + b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1 + c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1 + a1 = _mm_sub_epi32(a1, b1); // a1 -= b1 + d1 = _mm_add_epi32(d1, c1); // d1 += c1 + + op[0] = a1; + op[1] = b1; + op[2] = c1; + op[3] = d1; + if (i == 0) { + transpose_32bit_4x4(op, op); + } + } + + // Convert to int16_t. The C code checks that we are in range. + op[0] = _mm_packs_epi32(op[0], op[1]); + op[1] = _mm_packs_epi32(op[2], op[3]); + + // Load uint16_t. + __m128i dst[2]; + __m128i tmp[4]; + tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); + tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride)); + dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]); + tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride)); + tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride)); + dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]); + + // Add to the previous results. + dst[0] = _mm_add_epi16(dst[0], op[0]); + dst[1] = _mm_add_epi16(dst[1], op[1]); + + // Clamp. + dst[0] = highbd_clamp_epi16(dst[0], bd); + dst[1] = highbd_clamp_epi16(dst[1], bd); + + // Store. 
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]); + dst[0] = _mm_srli_si128(dst[0], 8); + _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]); + _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]); + dst[1] = _mm_srli_si128(dst[1], 8); + _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]); +} + +static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, + __m128i *out1, const __m128i *clamp_lo, + const __m128i *clamp_hi) { + __m128i a0 = _mm_add_epi32(in0, in1); + __m128i a1 = _mm_sub_epi32(in0, in1); + + a0 = _mm_max_epi32(a0, *clamp_lo); + a0 = _mm_min_epi32(a0, *clamp_hi); + a1 = _mm_max_epi32(a1, *clamp_lo); + a1 = _mm_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int shift) { + __m128i offset = _mm_set1_epi32((1 << shift) >> 1); + __m128i in0_w_offset = _mm_add_epi32(*in0, offset); + __m128i in1_w_offset = _mm_add_epi32(*in1, offset); + + in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift)); + in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift)); + + in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo); + in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi); + in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo); + in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi); + + *in0 = in0_w_offset; + *in1 = in1_w_offset; +} + +static INLINE void idct32_stage4_sse4_1( + __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56, + const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40, + const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_sse4_1(cospim56, 
&bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = + half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_sse4_1( + __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48, + const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, + const __m128i *clamp_hi, const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = + half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_sse4_1( + __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i 
*clamp_lo, const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = + half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = + half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = + half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + 
bf1[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = + half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = 
half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[24] = + half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, + const int do_cols, const int bd, + const int out_shift, + const __m128i *clamp_lo, + const __m128i *clamp_hi) { + addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (int i = 0; i < 32; i += 8) { + 
round_shift_4x4(out + i, out_shift); + round_shift_4x4(out + i + 4, out_shift); + } + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, + __m128i *out0, __m128i *out1, + const __m128i *clamp_lo, const __m128i *clamp_hi, + int shift) { + __m128i offset = _mm_set1_epi32((1 << shift) >> 1); + __m128i a0 = _mm_add_epi32(offset, in0); + __m128i a1 = _mm_sub_epi32(offset, in1); + + a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + a0 = _mm_max_epi32(a0, *clamp_lo); + a0 = _mm_min_epi32(a0, *clamp_hi); + a1 = _mm_max_epi32(a1, *clamp_lo); + a1 = _mm_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3, x, y; + + // Stage 0 + // Stage 1 + // Stage 2 + u0 = in[0]; + u1 = in[1]; + u2 = in[2]; + u3 = in[3]; + + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u2, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u1, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u1, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + // Stage 3 + addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi); + + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift); + shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift); + } +} + +static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *sinpi = sinpi_arr(bit); + const __m128i zero = _mm_setzero_si128(); + __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1)); + rnding = _mm_unpacklo_epi32(rnding, zero); + const __m128i mul = _mm_set1_epi32(1 << 4); + const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); + const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); + const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); + const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); + __m128i t; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i x0, x1, 
x2, x3; + __m128i u0, u1, u2, u3; + __m128i u0_low, u1_low, u2_low, u3_low; + __m128i u0_high, u1_high, u2_high, u3_high; + + x0 = in[0]; + x1 = in[1]; + x2 = in[2]; + x3 = in[3]; + + s0 = _mm_mullo_epi32(x0, sinpi1); + s1 = _mm_mullo_epi32(x0, sinpi2); + s2 = _mm_mullo_epi32(x1, sinpi3); + s3 = _mm_mullo_epi32(x2, sinpi4); + s4 = _mm_mullo_epi32(x2, sinpi1); + s5 = _mm_mullo_epi32(x3, sinpi2); + s6 = _mm_mullo_epi32(x3, sinpi4); + t = _mm_sub_epi32(x0, x2); + s7 = _mm_add_epi32(t, x3); + + t = _mm_add_epi32(s0, s3); + s0 = _mm_add_epi32(t, s5); + t = _mm_sub_epi32(s1, s4); + s1 = _mm_sub_epi32(t, s6); + s3 = s2; + s2 = _mm_mullo_epi32(s7, sinpi3); + + u0 = _mm_add_epi32(s0, s3); + u1 = _mm_add_epi32(s1, s3); + u2 = s2; + t = _mm_add_epi32(s0, s1); + u3 = _mm_sub_epi32(t, s3); + + // u0 + u0_low = _mm_mul_epi32(u0, mul); + u0_low = _mm_add_epi64(u0_low, rnding); + + u0 = _mm_srli_si128(u0, 4); + u0_high = _mm_mul_epi32(u0, mul); + u0_high = _mm_add_epi64(u0_high, rnding); + + u0_low = _mm_srli_si128(u0_low, 2); + u0_high = _mm_srli_si128(u0_high, 2); + + u0 = _mm_unpacklo_epi32(u0_low, u0_high); + u0_high = _mm_unpackhi_epi32(u0_low, u0_high); + u0 = _mm_unpacklo_epi64(u0, u0_high); + + // u1 + u1_low = _mm_mul_epi32(u1, mul); + u1_low = _mm_add_epi64(u1_low, rnding); + + u1 = _mm_srli_si128(u1, 4); + u1_high = _mm_mul_epi32(u1, mul); + u1_high = _mm_add_epi64(u1_high, rnding); + + u1_low = _mm_srli_si128(u1_low, 2); + u1_high = _mm_srli_si128(u1_high, 2); + + u1 = _mm_unpacklo_epi32(u1_low, u1_high); + u1_high = _mm_unpackhi_epi32(u1_low, u1_high); + u1 = _mm_unpacklo_epi64(u1, u1_high); + + // u2 + u2_low = _mm_mul_epi32(u2, mul); + u2_low = _mm_add_epi64(u2_low, rnding); + + u2 = _mm_srli_si128(u2, 4); + u2_high = _mm_mul_epi32(u2, mul); + u2_high = _mm_add_epi64(u2_high, rnding); + + u2_low = _mm_srli_si128(u2_low, 2); + u2_high = _mm_srli_si128(u2_high, 2); + + u2 = _mm_unpacklo_epi32(u2_low, u2_high); + u2_high = _mm_unpackhi_epi32(u2_low, u2_high); + u2 = 
_mm_unpacklo_epi64(u2, u2_high); + + // u3 + u3_low = _mm_mul_epi32(u3, mul); + u3_low = _mm_add_epi64(u3_low, rnding); + + u3 = _mm_srli_si128(u3, 4); + u3_high = _mm_mul_epi32(u3, mul); + u3_high = _mm_add_epi64(u3_high, rnding); + + u3_low = _mm_srli_si128(u3_low, 2); + u3_high = _mm_srli_si128(u3_high, 2); + + u3 = _mm_unpacklo_epi32(u3_low, u3_high); + u3_high = _mm_unpackhi_epi32(u3_low, u3_high); + u3 = _mm_unpacklo_epi64(u3, u3_high); + + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); + } +} + +static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + + round_shift_4x4(in, shift); + + v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride)); + v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride)); + v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); + v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride)); + + v0 = _mm_unpacklo_epi16(v0, zero); + v1 = _mm_unpacklo_epi16(v1, zero); + v2 = _mm_unpacklo_epi16(v2, zero); + v3 = _mm_unpacklo_epi16(v3, zero); + + if (fliplr) { + in[0] = _mm_shuffle_epi32(in[0], 0x1B); + in[1] = _mm_shuffle_epi32(in[1], 0x1B); + in[2] = _mm_shuffle_epi32(in[2], 0x1B); + in[3] = _mm_shuffle_epi32(in[3], 0x1B); + } + + if (flipud) { + u0 = _mm_add_epi32(in[3], v0); + u1 = _mm_add_epi32(in[2], v1); + u2 = _mm_add_epi32(in[1], v2); + u3 = _mm_add_epi32(in[0], v3); + } else { + u0 = _mm_add_epi32(in[0], v0); + u1 = _mm_add_epi32(in[1], v1); + u2 = _mm_add_epi32(in[2], v2); + u3 = _mm_add_epi32(in[3], v3); + } + + v0 = _mm_packus_epi32(u0, u1); + 
v2 = _mm_packus_epi32(u2, u3); + + u0 = highbd_clamp_epi16(v0, bd); + u2 = highbd_clamp_epi16(v2, bd); + + v0 = _mm_unpacklo_epi64(u0, u0); + v1 = _mm_unpackhi_epi64(u0, u0); + v2 = _mm_unpacklo_epi64(u2, u2); + v3 = _mm_unpackhi_epi64(u2, u2); + + _mm_storel_epi64((__m128i *)(output + 0 * stride), v0); + _mm_storel_epi64((__m128i *)(output + 1 * stride), v1); + _mm_storel_epi64((__m128i *)(output + 2 * stride), v2); + _mm_storel_epi64((__m128i *)(output + 3 * stride), v3); +} + +static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + __m128i zero = _mm_setzero_si128(); + __m128i fact = _mm_set1_epi32(NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a1_low; + __m128i a0_high, a1_high; + + offset = _mm_unpacklo_epi32(offset, zero); + + for (int i = 0; i < 4; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); + a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = _mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); + } +} +void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[4]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + 
idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, 
in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case IDTX: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_DCT: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_ADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_FLIPADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case H_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + default: assert(0); + } +} + +// 8x8 +static void load_buffer_8x8(const int32_t *coeff, __m128i *in) { + in[0] = 
_mm_load_si128((const __m128i *)(coeff + 0)); + in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); + in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); + in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); + in[4] = _mm_load_si128((const __m128i *)(coeff + 16)); + in[5] = _mm_load_si128((const __m128i *)(coeff + 20)); + in[6] = _mm_load_si128((const __m128i *)(coeff + 24)); + in[7] = _mm_load_si128((const __m128i *)(coeff + 28)); + in[8] = _mm_load_si128((const __m128i *)(coeff + 32)); + in[9] = _mm_load_si128((const __m128i *)(coeff + 36)); + in[10] = _mm_load_si128((const __m128i *)(coeff + 40)); + in[11] = _mm_load_si128((const __m128i *)(coeff + 44)); + in[12] = _mm_load_si128((const __m128i *)(coeff + 48)); + in[13] = _mm_load_si128((const __m128i *)(coeff + 52)); + in[14] = _mm_load_si128((const __m128i *)(coeff + 56)); + in[15] = _mm_load_si128((const __m128i *)(coeff + 60)); +} + +static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; + + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). + for (col = 0; col < 2; ++col) { + // stage 0 + // stage 1 + // stage 2 + u0 = in[0 * 2 + col]; + u1 = in[4 * 2 + col]; + u2 = in[2 * 2 + col]; + u3 = in[6 * 2 + col]; + + x = _mm_mullo_epi32(in[1 * 2 + col], cospi56); + y = _mm_mullo_epi32(in[7 * 2 + col], cospim8); + u4 = _mm_add_epi32(x, y); + u4 = _mm_add_epi32(u4, rnding); + u4 = _mm_srai_epi32(u4, bit); + + x = _mm_mullo_epi32(in[1 * 2 + col], cospi8); + y = _mm_mullo_epi32(in[7 * 2 + col], cospi56); + u7 = _mm_add_epi32(x, y); + u7 = _mm_add_epi32(u7, rnding); + u7 = _mm_srai_epi32(u7, bit); + + x = _mm_mullo_epi32(in[5 * 2 + col], cospi24); + y = _mm_mullo_epi32(in[3 * 2 + col], cospim40); + u5 = _mm_add_epi32(x, y); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + x = _mm_mullo_epi32(in[5 * 2 + col], cospi40); + y = _mm_mullo_epi32(in[3 * 2 + col], cospi24); + u6 = _mm_add_epi32(x, y); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + // stage 3 + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u1, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + 
+ addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = _mm_mullo_epi32(v5, cospi32); + y = _mm_mullo_epi32(v6, cospi32); + u6 = _mm_add_epi32(y, x); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + u5 = _mm_sub_epi32(y, x); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + // stage 5 + addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = 
_mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[8], v[8], x; + + // Even 8 points: 0, 2, ..., 14 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[14], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[14], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[10], cospi20); + x = _mm_mullo_epi32(in[4], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[10], cospi44); + x = _mm_mullo_epi32(in[4], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[6], cospi36); + x = _mm_mullo_epi32(in[8], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[6], cospi28); + x = _mm_mullo_epi32(in[8], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[2], cospi52); + x = _mm_mullo_epi32(in[12], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[2], cospi12); + x = _mm_mullo_epi32(in[12], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 
3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); 
+ u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[2] = _mm_sub_epi32(kZero, u[4]); + out[4] = u[6]; + out[6] = _mm_sub_epi32(kZero, u[2]); + out[8] = u[3]; + out[10] = _mm_sub_epi32(kZero, u[7]); + out[12] = u[5]; + out[14] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + + // Odd 8 points: 1, 3, ..., 15 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[15], cospi4); + x = _mm_mullo_epi32(in[1], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[15], cospi60); + x = _mm_mullo_epi32(in[1], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[11], cospi20); + x = _mm_mullo_epi32(in[5], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[11], cospi44); + x = _mm_mullo_epi32(in[5], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[7], cospi36); + x = _mm_mullo_epi32(in[9], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[7], cospi28); + x = 
_mm_mullo_epi32(in[9], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[3], cospi52); + x = _mm_mullo_epi32(in[13], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[3], cospi12); + x = _mm_mullo_epi32(in[13], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = 
_mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[1] = u[0]; + out[3] = _mm_sub_epi32(kZero, u[4]); + out[5] = u[6]; + out[7] = _mm_sub_epi32(kZero, u[2]); + out[9] = u[3]; + out[11] = _mm_sub_epi32(kZero, u[7]); + out[13] = u[5]; + out[15] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + out[0] = _mm_add_epi32(in[0], in[0]); + out[1] = _mm_add_epi32(in[1], in[1]); + out[2] = _mm_add_epi32(in[2], in[2]); + out[3] = _mm_add_epi32(in[3], in[3]); + out[4] = _mm_add_epi32(in[4], in[4]); + out[5] = _mm_add_epi32(in[5], in[5]); + out[6] = _mm_add_epi32(in[6], in[6]); + out[7] = _mm_add_epi32(in[7], in[7]); + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << 
(log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8); + } +} + +static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi, + int fliplr, int bd) { + __m128i x0, x1; + const __m128i zero = _mm_setzero_si128(); + + x0 = _mm_unpacklo_epi16(pred, zero); + x1 = _mm_unpackhi_epi16(pred, zero); + + if (fliplr) { + res_lo = _mm_shuffle_epi32(res_lo, 0x1B); + res_hi = _mm_shuffle_epi32(res_hi, 0x1B); + x0 = _mm_add_epi32(res_hi, x0); + x1 = _mm_add_epi32(res_lo, x1); + + } else { + x0 = _mm_add_epi32(res_lo, x0); + x1 = _mm_add_epi32(res_hi, x1); + } + + x0 = _mm_packus_epi32(x0, x1); + return highbd_clamp_epi16(x0, bd); +} + +static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + + round_shift_8x8(in, shift); + + v0 = _mm_load_si128((__m128i const *)(output + 0 * stride)); + v1 = _mm_load_si128((__m128i const *)(output + 1 * stride)); + v2 = _mm_load_si128((__m128i const *)(output + 2 * stride)); + v3 = _mm_load_si128((__m128i const *)(output + 3 * stride)); + v4 = _mm_load_si128((__m128i const *)(output + 4 * stride)); + v5 = _mm_load_si128((__m128i const *)(output + 5 * stride)); + v6 = _mm_load_si128((__m128i const *)(output + 6 * stride)); + v7 = _mm_load_si128((__m128i const *)(output + 7 * stride)); + + if (flipud) { + u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd); + u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd); + u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd); + u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd); + u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd); + u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd); + u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd); + u7 = get_recon_8x8(v7, in[0], in[1], fliplr, 
bd); + } else { + u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd); + u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd); + u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd); + u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd); + u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd); + u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd); + u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd); + u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd); + } + + _mm_store_si128((__m128i *)(output + 0 * stride), u0); + _mm_store_si128((__m128i *)(output + 1 * stride), u1); + _mm_store_si128((__m128i *)(output + 2 * stride), u2); + _mm_store_si128((__m128i *)(output + 3 * stride), u3); + _mm_store_si128((__m128i *)(output + 4 * stride), u4); + _mm_store_si128((__m128i *)(output + 5 * stride), u5); + _mm_store_si128((__m128i *)(output + 6 * stride), u6); + _mm_store_si128((__m128i *)(output + 7 * stride), u7); +} + +void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16], out[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_8x8(input, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + 
transpose_8x8(out, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); + break; + default: assert(0); + } +} + +static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i x; + + // stage 0 + // stage 1 + // stage 2 + // stage 3 + x = _mm_mullo_epi32(in[0], cospi32); + x = _mm_add_epi32(x, rnding); + x = _mm_srai_epi32(x, bit); + + // stage 4 + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; +} + +static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = _mm_mullo_epi32(in[1], cospi56); + y = _mm_mullo_epi32(in[7], cospim8); + u4 = _mm_add_epi32(x, y); + u4 = _mm_add_epi32(u4, rnding); + u4 = _mm_srai_epi32(u4, bit); + + x = _mm_mullo_epi32(in[1], cospi8); + y = _mm_mullo_epi32(in[7], cospi56); + u7 = _mm_add_epi32(x, y); + u7 = _mm_add_epi32(u7, rnding); + u7 = _mm_srai_epi32(u7, bit); + + x = _mm_mullo_epi32(in[5], cospi24); + y = _mm_mullo_epi32(in[3], cospim40); + u5 = _mm_add_epi32(x, y); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + x = _mm_mullo_epi32(in[5], cospi40); + y = _mm_mullo_epi32(in[3], cospi24); + u6 = _mm_add_epi32(x, y); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + // stage 3 + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u1, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = _mm_mullo_epi32(v5, cospi32); + y = _mm_mullo_epi32(v6, cospi32); + 
u6 = _mm_add_epi32(y, x); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + u5 = _mm_sub_epi32(y, x); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + // stage 5 + addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8); + } +} + +static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + __m128i u[8], x; + + // stage 0 + // stage 1 + // stage 2 + + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(kZero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // stage 3 + // stage 4 + __m128i temp1, temp2; + temp1 = _mm_mullo_epi32(u[0], cospi16); + x = _mm_mullo_epi32(u[1], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + u[4] = temp1; + + temp2 = _mm_mullo_epi32(u[0], cospi48); + x = 
_mm_mullo_epi32(u[1], cospi16); + u[5] = _mm_sub_epi32(temp2, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // stage 5 + // stage 6 + temp1 = _mm_mullo_epi32(u[0], cospi32); + x = _mm_mullo_epi32(u[1], cospi32); + u[2] = _mm_add_epi32(temp1, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(temp1, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + temp1 = _mm_mullo_epi32(u[4], cospi32); + x = _mm_mullo_epi32(u[5], cospi32); + u[6] = _mm_add_epi32(temp1, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(temp1, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i 
cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[8], v[8], x; + + // stage 0 + // stage 1 + // stage 2 + + u[0] = _mm_mullo_epi32(in[7], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[7], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[5], cospi20); + x = _mm_mullo_epi32(in[2], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[5], cospi44); + x = _mm_mullo_epi32(in[2], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[3], cospi36); + x = _mm_mullo_epi32(in[4], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[3], cospi28); + x = _mm_mullo_epi32(in[4], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[1], cospi52); + x = 
_mm_mullo_epi32(in[6], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[1], cospi12); + x = _mm_mullo_epi32(in[6], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = 
_mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + in[0] = _mm_mullo_epi32(in[0], cospi32); + in[0] = _mm_add_epi32(in[0], rnding); + in[0] = _mm_srai_epi32(in[0], bit); + + // stage 5 + // stage 6 + // stage 7 + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + in[0] = _mm_add_epi32(in[0], offset); + in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); + } + } + + in[0] = _mm_max_epi32(in[0], clamp_lo); + in[0] = _mm_min_epi32(in[0], clamp_hi); + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; +} + +static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + 
const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], x, y; + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + + u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); + + u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); + + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + + // stage 3 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); + + addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + u[1] = u[0]; + + u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &u[7], &u[6], 
&clamp_lo, &clamp_hi); + + x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; + + // stage 5 + addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[5], cospi32); + y = _mm_mullo_epi32(u[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + u[10] = _mm_sub_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[13] = _mm_add_epi32(x, y); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + u[12] = _mm_add_epi32(x, y); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + // stage 7 + addsub_sse4_1(u[0], u[15], out + 0, out + 15, 
&clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i v[16], x, y, temp1, temp2; + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(x, rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(zero, x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + // stage 3 + v[8] = v[0]; + v[9] = v[1]; + + // stage 4 + temp1 = _mm_mullo_epi32(v[8], cospi8); + x = _mm_mullo_epi32(v[9], cospi56); + temp1 = _mm_add_epi32(temp1, x); + temp1 = 
_mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[8], cospi56); + x = _mm_mullo_epi32(v[9], cospi8); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + temp1 = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm_mullo_epi32(v[12], cospi16); + x = _mm_mullo_epi32(v[13], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[12], cospi48); + x = _mm_mullo_epi32(v[13], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; + + // stage 7 + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; + + // stage 8 + y = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + y = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + y = _mm_mullo_epi32(v[10], 
cospi32); + x = _mm_mullo_epi32(v[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + y = _mm_mullo_epi32(v[14], cospi32); + x = _mm_mullo_epi32(v[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + 
&clamp_hi_out, out_shift); + } +} + +static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i zero = _mm_setzero_si128(); + __m128i u[16], x, y; + + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + x = _mm_mullo_epi32(in[0], cospi2); + u[1] = _mm_sub_epi32(zero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + x = _mm_mullo_epi32(in[2], cospi54); + u[2] = _mm_add_epi32(x, rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + x = _mm_mullo_epi32(in[2], cospi10); + u[3] = _mm_sub_epi32(zero, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + x = _mm_mullo_epi32(in[4], cospi46); + u[4] = _mm_add_epi32(x, rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(in[4], cospi18); + u[5] = _mm_sub_epi32(zero, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(in[6], cospi38); + u[6] = _mm_add_epi32(x, rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + x = _mm_mullo_epi32(in[6], cospi26); + u[7] = _mm_sub_epi32(zero, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + u[8] = _mm_mullo_epi32(in[7], cospi34); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); + + u[9] = _mm_mullo_epi32(in[7], cospi30); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); + + u[10] = _mm_mullo_epi32(in[5], cospi42); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_mullo_epi32(in[5], cospi22); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + u[12] = _mm_mullo_epi32(in[3], cospi50); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + u[13] = _mm_mullo_epi32(in[3], cospi14); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = 
_mm_srai_epi32(u[13], bit); + + u[14] = _mm_mullo_epi32(in[1], cospi58); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + u[15] = _mm_mullo_epi32(in[1], cospi6); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 3 + addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + y = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi56); + u[8] = _mm_mullo_epi32(u[8], cospi8); + u[8] = _mm_add_epi32(u[8], x); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); + + x = _mm_mullo_epi32(u[9], cospi8); + u[9] = _mm_sub_epi32(y, x); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); + + x = _mm_mullo_epi32(u[11], cospi24); + y = _mm_mullo_epi32(u[10], cospi24); + u[10] = _mm_mullo_epi32(u[10], cospi40); + u[10] = _mm_add_epi32(u[10], x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + x = _mm_mullo_epi32(u[11], cospi40); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + x = _mm_mullo_epi32(u[13], cospi8); + y = _mm_mullo_epi32(u[12], cospi8); + u[12] = _mm_mullo_epi32(u[12], cospim56); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospim56); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = 
_mm_mullo_epi32(u[15], cospi40); + y = _mm_mullo_epi32(u[14], cospi40); + u[14] = _mm_mullo_epi32(u[14], cospim24); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim24); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 5 + addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + x = _mm_mullo_epi32(u[5], cospi48); + y = _mm_mullo_epi32(u[4], cospi48); + u[4] = _mm_mullo_epi32(u[4], cospi16); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(u[5], cospi16); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(u[7], cospi16); + y = _mm_mullo_epi32(u[6], cospi16); + u[6] = _mm_mullo_epi32(u[6], cospim48); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + x = _mm_mullo_epi32(u[7], cospim48); + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + x = _mm_mullo_epi32(u[13], cospi48); + y = _mm_mullo_epi32(u[12], cospi48); + u[12] = _mm_mullo_epi32(u[12], cospi16); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospi16); + u[13] = _mm_sub_epi32(y, x); + u[13] = 
_mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi16); + y = _mm_mullo_epi32(u[14], cospi16); + u[14] = _mm_mullo_epi32(u[14], cospim48); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim48); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 7 + addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + u[2] = _mm_add_epi32(y, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(y, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + u[10] = _mm_add_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + u[14] = _mm_add_epi32(y, 
x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(zero, u[8]); + out[2] = u[12]; + out[3] = _mm_sub_epi32(zero, u[4]); + out[4] = u[6]; + out[5] = _mm_sub_epi32(zero, u[14]); + out[6] = u[10]; + out[7] = _mm_sub_epi32(zero, u[2]); + out[8] = u[3]; + out[9] = _mm_sub_epi32(zero, u[11]); + out[10] = u[15]; + out[11] = _mm_sub_epi32(zero, u[7]); + out[12] = u[5]; + out[13] = _mm_sub_epi32(zero, u[13]); + out[14] = u[9]; + out[15] = _mm_sub_epi32(zero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospim36 = 
_mm_set1_epi32(-cospi[36]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); + addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + y = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(x, y); + v[0] = 
_mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(x, y); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + v[15] = u[15]; + + // stage 5 + addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = _mm_mullo_epi32(v[5], cospi32); + y = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_sub_epi32(y, x); + v[10] = 
_mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_add_epi32(x, y); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_add_epi32(x, y); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } + } +} + +static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = 
_mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + const __m128i zero = _mm_setzero_si128(); + __m128i u[16], v[16], x, y; + // Calculate the column 0, 1, 2, 3 + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm_mullo_epi32(in[15], cospi2); + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(v[0], x); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_mullo_epi32(in[15], cospi62); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(v[1], x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(in[13], cospi10); + x = _mm_mullo_epi32(in[2], cospi54); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(in[13], cospi54); + x = _mm_mullo_epi32(in[2], cospi10); + v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_mullo_epi32(in[11], cospi18); + x = _mm_mullo_epi32(in[4], cospi46); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(in[11], cospi46); + x = _mm_mullo_epi32(in[4], cospi18); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(in[9], cospi26); + x = _mm_mullo_epi32(in[6], cospi38); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(in[9], cospi38); + x = _mm_mullo_epi32(in[6], cospi26); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = _mm_mullo_epi32(in[7], cospi34); + x = _mm_mullo_epi32(in[8], cospi30); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = 
_mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(in[7], cospi30); + x = _mm_mullo_epi32(in[8], cospi34); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(in[5], cospi42); + x = _mm_mullo_epi32(in[10], cospi22); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(in[5], cospi22); + x = _mm_mullo_epi32(in[10], cospi42); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(in[3], cospi50); + x = _mm_mullo_epi32(in[12], cospi14); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(in[3], cospi14); + x = _mm_mullo_epi32(in[12], cospi50); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(in[1], cospi58); + x = _mm_mullo_epi32(in[14], cospi6); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(in[1], cospi6); + x = _mm_mullo_epi32(in[14], cospi58); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + 
v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi8); + x = _mm_mullo_epi32(u[9], cospi56); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi8); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi40); + x = _mm_mullo_epi32(u[11], cospi24); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(u[10], cospi24); + x = _mm_mullo_epi32(u[11], cospi40); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[12], cospim56); + x = _mm_mullo_epi32(u[13], cospi8); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi8); + x = _mm_mullo_epi32(u[13], cospim56); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim24); + x = _mm_mullo_epi32(u[15], cospi40); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi40); + x = _mm_mullo_epi32(u[15], cospim24); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 5 + addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + 
addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm_mullo_epi32(u[4], cospi16); + x = _mm_mullo_epi32(u[5], cospi48); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(u[4], cospi48); + x = _mm_mullo_epi32(u[5], cospi16); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(u[6], cospim48); + x = _mm_mullo_epi32(u[7], cospi16); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(u[6], cospi16); + x = _mm_mullo_epi32(u[7], cospim48); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm_mullo_epi32(u[12], cospi16); + x = _mm_mullo_epi32(u[13], cospi48); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi48); + x = _mm_mullo_epi32(u[13], cospi16); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim48); + x = _mm_mullo_epi32(u[15], cospi16); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi16); + x = _mm_mullo_epi32(u[15], cospim48); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 7 + addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[3], 
&u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = 
_mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} +static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + __m128i fact = _mm_set1_epi32(2 * NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a0_high, a1_low, a1_high; + __m128i zero = _mm_setzero_si128(); + offset = _mm_unpacklo_epi32(offset, zero); + + for (int i = 0; i < 16; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); + a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = 
_mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16); + } +} +static INLINE void idct64_stage8_sse4_1( + __m128i *u, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit); + u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit); + u[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit); + u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, + clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit); + temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit); + temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit); + temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit); + u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit); + u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit); + u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit); + u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, 
bit); + temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit); + u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit); + u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit); + u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit); + u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit); + u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, 
+ const __m128i *rnding, int bit) { + __m128i temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit); + u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; + + temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit); + u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit); + u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit); + u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit); + u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, + int bd, int out_shift, + const __m128i *clamp_lo, + const __m128i *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << 
(log_range_out - 1)) - 1); + + for (int i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out, + 4); + } + } +} + +static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + + { + __m128i x; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit); + + // stage 8 + // stage 9 + // stage 10 + // stage 11 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + } + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; + out[8] = x; + out[9] = x; + out[10] = x; + out[11] = x; + out[12] = x; + out[13] = x; + out[14] = x; + out[15] = x; + out[16] = x; + out[17] = x; + out[18] = x; + out[19] = x; + out[20] = x; + out[21] = x; + out[22] = x; + out[23] = x; + out[24] = x; + out[25] = x; + out[26] = x; + out[27] = x; + out[28] = x; + out[29] = x; + out[30] = x; + out[31] = x; + out[32] = x; + out[33] = x; + out[34] = x; + out[35] = x; + out[36] = x; + out[37] = x; + out[38] = x; + out[39] = x; + out[40] = x; + out[41] = x; + out[42] = x; + out[43] = x; + out[44] = x; + out[45] = x; + out[46] = x; + out[47] = 
x; + out[48] = x; + out[49] = x; + out[50] = x; + out[51] = x; + out[52] = x; + out[53] = x; + out[54] = x; + out[55] = x; + out[56] = x; + out[57] = x; + out[58] = x; + out[59] = x; + out[60] = x; + out[61] = x; + out[62] = x; + out[63] = x; + } +} + +static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); 
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + + { + __m128i u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + u[62] = u[63]; + + // stage 4 + __m128i temp1, temp2; + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + 
temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = temp1; + + temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = temp2; + + temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = temp1; + + temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = temp1; + + temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + + // stage 6 + temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[0] = temp1; + + temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = temp1; + temp2 
= half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[35] = temp2; + temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[36] = temp1; + temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[37] = temp2; + temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = temp1; + temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[43] = temp2; + temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[44] = temp1; + temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = temp1; + temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[19] = temp2; + temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[20] = temp1; + temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[21] = 
temp2;
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                      &clamp_hi);
+      }
+    }
+
+    // stage 8
+    u[7] = u[0];
+    u[6] = u[1];
+    u[5] = u[2];
+    u[4] = u[3];
+
+    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+    // stage 9
+    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                         bit);
+
+    // stage 10
+    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                          bit);
+
+    // stage 11
+    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+  }
+}
+
+// 64-point inverse DCT variant that reads only in[0]..in[15]; stage 1 loads
+// no other input vector. Every element is a __m128i, and all constants are
+// broadcast with _mm_set1_epi32, so the four 32-bit lanes run through the
+// same butterfly network.
+//
+//   in        - input vectors; only indices 0..15 are read.
+//   out       - handed to idct64_stage11_sse4_1, which produces the output
+//               (helper defined outside this function).
+//   bit       - selects the cosine table via cospi_arr(bit) and the rounding
+//               term 1 << (bit - 1).
+//   do_cols   - nonzero: intermediate clamp range is max(16, bd + 6) bits;
+//               zero: max(16, bd + 8) bits. Also forwarded to stage 11.
+//   bd        - bit depth; used here only to size the clamp range.
+//   out_shift - forwarded unchanged to idct64_stage11_sse4_1.
+//
+// Stages 1-7 and the first part of stage 8 are done in place on u[]; the
+// rest of stage 8 and stages 9-11 are delegated to the idct64_stage*_sse4_1
+// helpers. Statement order matters: butterflies that overwrite one of their
+// own inputs park the first result in tmp1..tmp4 until both outputs are
+// computed.
+static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+                                   int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+  {
+    __m128i u[64];
+    __m128i tmp1, tmp2, tmp3, tmp4;
+    // stage 1
+    // Scatter the 16 inputs into their butterfly slots; the remaining u[]
+    // entries are filled by later stages before they are read.
+    u[0] = in[0];
+    u[32] = in[1];
+    u[36] = in[9];
+    u[40] = in[5];
+    u[44] = in[13];
+    u[48] = in[3];
+    u[52] = in[11];
+    u[56] = in[7];
+    u[60] = in[15];
+    u[16] = in[2];
+    u[20] = in[10];
+    u[24] = in[6];
+    u[28] = in[14];
+    u[4] = in[8];
+    u[8] = in[4];
+    u[12] = in[12];
+
+    // stage 2
+    // Single-operand butterflies: each loaded value yields two outputs. The
+    // second output of each pair overwrites the input, so it is written last.
+    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+    u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+    u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+    u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+    u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+    u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+    u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+    u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+    u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+
+    // stage 3
+    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+    u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
+    u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
+    u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
+    u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
+    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+    // Plain copies into the slots that stage 2 left unwritten (presumably
+    // the add/sub pairs of the full transform with an absent partner term).
+    u[33] = u[32];
+    u[34] = u[35];
+    u[37] = u[36];
+    u[38] = u[39];
+    u[41] = u[40];
+    u[42] = u[43];
+    u[45] = u[44];
+    u[46] = u[47];
+    u[49] = u[48];
+    u[50] = u[51];
+    u[53] = u[52];
+    u[54] = u[55];
+    u[57] = u[56];
+    u[58] = u[59];
+    u[61] = u[60];
+    u[62] = u[63];
+
+    // stage 4
+    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+    u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+    u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+    u[17] = u[16];
+    u[18] = u[19];
+    u[21] = u[20];
+    u[22] = u[23];
+    u[25] = u[24];
+    u[26] = u[27];
+    u[29] = u[28];
+    u[30] = u[31];
+
+    // tmp1..tmp4 hold the low-index outputs until the high-index outputs,
+    // which read the same inputs, have been computed.
+    tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+    u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+    u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+    u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+    u[33] = tmp1;
+    u[34] = tmp2;
+    u[37] = tmp3;
+    u[38] = tmp4;
+
+    tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+    u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+    u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+    u[41] = tmp1;
+    u[42] = tmp2;
+    u[45] = tmp3;
+    u[46] = tmp4;
+
+    // stage 5
+    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+
+    u[9] = u[8];
+    u[10] = u[11];
+    u[13] = u[12];
+    u[14] = u[15];
+
+    tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+    u[17] = tmp1;
+    u[18] = tmp2;
+    u[21] = tmp3;
+    u[22] = tmp4;
+
+    for (i = 32; i < 64; i += 8) {
+      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    // stage 6
+    // u[0] and u[1] receive the same value. The original evaluated the
+    // identical half_btf_0_sse4_1 call twice via a temporary; it is now
+    // computed once and copied (assumes the helper is a pure function of its
+    // arguments, as its use throughout this file suggests).
+    u[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+    u[1] = u[0];
+    u[5] = u[4];
+    u[6] = u[7];
+
+    tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    u[9] = tmp1;
+    tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    u[10] = tmp2;
+
+    for (i = 16; i < 32; i += 8) {
+      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+                    &clamp_hi);
+
+      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+                    &clamp_hi);
+      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+    u[34] = tmp1;
+    u[35] = tmp2;
+    u[36] = tmp3;
+    u[37] = tmp4;
+
+    tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+    u[42] = tmp1;
+    u[43] = tmp2;
+    u[44] = tmp3;
+    u[45] = tmp4;
+
+    // stage 7
+    u[3] = u[0];
+    u[2] = u[1];
+    tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+    u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+    u[5] = tmp1;
+    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+    tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+    tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+    tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+    u[18] = tmp1;
+    u[19] = tmp2;
+    u[20] = tmp3;
+    u[21] = tmp4;
+
+    // XOR indexing pairs j with its mirror inside each group of 8
+    // (j ^ 7) and the matching pair in the upper half of the group of 16
+    // (j ^ 15 with j ^ 8).
+    for (i = 32; i < 64; i += 16) {
+      for (j = i; j < i + 4; j++) {
+        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+                      &clamp_hi);
+      }
+    }
+
+    // stage 8
+    for (i = 0; i < 4; ++i) {
+      addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+    }
+
+    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+    // stage 9
+    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                         bit);
+
+    // stage 10
+    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+                          bit);
+
+    // stage 11
+    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
+  }
+}
+
+static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi23
= _mm_set1_epi32(cospi[23]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi25 = _mm_set1_epi32(cospi[25]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi27 = _mm_set1_epi32(cospi[27]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi29 = _mm_set1_epi32(cospi[29]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi31 = _mm_set1_epi32(cospi[31]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi35 = _mm_set1_epi32(cospi[35]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi39 = _mm_set1_epi32(cospi[39]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi43 = _mm_set1_epi32(cospi[43]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi47 = _mm_set1_epi32(cospi[47]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim33 = 
_mm_set1_epi32(-cospi[33]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim37 = _mm_set1_epi32(-cospi[37]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim41 = _mm_set1_epi32(-cospi[41]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim45 = _mm_set1_epi32(-cospi[45]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + { + __m128i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_sse4_1(&cospim41, 
&u[58], &rnding, bit); + v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + + // stage 3 + u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit); + u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit); + u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit); + u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit); + 
u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit); + u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit); + u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit); + u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit); + u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit); + u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit); + u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit); + u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit); + + for (i = 32; i < 64; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); + v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); + v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + + for (i = 16; i < 32; i += 4) { + addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + 
v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], 
&rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = 
half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = 
half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = 
half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); + v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = 
half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+    v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+    v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+    v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+    v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+    v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+    v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+    v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+    v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+    v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+    v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+    v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+    v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+    v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+
+    for (i = 56; i < 64; i++) v[i] = u[i];
+
+    // stage 11
+    for (i = 0; i < 32; i++) {
+      addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
+                    &clamp_hi);
+    }
+
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m128i clamp_hi_out =
+          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+      for (i = 0; i < 64; i += 4) {
+        round_shift_4x4(out + i, out_shift);
+        highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out,
+                                  &clamp_hi_out, 4);
+      }
+    }
+  }
+}
+
+// Fast path that reads only in[0]: the value is scaled once by cospi[32],
+// optionally round-shifted, clamped, and the same vector is stored to
+// out[0]..out[31].
+//
+//   in        - input vectors; only in[0] is read.
+//   out       - receives 32 copies of the result vector.
+//   bit       - selects the cosine table via cospi_arr(bit) and the rounding
+//               term 1 << (bit - 1).
+//   do_cols   - when zero, the result is round-shifted right by out_shift
+//               before clamping; when nonzero, no shift is applied.
+//   bd        - bit depth; the clamp range is max(16, bd + 6) bits.
+//   out_shift - arithmetic right-shift amount for the do_cols == 0 path;
+//               a value of 0 skips the shift.
+//
+// The original computed a bd + 8 clamp range for the do_cols == 0 path and
+// then replaced it with the bd + 6 range before first use, and clamped twice
+// with the same bounds when do_cols was set. Both paths therefore always
+// ended with one effective clamp to max(16, bd + 6) bits, which is what is
+// done here.
+static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + 6);
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i bf1;
+  int i;
+
+  // stage 0
+  // stage 1
+  bf1 = in[0];
+
+  // stage 2
+  // stage 3
+  // stage 4
+  // stage 5
+  bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);
+
+  // stage 6
+  // stage 7
+  // stage 8
+  // stage 9
+  if (!do_cols && out_shift != 0) {
+    // Round to nearest: add half of the shift step, then shift arithmetically.
+    const __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+    bf1 = _mm_add_epi32(bf1, offset);
+    bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
+  }
+
+  bf1 = _mm_max_epi32(bf1, clamp_lo);
+  bf1 = _mm_min_epi32(bf1, clamp_hi);
+
+  for (i = 0; i < 32; ++i) {
+    out[i] = bf1;
+  }
+}
+
+static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i cospim50 =
_mm_set1_epi32(-cospi[50]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; + + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 : + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + bf1[3] = bf1[0]; + bf1[2] 
= bf1[1]; + + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const 
__m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; + + // stage 0 + // stage 1 + + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, 
bit); + bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + + addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + // stage 4 + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, 
&clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit); + + addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const 
__m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 
= _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32], bf0[32]; + + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; + + // stage 2 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = bf1[4]; + bf0[5] = bf1[5]; + bf0[6] = bf1[6]; + bf0[7] = bf1[7]; + bf0[8] = bf1[8]; + bf0[9] = bf1[9]; + bf0[10] = bf1[10]; + bf0[11] = bf1[11]; + bf0[12] = bf1[12]; + bf0[13] = bf1[13]; + bf0[14] = bf1[14]; + bf0[15] = bf1[15]; + bf0[16] = + half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); + bf0[17] = + half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, 
&bf1[25], &rounding, bit); + bf0[23] = + half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); + + // stage 3 + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = + half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); + + addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + 
addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = + half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); + bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = + half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = + half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = + half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = + half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); + bf0[31] = bf1[31]; + + // stage 5 + bf1[0] = + half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, 
bit); + bf1[1] = + half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); + bf1[2] = + half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); + bf1[3] = + half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); + addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = + half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = + half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); + bf0[4] = bf1[4]; + bf0[5] = + half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[7] = bf1[7]; + addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], 
bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = + half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); + bf0[22] = bf1[22]; + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = bf1[25]; + bf0[26] = + half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 7 + addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = + half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], 
bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); + + // stage 8 + addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = bf1[18]; + bf0[19] = bf1[19]; + bf0[20] = + half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[28] = bf1[28]; + bf0[29] = bf1[29]; + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 9 + addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, 
&clamp_hi); + addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + case IDTX: + case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + default: + 
av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + } +} +void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + int eob = txfm_param->eob; + int bd = txfm_param->bd; + int lossless = txfm_param->lossless; + const int32_t *src = cast_to_int32(input); + const TX_TYPE tx_type = txfm_param->tx_type; + if (lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); + return; + } + av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} +static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + for (int i = 0; i < 32; i += 16) { + out[i] = _mm_slli_epi32(in[i], 2); + out[i + 1] = _mm_slli_epi32(in[i + 1], 2); + out[i + 2] = _mm_slli_epi32(in[i + 2], 2); + out[i + 3] = _mm_slli_epi32(in[i + 3], 2); + out[i + 4] = _mm_slli_epi32(in[i + 4], 2); + out[i + 5] = _mm_slli_epi32(in[i + 5], 2); + out[i + 6] = _mm_slli_epi32(in[i + 6], 2); + out[i + 7] = _mm_slli_epi32(in[i + 7], 2); + out[i + 8] = _mm_slli_epi32(in[i + 8], 2); + out[i + 9] = _mm_slli_epi32(in[i + 9], 2); + out[i + 10] = _mm_slli_epi32(in[i + 10], 2); + out[i + 11] = _mm_slli_epi32(in[i + 11], 2); + out[i + 12] = _mm_slli_epi32(in[i + 12], 2); + out[i + 13] = _mm_slli_epi32(in[i + 13], 2); + out[i + 14] = _mm_slli_epi32(in[i + 14], 2); + out[i + 15] = _mm_slli_epi32(in[i + 15], 2); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} +static const transform_1d_sse4_1 + 
highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { idct4x4_sse4_1, NULL, NULL, NULL }, + { iadst4x4_sse4_1, NULL, NULL, NULL }, + { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL }, + }, + { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL }, + { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL }, + { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } }, + { + { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1, + NULL }, + { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1, + NULL }, + { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL }, + }, + { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1, + idct32x32_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { iidentity32_sse4_1, NULL, NULL, NULL } }, + { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1, + idct64x64_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; +static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = buf_size_w >> 2; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + 
highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (buf_size_h_div8 << 1); ++i) { + __m128i buf0[16]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w); + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + + for (int j = 0; j < buf_size_w_div4; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + _buf1[j * txfm_size_row + 0] = buf0_cur[0]; + _buf1[j * txfm_size_row + 1] = buf0_cur[1]; + _buf1[j * txfm_size_row + 2] = buf0_cur[2]; + _buf1[j * txfm_size_row + 3] = buf0_cur[3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } +} +static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2; + const int row_max = AOMMIN(32, txfm_size_row); + 
const int input_stride = row_max; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[16]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * 
txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} +static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[64 * 4]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int buf_size_w = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = buf_size_w >> 2; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[32]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w); + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + for (int j = 0; j < buf_size_w_div4; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + _buf1[j * txfm_size_row + 0] = buf0_cur[0]; + _buf1[j * txfm_size_row + 1] = buf0_cur[1]; + _buf1[j * txfm_size_row + 2] = buf0_cur[2]; + _buf1[j * txfm_size_row + 3] = buf0_cur[3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, 
txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, 0, txfm_size_row, + bd); + } + } +} +/* 2-D inverse transform used when neither pass is an identity: eobx/eoby from get_eobx_eoby_scan_default() select the row and column kernels and bound how much input is loaded; the row output is transposed into buf1 (4x4 tiles stored in reverse order when lr_flip is set) before the column pass, rounding by -shift[1] and the 8-wide write-out. */ static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div4 = txfm_size_col >> 2; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: row transform + for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { + __m128i buf0[64]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < 
buf_size_w_div4; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} +/* Inverse transform for 4x8 blocks (eob is unused): loads the input as two halves (buf0 and buf0 + 4), applies the NewInvSqrt2 rectangular scaling, runs row_txfm on each half, transposes each half into buf1 (source order reversed when lr_flip is set), then runs col_txfm once, rounds by -shift[1] and writes 4-wide via highbd_write_buffer_4xn_sse4_1(). */ +static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1]; + const int input_stride = AOMMIN(32, txfm_size_row); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: row transform + __m128i 
buf0[8]; + load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col); + load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col); + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]); + + if (lr_flip) { + TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6], + buf1[7]); + } else { + TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} +/* Inverse transform for 8x4 blocks (eob is unused): loads the input with a stride of 4, applies the NewInvSqrt2 rectangular scaling and row_txfm, reorders through flip_buf_sse2() into buf1 when lr_flip is set, then transposes each of the two groups of txfm_size_row vectors in place and runs col_txfm on it before rounding by -shift[1] and the 8-wide write-out. */ +static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: row transform + __m128i buf0[8]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + 
av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_col, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < 2; i++) { + __m128i *buf1_cur = buf1_ptr + i * txfm_size_row; + transpose_32bit_4x4(buf1_cur, buf1_cur); + col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + // write to buffer + highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip, + txfm_size_row, bd); +} +/* Inverse transform for 4x16 blocks (eob is unused): runs row_txfm on each group loaded from input + i * 4, transposes the groups into buf1 (source order within a group reversed when lr_flip is set), then runs col_txfm once over buf1, rounds by -shift[1] and writes 4-wide via highbd_write_buffer_4xn_sse4_1(). */ +static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_h_div8 = txfm_size_row >> 2; /* a quarter of the height, despite the _div8 name */ + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; + const int input_stride = AOMMIN(32, txfm_size_row); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: row transform + __m128i buf0[16]; + for (int i = 0; i < (txfm_size_row >> 2); i++) { + const int32_t *input_row = input + i * 4; + __m128i *buf0_cur = buf0 + i * 4; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col); + row_txfm(buf0_cur, buf0_cur, INV_COS_BIT, 0, bd, -shift[0]); + } + + if (lr_flip) { + for (int j = 0; j < buf_size_h_div8; ++j) { + 
TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], + buf1[4 * j + 3]); + } + } else { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], + buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], + buf1[4 * j + 2], buf1[4 * j + 3]); + } + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} +/* Inverse transform for 16x4 blocks (eob is unused): loads the input with a stride of 4, runs row_txfm once, reorders through flip_buf_sse2() into buf1 when lr_flip is set, then transposes each group of txfm_size_row vectors in place and runs col_txfm on it before rounding by -shift[1] and the 8-wide write-out. */ +static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; /* a quarter of the width, despite the _div8 name */ + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: row transform + __m128i buf0[16]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + __m128i *buf1_cur = buf1_ptr + i * txfm_size_row; + transpose_32bit_4x4(buf1_cur, buf1_cur); + 
col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } +} + +void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_sse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + highbd_inv_txfm2d_add_h_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + highbd_inv_txfm2d_add_v_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case IDTX: + highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; + default: assert(0); break; + } +} + +void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + 
int eob = txfm_param->eob; + highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); + break; + default: + av1_highbd_inv_txfm2d_add_universe_sse4_1( + input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob, + txfm_param->bd); + break; + } +} diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c 
b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c new file mode 100644 index 0000000000..6dcac10e45 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -0,0 +1,849 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" +/* Copy (no-filter) case of the high-bitdepth distance-weighted compound convolve: each source pixel is shifted left by bits; with do_average set it is combined with conv_params->dst through highbd_comp_avg(), rounded by highbd_convolve_rounding(), limited with a min against clip_pixel_to_bd and stored to dst0, otherwise the offset is added with unsigned saturation and the result is stored to conv_params->dst. Widths that are a multiple of 16 are handled one row at a time; other widths that are a multiple of 4 are handled two rows at a time. */ +void av1_highbd_dist_wtd_convolve_2d_copy_avx2(const uint16_t *src, + int src_stride, uint16_t *dst0, + int dst_stride0, int w, int h, + ConvolveParams *conv_params, + int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + int i, j; + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const 
__m256i offset_const = _mm256_set1_epi32(offset); + const __m256i offset_const_16b = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + assert(bits <= 4); + + if (!(w % 16)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 16) { + const __m256i src_16bit = + _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j])); + + const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); + + if (do_average) { + const __m256i data_0 = + _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero); + + const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b_lo, offset_const); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); + const __m256i res_unsigned_hi = + _mm256_add_epi32(res_32b_hi, offset_const); + + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = highbd_convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result_hi = highbd_convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, 
offset_const_16b); + + _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j < w; j += 8) { + const __m128i src_row_0 = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride])); + // since not all compilers yet support _mm256_set_m128i() + const __m256i src_10 = _mm256_insertf128_si256( + _mm256_castsi128_si256(src_row_0), src_row_1, 1); + + const __m256i res = _mm256_sll_epi16(src_10, left_shift); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i res_32b = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b, offset_const); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, offset_const_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); 
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b_lo, offset_const); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); + const __m256i res_unsigned_hi = + _mm256_add_epi32(res_32b_hi, offset_const); + + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m256i res_unsigned_16b = + 
_mm256_adds_epu16(res, offset_const_16b); + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. 
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[8], coeffs_y[4], coeffs_x[4]; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i round_const_x = _mm256_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m256i round_const_y = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = _mm256_setzero_si256(); + if (i + 1 < im_h) + row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] 
= _mm256_unpacklo_epi16(s2, s3); + s[2] = _mm256_unpacklo_epi16(s4, s5); + + s[4] = _mm256_unpackhi_epi16(s0, s1); + s[5] = _mm256_unpackhi_epi16(s2, s3); + s[6] = _mm256_unpackhi_epi16(s4, s5); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + const __m256i res_a = convolve(s, coeffs_y); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_y), round_shift_y); + + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_a_round, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + 
const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_b = convolve(s + 4, coeffs_y); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_y), round_shift_y); + + __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = 
_mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_x_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + + int i, j; + __m256i s[4], coeffs_x[4]; + + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sll_epi32(res_even, round_shift_bits); + res_odd = _mm256_sll_epi32(res_odd, round_shift_bits); + + __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd); + + __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = highbd_comp_avg( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, 
&rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd); + __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = highbd_convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result_hi = highbd_convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + 
_mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_highbd_dist_wtd_convolve_y_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + + assert(bits >= 0); + int i, j; + __m256i s[8], coeffs_y[4]; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i round_const_y = + _mm256_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << 
(offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m256i src6; + __m256i s01 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + __m256i s12 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + __m256i s23 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + __m256i s34 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + __m256i s45 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + __m256i s56 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi16(s01, s12); + s[1] = _mm256_unpacklo_epi16(s23, s34); + s[2] = 
_mm256_unpacklo_epi16(s45, s56); + + s[4] = _mm256_unpackhi_epi16(s01, s12); + s[5] = _mm256_unpackhi_epi16(s23, s34); + s[6] = _mm256_unpackhi_epi16(s45, s56); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + const __m256i s67 = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + + const __m256i s78 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi16(s67, s78); + s[7] = _mm256_unpackhi_epi16(s67, s78); + + const __m256i res_a = convolve(s, coeffs_y); + + __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits); + res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a_round, round_const_y), round_shift_y); + + __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), 
res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits); + res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b_round, round_const_y), round_shift_y); + + __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = 
_mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c new file mode 100644 index 0000000000..5a7fc536a2 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_sse4_1.h" + +void av1_highbd_dist_wtd_convolve_y_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + + assert(bits >= 0); + int i, j; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const __m128i round_const_y = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + __m128i s[16], coeffs_y[4]; + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits); + res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y), + round_shift_y); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits); + res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y), + round_shift_y); + + 
__m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const); + __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i comp_avg_res_0 = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_1 = + highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b_0 = + _mm_packus_epi32(round_result_0, round_result_0); + const __m128i res_clip_0 = + _mm_min_epi16(res_16b_0, clip_pixel_to_bd); + const __m128i res_16b_1 = + _mm_packus_epi32(round_result_1, round_result_1); + const __m128i res_clip_1 = + _mm_min_epi16(res_16b_1, clip_pixel_to_bd); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), + res_clip_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_clip_1); + + } else { + __m128i res_16b_0 = + _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0); + + __m128i res_16b_1 = + _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16b_1); + } + } else { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits); + res_b_round0 = _mm_sra_epi32( + 
_mm_add_epi32(res_b_round0, round_const_y), round_shift_y); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits); + res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b_round1, round_const_y), round_shift_y); + + __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const); + __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const); + + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero); + const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero); + + const __m128i comp_avg_res_lo_0 = + highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_lo_1 = + highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi_0 = + highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi_1 = + highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1, + &wt0, &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_lo_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const, + &rounding_const, 
rounding_shift); + + const __m128i res_16b_0 = + _mm_packus_epi32(round_result_lo_0, round_result_hi_0); + const __m128i res_clip_0 = + _mm_min_epi16(res_16b_0, clip_pixel_to_bd); + + const __m128i res_16b_1 = + _mm_packus_epi32(round_result_lo_1, round_result_hi_1); + const __m128i res_clip_1 = + _mm_min_epi16(res_16b_1, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), + res_clip_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_clip_1); + } else { + __m128i res_16bit0 = + _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0); + __m128i res_16bit1 = + _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_16bit1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_x_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + + int i, j; + __m128i s[4], coeffs_x[4]; + + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const __m128i zero = _mm_setzero_si128(); + + const __m128i 
round_const_x = + _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + res_even = _mm_sll_epi32(res_even, round_shift_bits); + res_odd = _mm_sll_epi32(res_odd, round_shift_bits); + + __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const); + if (w - j < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i 
*)(&dst[i * dst_stride + j])); + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + + const __m128i comp_avg_res = highbd_comp_avg_sse4_1( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i round_result = highbd_convolve_rounding_sse2( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = _mm_packus_epi32(round_result, round_result); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b); + } + } else { + __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd); + __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const); + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); + + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } + } + 
} +} diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h new file mode 100644 index 0000000000..5734810f52 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ +#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ + +#include /* SSE4.1 */ + +#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \ + do { \ + __m128i u0, u1, u2, u3; \ + u0 = _mm_unpacklo_epi32(x0, x1); \ + u1 = _mm_unpackhi_epi32(x0, x1); \ + u2 = _mm_unpacklo_epi32(x2, x3); \ + u3 = _mm_unpackhi_epi32(x2, x3); \ + y0 = _mm_unpacklo_epi64(u0, u2); \ + y1 = _mm_unpackhi_epi64(u0, u2); \ + y2 = _mm_unpacklo_epi64(u1, u3); \ + y3 = _mm_unpackhi_epi64(u1, u3); \ + } while (0) + +static INLINE void transpose_8x8(const __m128i *in, __m128i *out) { + TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]); + TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]); + TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]); + TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13], + out[15]); +} + +static INLINE void transpose_16x16(const __m128i *in, __m128i *out) { + // Upper left 8x8 + TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]); + TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], 
out[24],
+                out[28]);
+  TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+                out[13]);
+  TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+                out[29]);
+
+  // Upper right 8x8
+  TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+                out[44]);
+  TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+                out[60]);
+  TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+                out[45]);
+  TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+                out[61]);
+
+  // Lower left 8x8
+  TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+                out[14]);
+  TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+                out[30]);
+  TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+                out[15]);
+  TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+                out[31]);
+  // Lower right 8x8
+  TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+                out[46]);
+  TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+                out[62]);
+  TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+                out[47]);
+  TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+                out[63]);
+}
+
+// Transposes a block of __m128i registers tile by tile with TRANSPOSE_4X4.
+// `width` and `height` are shifted right by 2 to obtain the tile counts
+// (numrow = width >> 2, numcol = height >> 2). For every (i, j) tile, the four
+// input registers spaced `numrow` apart starting at input[i * width + j] are
+// transposed into the four output registers spaced `numcol` apart starting at
+// output[j * height + i].
+// NOTE(review): presumably each register holds four 32-bit coefficients and
+// width/height are counted in coefficients -- confirm against TRANSPOSE_4X4
+// and the callers.
+static INLINE void transpose_8nx8n(const __m128i *input, __m128i *output,
+                                   const int width, const int height) {
+  const int numcol = height >> 2;
+  const int numrow = width >> 2;
+  for (int j = 0; j < numrow; j++) {
+    for (int i = 0; i < numcol; i++) {
+      TRANSPOSE_4X4(input[i * width + j + (numrow * 0)],
+                    input[i * width + j + (numrow * 1)],
+                    input[i * width + j + (numrow * 2)],
+                    input[i * width + j + (numrow * 3)],
+                    output[j * height + i + (numcol * 0)],
+                    output[j * height + i + (numcol * 1)],
+                    output[j * height + i + (numcol * 2)],
+                    output[j * height + i + (numcol * 3)]);
+    }
+  }
+}
+
+// Returns (w0 * n0 + w1 * n1 + rounding) >> bit for each 32-bit lane.
+// Each product keeps only its low 32 bits (_mm_mullo_epi32) and the final
+// shift is arithmetic (_mm_srai_epi32).
+// Note:
+// rounding = 1 << (bit - 1)
+static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
+                                      const __m128i *w1, const __m128i *n1,
+                                      const __m128i *rounding, int bit) {
+  __m128i x, y;
+
+  x = _mm_mullo_epi32(*w0, *n0);
+  y = _mm_mullo_epi32(*w1, *n1);
+  x = _mm_add_epi32(x, y);
+  x = _mm_add_epi32(x, *rounding);
+  x = _mm_srai_epi32(x, bit);
+  return x;
+}
+
+// Single-product variant of half_btf_sse4_1:
+// returns (w0 * n0 + rounding) >> bit for each 32-bit lane.
+static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
+                                        const __m128i *rounding, int bit) {
+  __m128i x;
+
+  x = _mm_mullo_epi32(*w0, *n0);
+  x = _mm_add_epi32(x, *rounding);
+  x = _mm_srai_epi32(x, bit);
+  return x;
+}
+
+// Function-pointer type for a 1-D transform kernel working on __m128i rows.
+// NOTE(review): presumably the inverse-transform kernels, given the separate
+// fwd_ typedef below -- confirm against the users of this type.
+typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+                                    int do_cols, int bd, int out_shift);
+
+// Function-pointer type for a 1-D forward transform kernel.
+typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+                                        const int num_cols);
+
+// Declaration only; the definition is not part of this header.
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+                                               uint8_t *output, int stride,
+                                               TX_TYPE tx_type, TX_SIZE tx_size,
+                                               int eob, const int bd);
+
+#endif  // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c b/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c
new file mode 100644
index 0000000000..75108b49da
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/warped_motion.h" + +void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m256i tmp[15]; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); + const __m256i reduce_bits_vert_const = + _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); + const __m256i res_sub_const = + _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + + __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1)); + __m256i v_zeros = _mm256_setzero_si256(); + int ohoriz = 1 << offset_bits_horiz; + int mhoriz = 1 << max_bits_horiz; + (void)mhoriz; + int sx; + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + const int16_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int16_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << 
WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + if (ix4 <= -7) { + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)))); + } + } else if (ix4 >= width + 6) { + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm256_cvtepi16_epi32( + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz)))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + int32_t tmp1[8]; + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + const int iy = clamp(iy4 + k, 0, height - 1); + + sx = sx4 + beta * (k + 4); + for (int l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + const int offs = sx >> WARPEDDIFF_PREC_BITS; + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (int m = 0; m < 8; ++m) { + const int sample_x = clamp(ix + m, 0, width - 1); + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); + tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum; + sx += alpha; + } + tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1); + } + } else { + if (beta == 0 && alpha == 0) { + sx = sx4; + __m128i v_01 = _mm_loadu_si128( + (__m128i *) + av1_warped_filter[sx >> + WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0 + __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0 + __m256i v_c23 = _mm256_broadcastd_epi32( + 
_mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2 + __m256i v_c45 = _mm256_broadcastd_epi32( + _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4 + __m256i v_c67 = _mm256_broadcastd_epi32( + _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6 + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + iy = iy * stride; + + __m256i v_refl = _mm256_inserti128_si256( + _mm256_setzero_si256(), + _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); + v_refl = _mm256_inserti128_si256( + v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), + 1); // R15 .. R0 + + __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); + + __m256i v_refu = + _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 + v_refl = _mm256_inserti128_si256( + v_refl, _mm256_extracti128_si256(v_refu, 0), 1); + v_refu = _mm256_inserti128_si256( + v_refu, _mm256_extracti128_si256(v_ref, 0), 0); + + __m256i v_sum = _mm256_set1_epi32(ohoriz); + __m256i parsum = _mm256_madd_epi16( + v_c01, _mm256_alignr_epi8(v_refu, v_refl, + 0)); // R8R7R6..R1R7R6R5..R1R0 + __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); + + parsum = _mm256_madd_epi16( + v_c23, + _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2 + __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); + parsum = _mm256_madd_epi16( + v_c45, _mm256_alignr_epi8(v_refu, v_refl, + 8)); // R12R11..R5R11R10..R5R4 + __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); + parsum = _mm256_madd_epi16( + v_c67, _mm256_alignr_epi8(v_refu, v_refl, + 12)); // R14R13..R7R13R12..R7R6 + __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); + + tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), + reduce_bits_horiz); + } + } else if (alpha == 0) { + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + iy = iy * stride; + + sx = sx4 + beta * (k + 4); + + __m128i v_01 = 
_mm_loadu_si128( + (__m128i *)av1_warped_filter + [sx >> WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0 + __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0 + __m256i v_c23 = _mm256_broadcastd_epi32( + _mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2 + __m256i v_c45 = _mm256_broadcastd_epi32( + _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4 + __m256i v_c67 = _mm256_broadcastd_epi32( + _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6 + + __m256i v_refl = _mm256_inserti128_si256( + _mm256_setzero_si256(), + _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); + v_refl = _mm256_inserti128_si256( + v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), + 1); // R15 .. R0 + + __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); + + __m256i v_refu = + _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 + + v_refl = _mm256_inserti128_si256( + v_refl, _mm256_extracti128_si256(v_refu, 0), 1); + v_refu = _mm256_inserti128_si256( + v_refu, _mm256_extracti128_si256(v_ref, 0), 0); + + __m256i v_sum = _mm256_set1_epi32(ohoriz); + __m256i parsum = + _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0)); + __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); + + parsum = + _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4)); + __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); + parsum = + _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8)); + __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); + parsum = _mm256_madd_epi16(v_c67, + _mm256_alignr_epi8(v_refu, v_refl, 12)); + __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); + + tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), + reduce_bits_horiz); + } + } else if (beta == 0) { + sx = sx4; + __m256i v_coeff01 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]), + 0); + v_coeff01 = _mm256_inserti128_si256( + v_coeff01, + _mm_loadu_si128( + (__m128i *) + av1_warped_filter[(sx + alpha) >> 
WARPEDDIFF_PREC_BITS]), + 1); // B7B6..B1B0A7A6..A1A0 + __m256i v_coeff23 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 2 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff23 = _mm256_inserti128_si256( + v_coeff23, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 3 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // D7D6..D1D0C7C6..C1C0 + __m256i v_coeff45 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 4 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff45 = _mm256_inserti128_si256( + v_coeff45, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 5 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // F7F6..F1F0E7E6..E1E0 + __m256i v_coeff67 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 6 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff67 = _mm256_inserti128_si256( + v_coeff67, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 7 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // H7H6..H1H0G7G6..G1G0 + + __m256i v_c0123 = _mm256_unpacklo_epi32( + v_coeff01, + v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0 + __m256i v_c0123u = _mm256_unpackhi_epi32( + v_coeff01, + v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4 + __m256i v_c4567 = _mm256_unpacklo_epi32( + v_coeff45, + v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0 + __m256i v_c4567u = _mm256_unpackhi_epi32( + v_coeff45, + v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4 + + __m256i v_c01 = _mm256_unpacklo_epi64( + v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0 + __m256i v_c23 = + _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2 + __m256i v_c45 = + _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4 + __m256i v_c67 = + _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... 
A7A6 + + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + iy = iy * stride; + + __m256i v_refl = _mm256_inserti128_si256( + _mm256_setzero_si256(), + _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); + v_refl = _mm256_inserti128_si256( + v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), + 1); // R15 .. R0 + + __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); + + __m256i v_refu = + _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 + + v_refl = _mm256_inserti128_si256( + v_refl, _mm256_extracti128_si256(v_refu, 0), 1); + v_refu = _mm256_inserti128_si256( + v_refu, _mm256_extracti128_si256(v_ref, 0), 0); + + __m256i v_sum = _mm256_set1_epi32(ohoriz); + __m256i parsum = _mm256_madd_epi16( + v_c01, _mm256_alignr_epi8(v_refu, v_refl, + 0)); // R8R7R6..R1R7R6R5..R1R0 + __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); + + parsum = _mm256_madd_epi16( + v_c23, + _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2 + __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); + parsum = _mm256_madd_epi16( + v_c45, _mm256_alignr_epi8(v_refu, v_refl, + 8)); // R12R11..R5R11R10..R5R4 + __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); + parsum = _mm256_madd_epi16( + v_c67, _mm256_alignr_epi8(v_refu, v_refl, + 12)); // R14R13..R7R13R12..R7R6 + __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); + + tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), + reduce_bits_horiz); + } + + } else { + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + iy = iy * stride; + + sx = sx4 + beta * (k + 4); + + __m256i v_coeff01 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]), + 0); + v_coeff01 = _mm256_inserti128_si256( + v_coeff01, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + alpha) >> + 
WARPEDDIFF_PREC_BITS]), + 1); // B7B6..B1B0A7A6..A1A0 + __m256i v_coeff23 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 2 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff23 = _mm256_inserti128_si256( + v_coeff23, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 3 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // D7D6..D1D0C7C6..C1C0 + __m256i v_coeff45 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 4 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff45 = _mm256_inserti128_si256( + v_coeff45, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 5 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // F7F6..F1F0E7E6..E1E0 + __m256i v_coeff67 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 6 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff67 = _mm256_inserti128_si256( + v_coeff67, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 7 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // H7H6..H1H0G7G6..G1G0 + + __m256i v_c0123 = _mm256_unpacklo_epi32( + v_coeff01, + v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0 + __m256i v_c0123u = _mm256_unpackhi_epi32( + v_coeff01, + v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4 + __m256i v_c4567 = _mm256_unpacklo_epi32( + v_coeff45, + v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0 + __m256i v_c4567u = _mm256_unpackhi_epi32( + v_coeff45, + v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4 + + __m256i v_c01 = _mm256_unpacklo_epi64( + v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0 + __m256i v_c23 = + _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2 + __m256i v_c45 = + _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4 + __m256i v_c67 = + _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... 
A7A6 + + __m256i v_refl = _mm256_inserti128_si256( + _mm256_setzero_si256(), + _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); + v_refl = _mm256_inserti128_si256( + v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), + 1); // R15 .. R0 + + __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); + + __m256i v_refu = + _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 + + v_refl = _mm256_inserti128_si256( + v_refl, _mm256_extracti128_si256(v_refu, 0), 1); + v_refu = _mm256_inserti128_si256( + v_refu, _mm256_extracti128_si256(v_ref, 0), 0); + + __m256i v_sum = _mm256_set1_epi32(ohoriz); + __m256i parsum = + _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0)); + __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); + + parsum = + _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4)); + __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); + parsum = + _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8)); + __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); + parsum = _mm256_madd_epi16(v_c67, + _mm256_alignr_epi8(v_refu, v_refl, 12)); + __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); + + tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), + reduce_bits_horiz); + } + } + } + + // Vertical filter + for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + const __m256i *src = tmp + (k + 4); + + __m256i v_coeff01 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]), + 0); + v_coeff01 = _mm256_inserti128_si256( + v_coeff01, + _mm_loadu_si128( + (__m128i *) + av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]), + 1); + __m256i v_coeff23 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff23 = _mm256_inserti128_si256( + v_coeff23, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >> + 
WARPEDDIFF_PREC_BITS]), + 1); + __m256i v_coeff45 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff45 = _mm256_inserti128_si256( + v_coeff45, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >> + WARPEDDIFF_PREC_BITS]), + 1); + __m256i v_coeff67 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff67 = _mm256_inserti128_si256( + v_coeff67, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >> + WARPEDDIFF_PREC_BITS]), + 1); + + __m256i v_c0123 = _mm256_unpacklo_epi32( + v_coeff01, + v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0 + __m256i v_c0123u = _mm256_unpackhi_epi32( + v_coeff01, + v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4 + __m256i v_c4567 = _mm256_unpacklo_epi32( + v_coeff45, + v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0 + __m256i v_c4567u = _mm256_unpackhi_epi32( + v_coeff45, + v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4 + + __m256i v_c01 = _mm256_unpacklo_epi64( + v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0 + __m256i v_c23 = + _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2 + __m256i v_c45 = + _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4 + __m256i v_c67 = + _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... 
A7A6 + + __m256i v_src01l = + _mm256_unpacklo_epi32(src[0], src[1]); // T13T03T11T01T12T02T10T00 + __m256i v_src01u = + _mm256_unpackhi_epi32(src[0], src[1]); // T17T07T15T05T16T06T14T04 + __m256i v_sum = + _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u), + v_c01); // S7S5S3S1S6S4S2S0 + + __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]); + __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]); + v_sum = _mm256_add_epi32( + v_sum, + _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23)); + + __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]); + __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]); + v_sum = _mm256_add_epi32( + v_sum, + _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45)); + + __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]); + __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]); + v_sum = _mm256_add_epi32( + v_sum, + _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67)); + + // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0 + + __m256i v_suml = + _mm256_permute4x64_epi64(v_sum, 0xD8); // S7S5S6S4S3S1S2S0 + __m256i v_sumh = + _mm256_permute4x64_epi64(v_sum, 0x32); // S2S0S7S5S2S0S3S1 + v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh); // S7S6S5S4S3S2S1S0 + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + + v_sum = _mm256_add_epi32(v_sum, res_add_const); + v_sum = + _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const), + reduce_bits_vert_shift); + if (conv_params->do_average) { + __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p)); + + if (conv_params->use_dist_wtd_comp_avg) { + v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0), + _mm256_mullo_epi32(v_sum, wt1)); + v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS); + } else { + v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1); + } + + 
__m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const); + v_sum1 = _mm256_sra_epi32( + _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift); + + __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1); + v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8); + v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel); + _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0)); + } else { + v_sum = _mm256_packus_epi32(v_sum, v_sum); + __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8); + _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0)); + } + } else { + // Round and pack into 8 bits + const __m256i round_const = + _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + + __m256i v_sum1 = _mm256_srai_epi32( + _mm256_add_epi32(v_sum, round_const), reduce_bits_vert); + + v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1); + __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8); + // Clamp res_16bit to the range [0, 2^bd - 1] + const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1); + const __m256i zero = _mm256_setzero_si256(); + v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero); + + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0)); + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c new file mode 100644 index 0000000000..96fb4cf632 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c @@ -0,0 +1,636 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/warped_motion.h" + +static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, + 9, 11, 13, 15 }; + +static const uint8_t highbd_shuffle_alpha0_mask0[16] = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +}; +static const uint8_t highbd_shuffle_alpha0_mask1[16] = { + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 +}; +static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9, + 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11 }; +static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13, + 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15 }; + +static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); + + // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 + coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); + // coeffs 2 
3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 + coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10); + // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 + coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14); + // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 + coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14); + + // Filter odd-index pixels + const __m128i tmp_1 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11); + coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11); + coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0( + int sx, __m128i *coeff) { + // Filter coeff + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + coeff[0] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0)); + coeff[2] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1)); + coeff[4] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2)); + coeff[6] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3)); + + coeff[1] = coeff[0]; + coeff[3] = coeff[2]; + coeff[5] = coeff[4]; + coeff[7] = coeff[6]; +} + +static INLINE void highbd_filter_src_pixels( + 
const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff, + const int offset_bits_horiz, const int reduce_bits_horiz, int k) { + const __m128i src_1 = *src; + const __m128i src2_1 = *src2; + + const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]); + + __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]); + + __m128i res_odd = + _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + // Combine results into one register. + // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 + // as this order helps with the vertical filter. 
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); +} + +static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2, + __m128i *tmp, int sx, int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff); + highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); +} + +static INLINE void highbd_warp_horizontal_filter_alpha0_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_alpha0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff); + 
highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void highbd_prepare_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + highbd_warp_horizontal_filter_alpha0_beta0( + 
ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha == 0 && beta != 0) + highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha != 0 && beta == 0) + highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + else + highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); +} + +void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + assert(!(bd == 12 && reduce_bits_horiz < 5)); + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); + const __m128i reduce_bits_vert_const = + _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const __m128i res_sub_const = + _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. 
+ */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
+ if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + const __m128i src_01 = _mm_shuffle_epi8( + src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); + const __m128i src2_01 = _mm_shuffle_epi8( + src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); + + __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01); + __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left); + src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left); + } + + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right); + src_hi = 
_mm_shuffle_epi8(src_hi, shuffle_reg_right); + } + + const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi); + const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi); + + highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } else { + highbd_prepare_warp_horizontal_filter( + ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + } + + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i 
res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo = _mm_add_epi32(res_lo, res_add_const); + res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const), + reduce_bits_vert_shift); + + if (conv_params->do_average) { + __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p)); + + if (conv_params->use_dist_wtd_comp_avg) { + res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), + _mm_mullo_epi32(res_lo, wt1)); + res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS); + } else { + res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1); + } + + __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const); + res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const), + round_bits_shift); + + __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo); + res16_lo = _mm_min_epi16(res16_lo, clip_pixel); + _mm_storel_epi64(dst16, res16_lo); + } else { + res_lo = _mm_packus_epi32(res_lo, res_lo); + _mm_storel_epi64(p, res_lo); + } + if (p_width > 4) { + __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + + res_hi = _mm_add_epi32(res_hi, res_add_const); + res_hi = + _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const), + reduce_bits_vert_shift); + if (conv_params->do_average) { + __m128i *const dst16_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4)); + + if (conv_params->use_dist_wtd_comp_avg) { + res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0), + _mm_mullo_epi32(res_hi, wt1)); + res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS); + } else { + res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1); + } + + __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const); + 
res32_hi = _mm_sra_epi32( + _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift); + __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi); + res16_hi = _mm_min_epi16(res16_hi, clip_pixel); + _mm_storel_epi64(dst16_4, res16_hi); + } else { + res_hi = _mm_packus_epi32(res_hi, res_hi); + _mm_storel_epi64(p4, res_hi); + } + } + } else { + // Round and pack into 8 bits + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), reduce_bits_vert); + + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + // Clamp res_16bit to the range [0, 2^bd - 1] + const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); + const __m128i zero = _mm_setzero_si128(); + res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. + if (p_width == 4) { + _mm_storel_epi64(p, res_16bit); + } else { + _mm_storeu_si128(p, res_16bit); + } + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c new file mode 100644 index 0000000000..562c623fa9 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// 128-bit xmmwords are written as [ ... ] with the MSB on the left. +// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB +// on the left. +// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be +// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ]. +void av1_highbd_wiener_convolve_add_src_avx2( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const WienerConvolveParams *conv_params, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); + + DECLARE_ALIGNED(32, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 1; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero_128 = _mm_setzero_si128(); + const __m256i zero_256 = _mm256_setzero_si256(); + + // Add an offset to account for the "add_src" part of the convolve function. 
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + + const __m256i clamp_low = zero_256; + + /* Horizontal filter */ + { + const __m256i clamp_high_ep = + _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = _mm256_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (int i = 0; i < intermediate_height; ++i) { + for (int j = 0; j < w; j += 16) { + const uint16_t *src_ij = src_ptr + i * src_stride + j; + + // Load 16-bit src data + const __m256i src_0 = yy_loadu_256(src_ij + 0); + const __m256i src_1 = 
yy_loadu_256(src_ij + 1); + const __m256i src_2 = yy_loadu_256(src_ij + 2); + const __m256i src_3 = yy_loadu_256(src_ij + 3); + const __m256i src_4 = yy_loadu_256(src_ij + 4); + const __m256i src_5 = yy_loadu_256(src_ij + 5); + const __m256i src_6 = yy_loadu_256(src_ij + 6); + const __m256i src_7 = yy_loadu_256(src_ij + 7); + + // Multiply src data by filter coeffs and sum pairs + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + // Calculate scalar product for even- and odd-indices separately, + // increasing to 32-bit precision + const __m256i res_even_sum = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); + const __m256i res_even = _mm256_srai_epi32( + _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); + + const __m256i res_odd_sum = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); + const __m256i res_odd = _mm256_srai_epi32( + _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); + + // Reduce to 16-bit precision and pack even- and odd-index results + // back into one register. 
The _mm256_packs_epi32 intrinsic returns + // a register with the pixels ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i res = _mm256_packs_epi32(res_even, res_odd); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep); + + // Store in a temporary array + yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); + } + } + } + + /* Vertical filter */ + { + const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1); + + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 
1))); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j; + + // Load 16-bit data from the output of the horizontal filter in + // which the pixels are ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE); + const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE); + const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE); + const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE); + const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE); + const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE); + const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE); + const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE); + + // Filter the even-indices, increasing to 32-bit precision + const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); + const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); + const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); + const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); + + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + + const __m256i res_even = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); + + // Filter the odd-indices, increasing to 32-bit precision + const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); + const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); + const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5); + const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); + + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); 
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + const __m256i res_odd = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); + + // Pixels are currently in the following order: + // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] + // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] + // + // Rearrange the pixels into the following order: + // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] + // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] + const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); + const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); + + const __m256i res_lo_round = _mm256_srai_epi32( + _mm256_add_epi32(res_lo, round_const), conv_params->round_1); + const __m256i res_hi_round = _mm256_srai_epi32( + _mm256_add_epi32(res_hi, round_const), conv_params->round_1); + + // Reduce to 16-bit precision and pack into the correct order: + // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] + const __m256i res_16bit = + _mm256_packs_epi32(res_lo_round, res_hi_round); + const __m256i res_16bit_clamped = _mm256_min_epi16( + _mm256_max_epi16(res_16bit, clamp_low), clamp_high); + + // Store in the dst array + yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c new file mode 100644 index 0000000000..cab37fa910 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+
+// SSSE3 high-bitdepth Wiener convolution with the source sample added back
+// ("add_src"): 1 << FILTER_BITS is added to coefficient 3 of both filters.
+//
+// src8 / dst8 are converted with CONVERT_TO_SHORTPTR() and then accessed as
+// 16-bit samples. filter_x / filter_y each supply 8 int16_t coefficients
+// (a single 128-bit load). Only unscaled filtering is supported
+// (x_step_q4 == y_step_q4 == 16) and w must be a multiple of 8.
+//
+// The filter runs as two passes:
+//  1. Horizontal: h + SUBPEL_TAPS - 1 rows are filtered into temp (row pitch
+//     MAX_SB_SIZE), rounded by conv_params->round_0 and clamped to
+//     [0, WIENER_CLAMP_LIMIT(round_0, bd) - 1]. Each group of 8 results is
+//     stored in column order 0, 2, 4, 6, 1, 3, 5, 7.
+//  2. Vertical: 8 consecutive temp rows are filtered, rounded by
+//     conv_params->round_1, put back into natural column order, clamped to
+//     [0, (1 << bd) - 1] and stored to dst.
+void av1_highbd_wiener_convolve_add_src_ssse3(
+    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h,
+    const WienerConvolveParams *conv_params, int bd) {
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w & 7));
+  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+  DECLARE_ALIGNED(16, uint16_t,
+                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+  int intermediate_height = h + SUBPEL_TAPS - 1;
+  int i, j;
+  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+  // Start center_tap rows above and center_tap columns left of the block so
+  // that the filter window is centred on each output sample.
+  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+  const __m128i zero = _mm_setzero_si128();
+  // Add an offset to account for the "add_src" part of the convolve function.
+  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
+
+  /* Horizontal filter */
+  {
+    const __m128i coeffs_x =
+        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    // Rounding term for the >> round_0 below plus a 1 << (bd + FILTER_BITS - 1)
+    // bias.
+    // NOTE(review): the bias presumably keeps the intermediate non-negative
+    // and is removed again by the vertical pass - confirm against the C
+    // reference implementation.
+    const __m128i round_const = _mm_set1_epi32(
+        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+    for (i = 0; i < intermediate_height; ++i) {
+      for (j = 0; j < w; j += 8) {
+        // 16 input samples are read per group of 8 outputs: the byte shifts
+        // below (up to 14 bytes = 7 samples) slide an 8-sample window across
+        // the data2:data pair.
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        const __m128i data2 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+        // Filter even-index pixels
+        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+        const __m128i res_2 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+        const __m128i res_4 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+        const __m128i res_6 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                         _mm_add_epi32(res_2, res_6));
+        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+                                  conv_params->round_0);
+
+        // Filter odd-index pixels
+        const __m128i res_1 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+        const __m128i res_3 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+        const __m128i res_5 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+        const __m128i res_7 =
+            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                        _mm_add_epi32(res_3, res_7));
+        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+                                 conv_params->round_0);
+
+        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+        const __m128i maxval =
+            _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);
+        __m128i res = _mm_packs_epi32(res_even, res_odd);
+        res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
+        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
+      }
+    }
+  }
+
+  /* Vertical filter */
+  {
+    const __m128i coeffs_y =
+        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    // Rounding term for the >> round_1 below minus 1 << (bd + round_1 - 1).
+    // NOTE(review): the subtracted term presumably cancels the bias that the
+    // horizontal pass left in temp - confirm.
+    const __m128i round_const =
+        _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                       (1 << (bd + conv_params->round_1 - 1)));
+
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        // Filter even-index pixels
+        // (the low four lanes of every temp row hold columns 0, 2, 4, 6)
+        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
+        const __m128i src_0 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
+        const __m128i src_2 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
+        const __m128i src_4 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
+        const __m128i src_6 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                               _mm_add_epi32(res_4, res_6));
+
+        // Filter odd-index pixels
+        // (the high four lanes of every temp row hold columns 1, 3, 5, 7)
+        const __m128i src_1 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
+        const __m128i src_3 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
+        const __m128i src_5 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
+        const __m128i src_7 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                              _mm_add_epi32(res_5, res_7));
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        const __m128i res_lo_round = _mm_srai_epi32(
+            _mm_add_epi32(res_lo, round_const), conv_params->round_1);
+        const __m128i res_hi_round = _mm_srai_epi32(
+            _mm_add_epi32(res_hi, round_const), conv_params->round_1);
+
+        // Clamp to the valid sample range for this bit depth.
+        const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
+        __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+        res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);
+
+        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+        _mm_storeu_si128(p, res_16bit);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/intra_edge_sse4.c b/third_party/aom/av1/common/x86/intra_edge_sse4.c
new file mode 100644
index 0000000000..3eee46faeb
--- /dev/null
+++ b/third_party/aom/av1/common/x86/intra_edge_sse4.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+// Smooths the 8-bit edge samples p[1] .. p[sz - 1] in place; p[0] is never
+// modified and the function returns immediately when strength is 0.
+//   strength 1: 3-tap (4, 8, 4) / 16
+//   strength 2: 3-tap (5, 6, 5) / 16
+//   strength 3: 5-tap (2, 4, 4, 4, 2) / 16
+// Results are rounded (+8, >> 4).
+//
+// Side effects beyond the filtered samples: p[-1] is overwritten with p[0]
+// and the 16 bytes starting at p[sz] are overwritten with p[sz - 1] (edge
+// extension). Every loop iteration also loads 16 bytes from in + 16, so the
+// buffer must be readable/writable that far past the edge.
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
+  if (!strength) return;
+
+  DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
+    { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 },  // strength 1: 4,8,4
+    { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 },  // strength 2: 5,6,5
+    { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 }  // strength 3: 2,4,4,4,2
+  };
+
+  // Byte-shuffle patterns: rows 0/1 gather four overlapping 4-byte windows
+  // each (3-tap path), row 2 gathers two overlapping 8-byte windows (5-tap
+  // path), row 3 is the identity 0..15 used to build the tail mask. Only four
+  // of the five declared rows are initialised explicitly.
+  DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
+    { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
+    { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
+    { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  };
+
+  // Extend the first and last samples to simplify the loop for the 5-tap case
+  p[-1] = p[0];
+  __m128i last = _mm_set1_epi8((char)p[sz - 1]);
+  _mm_storeu_si128((__m128i *)&p[sz], last);
+
+  // Adjust input pointer for filter support area
+  uint8_t *in = (strength == 3) ? p - 1 : p;
+
+  // Avoid modifying first sample
+  uint8_t *out = p + 1;
+  int len = sz - 1;
+
+  const int use_3tap_filter = (strength < 3);
+
+  if (use_3tap_filter) {
+    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+    __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
+    __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
+    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
+    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
+    while (len > 0) {
+      int n_out = (len < 8) ? len : 8;
+      // Gather the 4-byte window of each of the 8 outputs, multiply by the
+      // kernel and sum adjacent products down to one word per output.
+      __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
+      __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
+      d0 = _mm_maddubs_epi16(d0, coef0);
+      d1 = _mm_maddubs_epi16(d1, coef0);
+      d0 = _mm_hadd_epi16(d0, d1);
+      __m128i eight = _mm_set1_epi16(8);
+      d0 = _mm_add_epi16(d0, eight);
+      d0 = _mm_srai_epi16(d0, 4);
+      d0 = _mm_packus_epi16(d0, d0);
+      // Only the first n_out bytes take the filtered value; the remaining
+      // bytes of the 8-byte store keep what is already in memory.
+      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+      __m128i n0 = _mm_set1_epi8(n_out);
+      __m128i mask = _mm_cmpgt_epi8(n0, iden);
+      out0 = _mm_blendv_epi8(out0, d0, mask);
+      _mm_storel_epi64((__m128i *)out, out0);
+      // The next window reuses the upper half of in0 (read before the store
+      // above) and fresh bytes from in + 16, which lie past the bytes just
+      // written, so filtering in place never consumes filtered samples.
+      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
+      in0 = _mm_alignr_epi8(in1, in0, 8);
+      in += 8;
+      out += 8;
+      len -= n_out;
+    }
+  } else {  // 5-tap filter
+    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+    __m128i two = _mm_set1_epi8(2);
+    // shuf_a covers outputs 0-1; each +2 step moves on to the next output
+    // pair (2-3, 4-5, 6-7).
+    __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
+    __m128i shuf_b = _mm_add_epi8(shuf_a, two);
+    __m128i shuf_c = _mm_add_epi8(shuf_b, two);
+    __m128i shuf_d = _mm_add_epi8(shuf_c, two);
+    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
+    __m128i in0 = _mm_lddqu_si128((__m128i *)in);
+    while (len > 0) {
+      int n_out = (len < 8) ? len : 8;
+      __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
+      __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
+      __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
+      __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
+      d0 = _mm_maddubs_epi16(d0, coef0);
+      d1 = _mm_maddubs_epi16(d1, coef0);
+      d2 = _mm_maddubs_epi16(d2, coef0);
+      d3 = _mm_maddubs_epi16(d3, coef0);
+      // Two rounds of horizontal adds reduce the four partial products of
+      // each output to a single word.
+      d0 = _mm_hadd_epi16(d0, d1);
+      d2 = _mm_hadd_epi16(d2, d3);
+      d0 = _mm_hadd_epi16(d0, d2);
+      __m128i eight = _mm_set1_epi16(8);
+      d0 = _mm_add_epi16(d0, eight);
+      d0 = _mm_srai_epi16(d0, 4);
+      d0 = _mm_packus_epi16(d0, d0);
+      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+      __m128i n0 = _mm_set1_epi8(n_out);
+      __m128i mask = _mm_cmpgt_epi8(n0, iden);
+      out0 = _mm_blendv_epi8(out0, d0, mask);
+      _mm_storel_epi64((__m128i *)out, out0);
+      __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
+      in0 = _mm_alignr_epi8(in1, in0, 8);
+      in += 8;
+      out += 8;
+      len -= n_out;
+    }
+  }
+}
+
+// Doubles the resolution of an 8-bit edge in place. The input is the sz + 1
+// samples p[-1] .. p[sz - 1]; the output is written starting at p[-2] and
+// alternates each original sample with a half-sample value interpolated by
+// the 4-tap kernel (-1, 9, 9, -1) / 16 (rounded, saturated to 0..255).
+//
+// Requires sz <= 24. p[-2] and p[sz] are overwritten (edge extension) before
+// filtering, 32 bytes starting at p[-2] are loaded up front, and every loop
+// iteration stores a full 32 bytes.
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
+  // interpolate half-sample positions
+  assert(sz <= 24);
+
+  DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
+    { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
+  };
+
+  DECLARE_ALIGNED(
+      16, static const int8_t,
+      v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
+                          { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } };
+
+  // Extend first/last samples (upper-left p[-1], last p[sz-1])
+  // to support 4-tap filter
+  p[-2] = p[-1];
+  p[sz] = p[sz - 1];
+
+  uint8_t *in = &p[-2];
+  uint8_t *out = &p[-2];
+
+  int n = sz + 1;  // Input length including upper-left sample
+
+  // All input is captured in registers before the first store, so writing
+  // the (longer) output over the input is safe.
+  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+
+  __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
+  __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
+  __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
+
+  while (n > 0) {
+    __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
+    __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
+    __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
+    __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
+    __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
+    d0 = _mm_maddubs_epi16(d0, coef0);
+    d1 = _mm_maddubs_epi16(d1, coef0);
+    d2 = _mm_maddubs_epi16(d2, coef0);
+    d3 = _mm_maddubs_epi16(d3, coef0);
+    d0 = _mm_hadd_epi16(d0, d1);
+    d2 = _mm_hadd_epi16(d2, d3);
+    __m128i eight = _mm_set1_epi16(8);
+    d0 = _mm_add_epi16(d0, eight);
+    d2 = _mm_add_epi16(d2, eight);
+    d0 = _mm_srai_epi16(d0, 4);
+    d2 = _mm_srai_epi16(d2, 4);
+    d0 = _mm_packus_epi16(d0, d2);
+    // in1 holds the original samples (input shifted by one byte); interleave
+    // them with the 16 interpolated values to form 32 output bytes.
+    __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
+    __m128i out0 = _mm_unpacklo_epi8(in1, d0);
+    __m128i out1 = _mm_unpackhi_epi8(in1, d0);
+    _mm_storeu_si128((__m128i *)&out[0], out0);
+    _mm_storeu_si128((__m128i *)&out[16], out1);
+    in0 = in16;
+    in16 = _mm_setzero_si128();
+    out += 32;
+    n -= 16;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+// 16-bit counterpart of av1_filter_intra_edge_sse4_1(): smooths p[1] ..
+// p[sz - 1] in place with the same kernels (strength 1: 4,8,4; strength 2:
+// 5,6,5; strength 3: 2,4,4,4,2; all / 16, rounded) and returns immediately
+// when strength is 0.
+//
+// p[-1] is overwritten with p[0] and the 8 samples starting at p[sz] are
+// overwritten with p[sz - 1]. Two 8-sample registers are loaded up front and
+// one more is loaded from &in[8] at the end of every iteration, so the buffer
+// must be accessible that far past the edge.
+void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) {
+  if (!strength) return;
+
+  // The symmetric taps are pre-added before multiplying, so each kernel is
+  // stored as (outer weight, inner weight) pairs.
+  DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
+    { 4, 8, 4, 8, 4, 8, 4, 8 },  // strength 1: 4,8,4
+    { 5, 6, 5, 6, 5, 6, 5, 6 },  // strength 2: 5,6,5
+    { 2, 4, 2, 4, 2, 4, 2, 4 }   // strength 3: 2,4,4,4,2
+  };
+
+  DECLARE_ALIGNED(16, static const int16_t,
+                  v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
+
+  // Extend the first and last samples to simplify the loop for the 5-tap case
+  p[-1] = p[0];
+  __m128i last = _mm_set1_epi16(p[sz - 1]);
+  _mm_storeu_si128((__m128i *)&p[sz], last);
+
+  // Adjust input pointer for filter support area
+  uint16_t *in = (strength == 3) ? p - 1 : p;
+
+  // Avoid modifying first sample
+  uint16_t *out = p + 1;
+  int len = sz - 1;
+
+  const int use_3tap_filter = (strength < 3);
+
+  if (use_3tap_filter) {
+    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
+    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+    while (len > 0) {
+      int n_out = (len < 8) ? len : 8;
+      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
+      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
+      // (in[k] + in[k + 2]) * outer + in[k + 1] * inner
+      __m128i in02 = _mm_add_epi16(in0, in2);
+      __m128i d0 = _mm_unpacklo_epi16(in02, in1);
+      __m128i d1 = _mm_unpackhi_epi16(in02, in1);
+      d0 = _mm_mullo_epi16(d0, coef0);
+      d1 = _mm_mullo_epi16(d1, coef0);
+      d0 = _mm_hadd_epi16(d0, d1);
+      __m128i eight = _mm_set1_epi16(8);
+      d0 = _mm_add_epi16(d0, eight);
+      d0 = _mm_srli_epi16(d0, 4);
+      // Keep the existing memory contents for lanes at or beyond n_out.
+      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+      __m128i n0 = _mm_set1_epi16(n_out);
+      __m128i mask = _mm_cmpgt_epi16(n0, iden);
+      out0 = _mm_blendv_epi8(out0, d0, mask);
+      _mm_storeu_si128((__m128i *)out, out0);
+      in += 8;
+      in0 = in8;
+      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+      out += 8;
+      len -= n_out;
+    }
+  } else {  // 5-tap filter
+    __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+    __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
+    __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+    __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+    while (len > 0) {
+      int n_out = (len < 8) ? len : 8;
+      __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
+      __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
+      __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
+      __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
+      // (in[k] + in[k + 4]) * 2 + (in[k + 1] + in[k + 2] + in[k + 3]) * 4
+      __m128i in04 = _mm_add_epi16(in0, in4);
+      __m128i in123 = _mm_add_epi16(in1, in2);
+      in123 = _mm_add_epi16(in123, in3);
+      __m128i d0 = _mm_unpacklo_epi16(in04, in123);
+      __m128i d1 = _mm_unpackhi_epi16(in04, in123);
+      d0 = _mm_mullo_epi16(d0, coef0);
+      d1 = _mm_mullo_epi16(d1, coef0);
+      d0 = _mm_hadd_epi16(d0, d1);
+      __m128i eight = _mm_set1_epi16(8);
+      d0 = _mm_add_epi16(d0, eight);
+      d0 = _mm_srli_epi16(d0, 4);
+      __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+      __m128i n0 = _mm_set1_epi16(n_out);
+      __m128i mask = _mm_cmpgt_epi16(n0, iden);
+      out0 = _mm_blendv_epi8(out0, d0, mask);
+      _mm_storeu_si128((__m128i *)out, out0);
+      in += 8;
+      in0 = in8;
+      in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+      out += 8;
+      len -= n_out;
+    }
+  }
+}
+
+// 16-bit counterpart of av1_upsample_intra_edge_sse4_1(): doubles the
+// resolution of the edge in place. The input is the sz + 1 samples p[-1] ..
+// p[sz - 1]; the output is written starting at p[-2] and alternates each
+// original sample with a half-sample value interpolated by the 4-tap kernel
+// (-1, 9, 9, -1) / 16 (rounded, then limited to [0, (1 << bd) - 1]).
+//
+// Requires sz <= 24. p[-2] and p[sz] are overwritten (edge extension) before
+// filtering, 32 samples starting at p[-2] are loaded up front, and every loop
+// iteration stores a full 16 samples.
+void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) {
+  // interpolate half-sample positions
+  assert(sz <= 24);
+
+  DECLARE_ALIGNED(16, static const int16_t,
+                  kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };
+
+  // Extend first/last samples (upper-left p[-1], last p[sz-1])
+  // to support 4-tap filter
+  p[-2] = p[-1];
+  p[sz] = p[sz - 1];
+
+  uint16_t *in = &p[-2];
+  uint16_t *out = in;
+  int n = sz + 1;
+
+  // All input is captured in registers before the first store, so writing
+  // the (longer) output over the input is safe.
+  __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+  __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+  __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+  __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
+
+  while (n > 0) {
+    __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
+    __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
+    __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
+    // Pre-add the symmetric taps: -(in[k] + in[k + 3]) + 9 * (in[k + 1] +
+    // in[k + 2]).
+    __m128i sum0 = _mm_add_epi16(in0, in3);
+    __m128i sum1 = _mm_add_epi16(in1, in2);
+    __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
+    __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
+    __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
+    d0 = _mm_madd_epi16(d0, coef0);
+    d1 = _mm_madd_epi16(d1, coef0);
+    __m128i eight = _mm_set1_epi32(8);
+    d0 = _mm_add_epi32(d0, eight);
+    d1 = _mm_add_epi32(d1, eight);
+    d0 = _mm_srai_epi32(d0, 4);
+    d1 = _mm_srai_epi32(d1, 4);
+    // The unsigned-saturating pack clamps negative results to 0; the min
+    // below applies the upper limit for the bit depth.
+    d0 = _mm_packus_epi32(d0, d1);
+    __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
+    d0 = _mm_min_epi16(d0, max0);
+    __m128i out0 = _mm_unpacklo_epi16(in1, d0);
+    __m128i out1 = _mm_unpackhi_epi16(in1, d0);
+    _mm_storeu_si128((__m128i *)&out[0], out0);
+    _mm_storeu_si128((__m128i *)&out[8], out1);
+    in0 = in8;
+    in8 = in16;
+    in16 = in24;
+    in24 = _mm_setzero_si128();
+    out += 16;
+    n -= 8;
+  }
+}
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
new file mode 100644
index 0000000000..9f82ed2300
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -0,0 +1,1124 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +#include "av1/common/convolve.h" + +static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); + const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); + const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + return wt; +} + +static INLINE __m256i load_line2_avx2(const void *a, const void *b) { + return _mm256_permute2x128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); +} + +void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + int i, j, is_horiz_4tap = 0; + const int bits = FILTER_BITS - conv_params->round_1; + const __m256i wt = unpack_weights_avx2(conv_params); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + + assert(bits >= 0); + assert(conv_params->round_0 > 0); + + const __m256i 
round_const = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + __m256i filt[4], coeffs[4]; + + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; + + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + + __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); + *(int *)(&dst0[i * 
dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + + __m256i res = convolve_lowbd_x(data, coeffs, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); + *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) 
= + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + int i, j, is_vert_4tap = 0; + // +1 to compensate for dividing the filter coeffs by 2 + const int left_shift = FILTER_BITS - conv_params->round_0 + 1; + const __m256i round_const = + _mm256_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + const __m256i wt = unpack_weights_avx2(conv_params); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int offset_1 = (1 << (bd + FILTER_BITS - 2)); + const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); + const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + const __m256i zero = _mm256_setzero_si256(); + __m256i coeffs[4], s[8]; + + assert((FILTER_BITS - conv_params->round_0) >= 0); + + prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); + + // Condition for checking valid vert_filt taps + if 
(!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src4; + // Load lines a and b. Line a to lower 128, line b to upper 128 + { + __m256i src_ab[4]; + __m256i src_a[5]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 4; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src4 = src_a[4]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + + s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + } + + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 5) * src_stride + j]; + const __m256i src5 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); + + src4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); + + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); + + __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); + + res_lo = _mm256_add_epi16(res_lo, offset_const_1); + + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + 
_mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); + + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); + + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); + *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); + + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i 
res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } + } + s[0] 
= s[1]; + s[1] = s[2]; + + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + // Load lines a and b. Line a to lower 128, line b to upper 128 + { + __m256i src_ab[7]; + __m256i src_a[7]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 6; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src6 = src_a[6]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); + s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); + } + + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 7) * src_stride + j]; + const __m256i src7 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + __m256i res_lo = convolve_lowbd(s, coeffs); + + res_lo = _mm256_add_epi16(res_lo, offset_const_1); + + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + 
_mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); + + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); + + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); + *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res_hi = convolve_lowbd(s + 4, coeffs); + + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const 
__m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + 
_mm_store_si128(
+                (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+                res_hi_1);
+          }
+        }
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+    }
+  }
+}
+
+/*
+ * 8-bit distance-weighted compound 2-D convolution (AVX2).
+ *
+ * The block is processed in 8-column strips (every j loop steps by 8).  For
+ * each strip a horizontal pass writes 16-bit rows into im_block (row stride
+ * im_stride == 8) and a vertical pass consumes them.  Three paths exist:
+ *   - is_horiz_4tap: inline 4-tap horizontal pass, vertical pass supplied by
+ *     DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+ *   - is_vert_4tap: horizontal pass supplied by
+ *     DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP, inline 4-tap vertical pass;
+ *   - otherwise: both passes come from the two macros.
+ * NOTE(review): both macros are defined outside this function; presumably
+ * they expand against the locals declared here (im_block, src_h, coeffs_*,
+ * filt, round_const_*, ...) -- confirm before renaming any local.
+ *
+ * Output, as visible in the inline vertical pass: when
+ * conv_params->do_average is 0 the offset 16-bit result is stored to
+ * conv_params->dst; otherwise it is combined with the values already in
+ * conv_params->dst via comp_avg() and convolve_rounding(), packed to 8 bits
+ * and written to dst0.
+ */
+void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
+                                   uint8_t *dst0, int dst_stride0, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
+                                   const int subpel_x_qn, const int subpel_y_qn,
+                                   ConvolveParams *conv_params) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int bd = 8;
+
+  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+
+  int im_stride = 8;
+  int i, is_horiz_4tap = 0, is_vert_4tap = 0;
+  const __m256i wt = unpack_weights_avx2(conv_params);
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+  // Offset added to the 16-bit intermediate before it is stored/averaged;
+  // the same constant is handed to convolve_rounding() on the average path.
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+
+  // round_0 - 1 is used as a shift count below, so round_0 must be positive.
+  assert(conv_params->round_0 > 0);
+
+  const __m256i round_const_h = _mm256_set1_epi16(
+      ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
+  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+  const __m256i round_const_v = _mm256_set1_epi32(
+      ((1 << conv_params->round_1) >> 1) -
+      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+  __m256i filt[4], coeffs_x[4], coeffs_y[4];
+
+  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x);
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+  // Condition for checking valid horz_filt taps: the 4-tap path is taken when
+  // the low 32 bits of (coeffs_x[0] | coeffs_x[3]) are zero.
+  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
+    is_horiz_4tap = 1;
+
+  // Condition for checking valid vert_filt taps (same test on coeffs_y).
+  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
+    is_vert_4tap = 1;
+
+  if (is_horiz_4tap) {
+    int im_h = h + filter_params_y->taps - 1;
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const int fo_horiz = 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+    for (int j = 0; j < w; j += 8) {
+      /* Horizontal filter */
+      const uint8_t *src_h = src_ptr + j;
+      // Two source rows per iteration: row i in the low 128-bit lane, row
+      // i + 1 in the high lane.
+      for (i = 0; i < im_h; i += 2) {
+        __m256i data =
+            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
+        // The second row is only loaded while it is still inside the im_h
+        // intermediate rows.
+        if (i + 1 < im_h)
+          data = _mm256_inserti128_si256(
+              data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
+        src_h += (src_stride << 1);
+        __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
+
+        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
+                               round_shift_h);
+
+        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+      }
+      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+    }
+  } else if (is_vert_4tap) {
+    int im_h = h + 3;
+    const int fo_vert = 1;
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    for (int j = 0; j < w; j += 8) {
+      /* Horizontal filter */
+      const uint8_t *src_h = src_ptr + j;
+      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+
+      /* Vertical filter */
+      // s[0..2] hold the interleaved low halves and s[3..5] the interleaved
+      // high halves of consecutive im_block loads; s[2] and s[5] are refilled
+      // on every iteration.
+      __m256i s[6];
+      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+      s[0] = _mm256_unpacklo_epi16(s0, s1);
+      s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+      s[3] = _mm256_unpackhi_epi16(s0, s1);
+      s[4] = _mm256_unpackhi_epi16(s2, s3);
+
+      // Two output rows (i and i + 1) per iteration.
+      for (i = 0; i < h; i += 2) {
+        const int16_t *data = &im_block[i * im_stride];
+
+        const __m256i s4 =
+            _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+        const __m256i s5 =
+            _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+        s[2] = _mm256_unpacklo_epi16(s4, s5);
+        s[5] = _mm256_unpackhi_epi16(s4, s5);
+
+        const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
+        const __m256i res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+
+        if (w - j > 4) {
+          // More than 4 columns remain in this strip: also filter the high
+          // halves (s + 3) and write 8 pixels per row.
+          const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
+          const __m256i res_b_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
+          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+          if (do_average) {
+            const __m256i data_ref_0 =
+                load_line2_avx2(&dst[i * dst_stride + j],
+                                &dst[i * dst_stride + j + dst_stride]);
+            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+                                                  &wt, use_dist_wtd_comp_avg);
+
+            const __m256i round_result = convolve_rounding(
+                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+            const __m256i res_8 =
+                _mm256_packus_epi16(round_result, round_result);
+            const __m128i res_0 = _mm256_castsi256_si128(res_8);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+            _mm_storel_epi64(
+                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+          } else {
+            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_1);
+          }
+        } else {
+          // 4 columns or fewer remain: only the low half is filtered (it is
+          // packed with itself) and the average path writes 4 bytes per row.
+          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
+          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+          if (do_average) {
+            const __m256i data_ref_0 =
+                load_line2_avx2(&dst[i * dst_stride + j],
+                                &dst[i * dst_stride + j + dst_stride]);
+
+            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+                                                  &wt, use_dist_wtd_comp_avg);
+
+            const __m256i round_result = convolve_rounding(
+                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+            const __m256i res_8 =
+                _mm256_packus_epi16(round_result, round_result);
+            const __m128i res_0 = _mm256_castsi256_si128(res_8);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+            *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+            *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+                _mm_cvtsi128_si32(res_1);
+
+          } else {
+            // NOTE(review): this still stores a full 128-bit row (8 int16
+            // values) to dst even though only 4 columns are being produced.
+            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_1);
+          }
+        }
+        // Slide the filter window down for the next pair of output rows.
+        s[0] = s[1];
+        s[1] = s[2];
+        s[3] = s[4];
+        s[4] = s[5];
+      }
+    }
+  } else {
+    int im_h = h + filter_params_y->taps - 1;
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    for (int j = 0; j < w; j += 8) {
+      /* Horizontal filter */
+      const uint8_t *src_h = src_ptr + j;
+      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+
+      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+    }
+  }
+}
+
+/*
+ * For each of the four (rN, cN) pairs: load 16 source bytes at
+ * src[rN * src_stride + cN], widen them to 16 bits, shift left by LEFT_SHIFT,
+ * add offset_const and store the 16 values to dst[rN * dst_stride + cN] with
+ * an aligned 256-bit store.  Uses src_0..src_3, src, dst, the strides and
+ * offset_const from the expansion site.
+ */
+#define DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3) \
+  do { \
+    src_0 =
_mm256_cvtepu8_epi16( \
+        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \
+    src_1 = _mm256_cvtepu8_epi16( \
+        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \
+    src_2 = _mm256_cvtepu8_epi16( \
+        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \
+    src_3 = _mm256_cvtepu8_epi16( \
+        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \
+ \
+    src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \
+    src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \
+    src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \
+    src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \
+ \
+    src_0 = _mm256_add_epi16(src_0, offset_const); \
+    src_1 = _mm256_add_epi16(src_1, offset_const); \
+    src_2 = _mm256_add_epi16(src_2, offset_const); \
+    src_3 = _mm256_add_epi16(src_3, offset_const); \
+ \
+    _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \
+    _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \
+    _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \
+    _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \
+  } while (0)
+
+/*
+ * Left shift applied to copied pixels.  NOTE(review): the literals 3 and 7
+ * presumably stand for the round_0 / round_1 values that
+ * av1_dist_wtd_convolve_2d_copy_avx2() asserts -- confirm.
+ */
+#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
+/*
+ * Copy path without averaging: every source pixel is widened to 16 bits,
+ * shifted left by LEFT_SHIFT, offset by offset_const and stored to dst.
+ *
+ * w >= 16: only w == 128, 64, 32 and 16 are handled (1, 1, 2 and 4 rows per
+ *          iteration respectively); any other width >= 16 writes nothing.
+ * w <  16: 4 rows per iteration; 8 source bytes are loaded per row and
+ *          8 int16 values are stored per row.
+ *
+ * All loops are do/while loops that stop when the row counter i reaches
+ * exactly zero, so h must be a positive multiple of the rows consumed per
+ * iteration on the chosen path.
+ */
+static AOM_INLINE void av1_dist_wtd_convolve_2d_no_avg_copy_avx2(
+    const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride,
+    int w, int h, const __m256i offset_const) {
+  int i = h;
+  if (w >= 16) {
+    // Scratch registers named by DO_NO_AVG_2D_COPY_4X16.
+    __m256i src_0, src_1, src_2, src_3;
+    if (w == 128) {
+      do {
+        DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
+        DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112);
+        src += 1 * src_stride;
+        dst += 1 * dst_stride;
+        i -= 1;
+      } while (i);
+    } else if (w == 64) {
+      do {
+        DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
+        src += 1 * src_stride;
+        dst += 1 * dst_stride;
+        i -= 1;
+      } while (i);
+    } else if (w == 32) {
+      do {
+        DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        i -= 2;
+      } while (i);
+    } else if (w == 16) {
+      do {
+        DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0);
+        src += 4 * src_stride;
+        dst += 4 * dst_stride;
+        i -= 4;
+      } while (i);
+    }
+  } else {
+    const __m256i zero = _mm256_setzero_si256();
+    do {
+      const __m128i src_row_0 =
+          _mm_loadl_epi64((__m128i *)(&src[0 * src_stride]));
+      const __m128i src_row_1 =
+          _mm_loadl_epi64((__m128i *)(&src[1 * src_stride]));
+      const __m128i src_row_2 =
+          _mm_loadl_epi64((__m128i *)(&src[2 * src_stride]));
+      const __m128i src_row_3 =
+          _mm_loadl_epi64((__m128i *)(&src[3 * src_stride]));
+
+      // Rows 0/1 and rows 2/3 share one 256-bit register each (even row in
+      // the low lane, odd row in the high lane).
+      __m256i src_10 = _mm256_insertf128_si256(
+          _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+      __m256i src_32 = _mm256_insertf128_si256(
+          _mm256_castsi128_si256(src_row_2), src_row_3, 1);
+
+      // Zero-extend the 8 bytes of each row to 16 bits.
+      src_10 = _mm256_unpacklo_epi8(src_10, zero);
+      src_32 = _mm256_unpacklo_epi8(src_32, zero);
+
+      src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT);
+      src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT);
+
+      src_10 = _mm256_add_epi16(src_10, offset_const);
+      src_32 = _mm256_add_epi16(src_32, offset_const);
+
+      // Store the four rows to the destination buffer (nothing is
+      // accumulated on this path; each row is an aligned 128-bit store).
+      _mm_store_si128((__m128i *)(&dst[0 * dst_stride]),
+                      _mm256_castsi256_si128(src_10));
+      _mm_store_si128((__m128i *)(&dst[1 * dst_stride]),
+                      _mm256_extracti128_si256(src_10, 1));
+      _mm_store_si128((__m128i *)(&dst[2 * dst_stride]),
+                      _mm256_castsi256_si128(src_32));
+      _mm_store_si128((__m128i *)(&dst[3 * dst_stride]),
+                      _mm256_extracti128_si256(src_32, 1));
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      i -= 4;
+    } while (i);
+  }
+}
+
+/*
+ * Averaging counterpart of DO_NO_AVG_2D_COPY_4X16: the scaled and offset
+ * source values are combined with the 16-bit values already in dst through
+ * comp_avg() and convolve_rounding(), packed to 8 bits and stored to dst0.
+ * Uses src_*, ref_*, res_*, wt, offset_const, rounding_const and
+ * rounding_shift from the expansion site.
+ */
+#define DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3) \
+  do { \
+    src_0 = _mm256_cvtepu8_epi16( \
+        _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \
+    src_1 = _mm256_cvtepu8_epi16( \
+        _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \
+    src_2 = _mm256_cvtepu8_epi16( \
+        _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \
+    src_3 = _mm256_cvtepu8_epi16( \
+        _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \
+ \
+    src_0 = _mm256_slli_epi16(src_0,
LEFT_SHIFT); \ + src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ + src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ + src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ + src_0 = _mm256_add_epi16(src_0, offset_const); \ + src_1 = _mm256_add_epi16(src_1, offset_const); \ + src_2 = _mm256_add_epi16(src_2, offset_const); \ + src_3 = _mm256_add_epi16(src_3, offset_const); \ + \ + ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0])); \ + ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1])); \ + ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2])); \ + ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3])); \ + \ + res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED); \ + res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED); \ + res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED); \ + res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED); \ + \ + res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const, \ + rounding_shift); \ + res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const, \ + rounding_shift); \ + res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const, \ + rounding_shift); \ + res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const, \ + rounding_shift); \ + \ + res_10 = _mm256_packus_epi16(res_0, res_1); \ + res_32 = _mm256_packus_epi16(res_2, res_3); \ + res_10 = _mm256_permute4x64_epi64(res_10, 0xD8); \ + res_32 = _mm256_permute4x64_epi64(res_32, 0xD8); \ + \ + _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]), \ + _mm256_castsi256_si128(res_10)); \ + _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]), \ + _mm256_extracti128_si256(res_10, 1)); \ + _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]), \ + _mm256_castsi256_si128(res_32)); \ + _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]), \ + _mm256_extracti128_si256(res_32, 1)); \ + } while (0) + +#define DO_AVG_2D_COPY(USE_DIST_WEIGHTED) \ + int i = h; \ + if (w >= 16) { \ 
+ __m256i src_0, src_1, src_2, src_3; \ + __m256i ref_0, ref_1, ref_2, ref_3; \ + __m256i res_0, res_1, res_2, res_3; \ + __m256i res_10, res_32; \ + if (w == 128) { \ + do { \ + DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ + DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112); \ + i -= 1; \ + src += 1 * src_stride; \ + dst += 1 * dst_stride; \ + dst0 += 1 * dst_stride0; \ + } while (i); \ + } else if (w == 64) { \ + do { \ + DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ + \ + i -= 1; \ + src += 1 * src_stride; \ + dst += 1 * dst_stride; \ + dst0 += 1 * dst_stride0; \ + } while (i); \ + } else if (w == 32) { \ + do { \ + DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16); \ + \ + i -= 2; \ + src += 2 * src_stride; \ + dst += 2 * dst_stride; \ + dst0 += 2 * dst_stride0; \ + } while (i); \ + } else { \ + assert(w == 16); \ + do { \ + DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0); \ + \ + i -= 4; \ + src += 4 * src_stride; \ + dst += 4 * dst_stride; \ + dst0 += 4 * dst_stride0; \ + } while (i); \ + } \ + } else if (w == 8) { \ + do { \ + const __m128i src_0 = \ + _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); \ + const __m128i src_1 = \ + _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); \ + const __m128i src_2 = \ + _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); \ + const __m128i src_3 = \ + _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); \ + __m256i src_10 = \ + _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1); \ + __m256i src_32 = \ + _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1); \ + \ + src_10 = _mm256_unpacklo_epi8(src_10, zero); \ + src_32 = _mm256_unpacklo_epi8(src_32, zero); \ + \ + src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); \ + src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); \ + \ + src_10 = _mm256_add_epi16(src_10, offset_const); \ + src_32 = _mm256_add_epi16(src_32, offset_const); \ + \ + const __m256i ref_10 
= \ + load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]); \ + const __m256i ref_32 = \ + load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]); \ + __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED); \ + __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED); \ + \ + res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const, \ + rounding_shift); \ + res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const, \ + rounding_shift); \ + \ + __m256i res = _mm256_packus_epi16(res_10, res_32); \ + const __m128i res_20 = _mm256_castsi256_si128(res); \ + const __m128i res_31 = _mm256_extracti128_si256(res, 1); \ + \ + _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20); \ + _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31); \ + _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20); \ + _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31); \ + i -= 4; \ + src += 4 * src_stride; \ + dst += 4 * dst_stride; \ + dst0 += 4 * dst_stride0; \ + } while (i); \ + } else { \ + assert(w == 4); \ + do { \ + __m256i src_3210_8bit = \ + _mm256_setr_epi32(loadu_int32(src + 0 * src_stride), \ + loadu_int32(src + 1 * src_stride), 0, 0, \ + loadu_int32(src + 2 * src_stride), \ + loadu_int32(src + 3 * src_stride), 0, 0); \ + \ + __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero); \ + src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT); \ + src_3210 = _mm256_add_epi16(src_3210, offset_const); \ + \ + __m256i ref_3210 = \ + _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride), \ + *(int64_t *)(dst + 1 * dst_stride), \ + *(int64_t *)(dst + 2 * dst_stride), \ + *(int64_t *)(dst + 3 * dst_stride)); \ + __m256i res_3210 = \ + comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED); \ + \ + res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \ + rounding_shift); \ + \ + res_3210 = _mm256_packus_epi16(res_3210, res_3210); \ + const __m128i res_10 = 
_mm256_castsi256_si128(res_3210); \
+      const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1); \
+ \
+      *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10); \
+      *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32); \
+      *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1); \
+      *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1); \
+      i -= 4; \
+      src += 4 * src_stride; \
+      dst += 4 * dst_stride; \
+      dst0 += 4 * dst_stride0; \
+    } while (i); \
+  }
+
+/*
+ * Distance-weighted compound "copy" case: no interpolation filter is
+ * involved, the source pixels are only scaled and offset.
+ *
+ *  - do_average == 0: delegates to
+ *    av1_dist_wtd_convolve_2d_no_avg_copy_avx2(), which writes 16-bit values
+ *    to conv_params->dst.
+ *  - do_average != 0: expands DO_AVG_2D_COPY with a literal 1 or 0 for
+ *    USE_DIST_WEIGHTED (chosen by use_dist_wtd_comp_avg); that code combines
+ *    the new values with conv_params->dst and writes 8-bit pixels to dst0.
+ *
+ * The asserts require round_0 == 3, round_1 == 7 and w, h to be multiples
+ * of 4.
+ *
+ * DO_AVG_2D_COPY refers to this function's parameters and locals by name
+ * (src, dst, dst0, the strides, w, h, wt, zero, offset_const,
+ * rounding_const, rounding_shift), so none of them may be renamed.
+ */
+void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
+                                        uint8_t *dst0, int dst_stride0, int w,
+                                        int h, ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  assert(conv_params->round_0 == 3);
+  assert(conv_params->round_1 == 7);
+  assert(w % 4 == 0);
+  assert(h % 4 == 0);
+
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+  const __m256i wt = unpack_weights_avx2(conv_params);
+  /* Only referenced inside the DO_AVG_2D_COPY expansion. */
+  const __m256i zero = _mm256_setzero_si256();
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+
+  if (do_average) {
+    /* The macro is expanded twice so USE_DIST_WEIGHTED is a literal in each
+     * copy; each expansion declares its own row counter `int i = h;`. */
+    if (use_dist_wtd_comp_avg) {
+      DO_AVG_2D_COPY(1)
+    } else {
+      DO_AVG_2D_COPY(0)
+    }
+  } else {
+    av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride,
+                                              w, h, offset_const);
+  }
+}
+#undef LEFT_SHIFT
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
new file mode 100644
index 0000000000..8c5d9918fb
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
@@ -0,0 +1,606 @@
+/*
+ *
Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include /* NOTE(review): header name missing here; presumably <emmintrin.h> (SSE2 intrinsics) -- confirm. */
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+/*
+ * Horizontal-only distance-weighted compound convolution, 8-bit, SSE2.
+ *
+ * Per pixel: filter with convolve_lo_x(), round and shift right by round_0,
+ * shift left by FILTER_BITS - round_1, pack to 16 bits and add the offset.
+ * With conv_params->do_average == 0 that value is stored to
+ * conv_params->dst; otherwise it is combined with the value already in
+ * conv_params->dst via comp_avg() and convolve_rounding(), packed to 8 bits
+ * and written to dst0.
+ *
+ * w == 4 is handled one row at a time (the loop runs `while (--h)`, so h
+ * must be at least 1); every other width must be a multiple of 8 (asserted)
+ * and is handled 8 pixels at a time.
+ */
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst0, int dst_stride0, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const int subpel_x_qn,
+                                  ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  // Step back so the filter window is positioned around each output pixel.
+  const uint8_t *src_ptr = src - fo_horiz;
+  const int bits = FILTER_BITS - conv_params->round_1;
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi16(w0);
+  const __m128i wt1 = _mm_set1_epi16(w1);
+  // Interleaved (fwd_offset, bck_offset) 16-bit pairs, passed to comp_avg().
+  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+  __m128i coeffs[4];
+
+  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);
+
+  if (w == 4) {
+    do {
+      const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+      __m128i s[4];
+
+      // s[k] interleaves the source bytes at offsets 2k and 2k + 1.
+      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+      s[1] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+      s[2] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+      s[3] =
+          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+      const __m128i res_lo = convolve_lo_x(s, coeffs);
+      const __m128i res_lo_round =
+          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+      const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+
+      // The four results are packed with themselves, so both 64-bit halves
+      // of res_16b hold the same values.
+      const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift);
+      const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+      // Accumulate values into the destination buffer
+      if (do_average) {
+        const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+        const __m128i comp_avg_res =
+            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+        const __m128i round_result = convolve_rounding(
+            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+        // Writes the low 4 bytes (4 pixels) of the packed result.
+        *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+      } else {
+        // NOTE(review): aligned 128-bit store, i.e. 8 int16 values are
+        // written for a 4-wide block -- confirm dst rows allow that.
+        _mm_store_si128((__m128i *)(&dst[0]), res_unsigned);
+      }
+      src_ptr += src_stride;
+      dst += dst_stride;
+      dst0 += dst_stride0;
+    } while (--h);
+  } else {
+    assert(!(w % 8));
+    int i = 0;
+    do {
+      int j = 0;
+      do {
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        __m128i s[4];
+
+        // Filter even-index pixels
+        s[0] = data;
+        s[1] = _mm_srli_si128(data, 2);
+        s[2] = _mm_srli_si128(data, 4);
+        s[3] = _mm_srli_si128(data, 6);
+        const __m128i res_even = convolve_lo_x(s, coeffs);
+
+        // Filter odd-index pixels
+        s[0] = _mm_srli_si128(data, 1);
+        s[1] = _mm_srli_si128(data, 3);
+        s[2] = _mm_srli_si128(data, 5);
+        s[3] = _mm_srli_si128(data, 7);
+        const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+        const __m128i res_lo_round =
+            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+        const __m128i res_hi_round =
+            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+        const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+        const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift);
+
+        const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+        const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+        // Accumulate values into the destination buffer
+        if (do_average) {
+          const __m128i data_ref_0 =
+              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+          const __m128i comp_avg_res =
+              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+          const __m128i round_result = convolve_rounding(
+              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+        } else {
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+        }
+        j += 8;
+      } while (j < w);
+    } while (++i < h);
+  }
+}
+
+/*
+ * Vertical-only distance-weighted compound convolution, 8-bit, SSE2.
+ * Mirrors the horizontal version, but filters along columns with
+ * convolve_lo_y()/convolve_hi_y(), applies the left shift
+ * (FILTER_BITS - round_0) before rounding by round_1, and produces two
+ * output rows per loop iteration.
+ */
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst0, int dst_stride0, int w, int h,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_y_qn,
+                                  ConvolveParams *conv_params) {
+  const int bd = 8;
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  const int dst_stride = conv_params->dst_stride;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint8_t *src_ptr = src - fo_vert * src_stride;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const __m128i left_shift =
_mm_cvtsi32_si128(bits); + const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset); + const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + __m128i coeffs[4]; + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); + + if (w == 4) { + __m128i s[8], src6, res, res_shift; + src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6); + + do { + s[6] = _mm_unpacklo_epi8( + src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride))); + src6 = _mm_cvtsi32_si128(*(int 
*)(src_ptr + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6); + + res = convolve_lo_y(s + 0, coeffs); + res_shift = _mm_sll_epi32(res, left_shift); + res_shift = + _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); + + __m128i res_16b = _mm_packs_epi32(res_shift, res_shift); + __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + + } else { + _mm_store_si128((__m128i *)dst, res_unsigned); + } + + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + + res = convolve_lo_y(s + 1, coeffs); + res_shift = _mm_sll_epi32(res, left_shift); + res_shift = + _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); + + res_16b = _mm_packs_epi32(res_shift, res_shift); + res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + + } else { + _mm_store_si128((__m128i *)dst, res_unsigned); + } + + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + 
s[5] = s[7]; + h -= 2; + } while (h); + } else { + assert(!(w % 8)); + int j = 0; + do { + __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift; + const uint8_t *data = &src_ptr[j]; + + src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); + + int i = 0; + do { + data = &src_ptr[i * src_stride + j]; + s[6] = _mm_unpacklo_epi8( + src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); + + res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels + res_lo_shift = _mm_sll_epi32(res_lo, left_shift); + res_hi_shift = _mm_sll_epi32(res_hi, left_shift); + res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), + round_shift); + res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), + round_shift); + + __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + 
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + i++; + + res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels + res_lo_shift = _mm_sll_epi32(res_lo, left_shift); + res_hi_shift = _mm_sll_epi32(res_hi, left_shift); + res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), + round_shift); + res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), + round_shift); + res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + } while (i < h); + j += 8; + } while (j < w); + } +} + +void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams 
*filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, 
tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + __m128i temp_lo, temp_hi; + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + const __m128i src_lo = _mm_unpacklo_epi8(data, zero); + const __m128i src_hi = _mm_unpackhi_epi8(data, zero); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 4); + temp_hi = _mm_slli_si128(src_hi, 12); + const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 8); + temp_hi = _mm_slli_si128(src_hi, 8); + const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + temp_lo = _mm_srli_si128(src_lo, 12); + temp_hi = _mm_slli_si128(src_hi, 4); + const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + temp_lo = _mm_srli_si128(src_lo, 2); + temp_hi = _mm_slli_si128(src_hi, 14); + const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 6); + temp_hi = _mm_slli_si128(src_hi, 10); + const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 10); + temp_hi = _mm_slli_si128(src_hi, 6); + const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_5 = 
_mm_madd_epi16(src_5, coeff_45); + temp_lo = _mm_srli_si128(src_lo, 14); + temp_hi = _mm_slli_si128(src_hi, 2); + const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i 
*)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + else + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c new file mode 100644 index 0000000000..f6bf67815d --- /dev/null +++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_dist_wtd_convolve_2d_ssse3( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const
__m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + const __m128i src_lo = _mm_unpacklo_epi8(data, zero); + const __m128i src_hi = _mm_unpackhi_epi8(data, zero); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); + const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i 
res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, 
coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + else + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c new file mode 100644 index 0000000000..71fab7a577 --- /dev/null +++ b/third_party/aom/av1/common/x86/reconinter_avx2.c @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "av1/common/blockd.h" + +static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, + const __m256i s1) { + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)); + return _mm256_abs_epi16( + _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4))); + // clamp(diff, 0, 64) can be skipped for diff is always in the range ( 38, 54) +} +void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { + const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0; + const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); + int i = 0; + if (4 == w) { + do { + const __m128i s0A = xx_loadl_32(src0); + const __m128i s0B = xx_loadl_32(src0 + src0_stride); + const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3); + const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); + const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); + const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); + const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); + + const __m128i s1A = xx_loadl_32(src1); + const __m128i s1B = xx_loadl_32(src1 + src1_stride); + const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3); + const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); + const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); + const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); + const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD); + const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + const __m128i x_m8 = +
_mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); + xx_storeu_128(mask, x_m8); + src0 += (src0_stride << 2); + src1 += (src1_stride << 2); + mask += 16; + i += 4; + } while (i < h); + } else if (8 == w) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); + const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); + const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); + const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w); + const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w); + const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); + yy_storeu_256(mask, m8); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (16 == w) { + do { + const __m128i s0A = xx_load_128(src0); + const __m128i s0B = xx_load_128(src0 + src0_stride); + const __m128i s1A = xx_load_128(src1); + const __m128i s1B = xx_load_128(src1 + src1_stride); + const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); + const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); + const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); + const __m256i s1BL = _mm256_cvtepu8_epi16(s1B); + + const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL); + const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL); + + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); + yy_storeu_256(mask, m8); + src0 += src0_stride << 1; + src1 += 
src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else { + do { + int j = 0; + do { + const __m256i s0 = yy_loadu_256(src0 + j); + const __m256i s1 = yy_loadu_256(src1 + j); + const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0)); + const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); + const __m256i s0H = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1)); + const __m256i s1H = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); + const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L); + const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H); + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8); + yy_storeu_256(mask + j, m8); + j += 32; + } while (j < w); + src0 += src0_stride; + src1 += src1_stride; + mask += w; + i += 1; + } while (i < h); + } +} + +static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0, + const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const __m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + return diff_clamp; +} + +static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0, + const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, + int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const 
__m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp); + return diff_const_16; +} + +static INLINE void build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD = + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i 
s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + 
const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const 
__m256i m16G = + calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +static INLINE void build_compound_diffwtd_mask_d16_inv_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = + calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + 
_mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD = + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += 
src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const 
__m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const __m256i m16G = + calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + // When rounding constant is added, there is a possibility of overflow. + // However that much precision is not required. Code should very well work for + // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But + // there is a possibility of corner case bugs. 
+ assert(DIFF_FACTOR_LOG2 == 4); + assert(AOM_BLEND_A64_MAX_ALPHA == 64); + + if (mask_type == DIFFWTD_38) { + build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } else { + build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH + +void av1_build_compound_diffwtd_mask_highbd_avx2( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + if (w < 16) { + av1_build_compound_diffwtd_mask_highbd_ssse3( + mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd); + } else { + assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); + assert(bd >= 8); + assert((w % 16) == 0); + const __m256i y0 = _mm256_setzero_si256(); + const __m256i yAOM_BLEND_A64_MAX_ALPHA = + _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const int mask_base = 38; + const __m256i ymask_base = _mm256_set1_epi16(mask_base); + const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); + const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); + if (bd == 8) { + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_srai_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + 
__m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_srai_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } else { + const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_sra_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_sra_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + 
_mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } + } +} + +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/x86/reconinter_sse4.c b/third_party/aom/av1/common/x86/reconinter_sse4.c new file mode 100644 index 0000000000..eb4a4d1da3 --- /dev/null +++ b/third_party/aom/av1/common/x86/reconinter_sse4.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 +#include /* SSE4.1 */ + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "av1/common/blockd.h" +#include "config/av1_rtcd.h" + +static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0, + const __m128i s1) { + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1)); + return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4))); + // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54) +} + +void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int stride0, + const uint8_t *src1, int stride1, + int h, int w) { + const int mb = (mask_type == DIFFWTD_38_INV) ? 
AOM_BLEND_A64_MAX_ALPHA : 0; + const __m128i mask_base = _mm_set1_epi16(38 - mb); + int i = 0; + if (4 == w) { + do { + const __m128i s0A = _mm_cvtsi32_si128(*(int *)src0); + const __m128i s0B = _mm_cvtsi32_si128(*(int *)(src0 + stride0)); + const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); + const __m128i s0 = _mm_cvtepu8_epi16(s0AB); + + const __m128i s1A = _mm_cvtsi32_si128(*(int *)src1); + const __m128i s1B = _mm_cvtsi32_si128(*(int *)(src1 + stride1)); + const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); + const __m128i s1 = _mm_cvtepu8_epi16(s1AB); + + const __m128i m16 = calc_mask(mask_base, s0, s1); + const __m128i m8 = _mm_packus_epi16(m16, m16); + + *(int *)mask = _mm_cvtsi128_si32(m8); + *(int *)(mask + w) = _mm_extract_epi32(m8, 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += 8; + i += 2; + } while (i < h); + } else if (8 == w) { + do { + __m128i s0 = _mm_loadl_epi64((__m128i const *)src0); + __m128i s1 = _mm_loadl_epi64((__m128i const *)src1); + s0 = _mm_cvtepu8_epi16(s0); + s1 = _mm_cvtepu8_epi16(s1); + const __m128i m16 = calc_mask(mask_base, s0, s1); + const __m128i m8 = _mm_packus_epi16(m16, m16); + _mm_storel_epi64((__m128i *)mask, m8); + src0 += stride0; + src1 += stride1; + mask += 8; + i += 1; + } while (i < h); + } else { + const __m128i zero = _mm_setzero_si128(); + do { + int j = 0; + do { + const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j)); + const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j)); + const __m128i s0L = _mm_cvtepu8_epi16(s0); + const __m128i s1L = _mm_cvtepu8_epi16(s1); + const __m128i s0H = _mm_unpackhi_epi8(s0, zero); + const __m128i s1H = _mm_unpackhi_epi8(s1, zero); + + const __m128i m16L = calc_mask(mask_base, s0L, s1L); + const __m128i m16H = calc_mask(mask_base, s0H, s1H); + + const __m128i m8 = _mm_packus_epi16(m16L, m16H); + _mm_store_si128((__m128i *)(mask + j), m8); + j += 16; + } while (j < w); + src0 += stride0; + src1 += stride1; + mask += w; + i += 1; + } while (i < h); 
+ } +} + +void av1_build_compound_diffwtd_mask_d16_sse4_1( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1; + const int mask_base = 38; + int round = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + const __m128i round_const = _mm_set1_epi16((1 << round) >> 1); + const __m128i mask_base_16 = _mm_set1_epi16(mask_base); + const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i add_const = + _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0)); + const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1)); + + int i, j; + // When rounding constant is added, there is a possibility of overflow. + // However that much precision is not required. Code should very well work for + // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But + // there is a possibility of corner case bugs. 
+ assert(DIFF_FACTOR_LOG2 == 4); + assert(AOM_BLEND_A64_MAX_ALPHA == 64); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data_src0 = + _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]); + const __m128i data_src1 = + _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]); + + const __m128i diffa = _mm_subs_epu16(data_src0, data_src1); + const __m128i diffb = _mm_subs_epu16(data_src1, data_src0); + const __m128i diff = _mm_max_epu16(diffa, diffb); + const __m128i diff_round = + _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round); + const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16); + __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff); + // clamp to 0 can be skipped since we are using add and saturate + // instruction + + const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign); + const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const); + + // 8 bit conversion and saturation to uint8 + const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16); + + // Store values into the destination buffer + __m128i *const dst = (__m128i *)&mask[i * w + j]; + + if ((w - j) > 4) { + _mm_storel_epi64(dst, res_8); + } else { // w==4 + *(int *)dst = _mm_cvtsi128_si32(res_8); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/reconinter_ssse3.c b/third_party/aom/av1/common/x86/reconinter_ssse3.c new file mode 100644 index 0000000000..c9a3709a62 --- /dev/null +++ b/third_party/aom/av1/common/x86/reconinter_ssse3.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#if CONFIG_AV1_HIGHBITDEPTH + +#include + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/blockd.h" + +void av1_build_compound_diffwtd_mask_highbd_ssse3( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + if (w < 8) { + av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride, + src1, src1_stride, h, w, bd); + } else { + assert(bd >= 8); + assert((w % 8) == 0); + assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); + const __m128i x0 = _mm_setzero_si128(); + const __m128i xAOM_BLEND_A64_MAX_ALPHA = + _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const int mask_base = 38; + const __m128i xmask_base = _mm_set1_epi16(mask_base); + const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); + const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); + if (bd == 8) { + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), + DIFF_FACTOR_LOG2); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff 
= _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), + DIFF_FACTOR_LOG2); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } else { + const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = + _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = + _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } + } +} + +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/x86/resize_ssse3.c b/third_party/aom/av1/common/x86/resize_ssse3.c new file mode 100644 index 0000000000..a7fdb5a9a4 --- /dev/null +++ b/third_party/aom/av1/common/x86/resize_ssse3.c @@ -0,0 +1,974 @@ +/* + * + * Copyright (c) 2020, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSSE3 +#include "config/av1_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_ssse3.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "av1/common/resize.h" + +static INLINE __m128i scale_plane_2_to_1_phase_0_kernel( + const uint8_t *const src, const __m128i *const mask) { + const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0])); + const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16])); + const __m128i a_and = _mm_and_si128(a, *mask); + const __m128i b_and = _mm_and_si128(b, *mask); + return _mm_packus_epi16(a_and, b_and); +} + +static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + // It utilizes the fact that the high byte of filter[3] is always 0 to clean + // half of f[0] and f[4]. 
+ assert(filter[3] >= 0 && filter[3] < 256); + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au)); + f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu)); +} + +static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + // compensate the subtracted 64 in f[1]. x4 is always non negative. + const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64)); + // add and saturate the results together + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x4); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]); + // compensate the subtracted 64 in f[2]. x5 is always non negative. 
+ const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64)); + __m128i temp; + + // add and saturate the results together + temp = _mm_adds_epi16(x0, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x3); + temp = _mm_adds_epi16(temp, x4); + temp = _mm_adds_epi16(temp, x5); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +static void scale_plane_2_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; + const __m128i mask = _mm_set1_epi16(0x00FF); + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask); + _mm_storeu_si128((__m128i *)dst, d); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; + const __m128i mask = _mm_set1_epi32(0x000000FF); + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask); + const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask); + const __m128i d2 = _mm_packus_epi16(d0, d1); + _mm_storeu_si128((__m128i *)dst, d2); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s, + const __m128i c0c1) { + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1); + const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1); + // round and shift by 7 bit each 16 bit + const 
__m128i t2 = _mm_adds_epi16(t0, k_64); + const __m128i t3 = _mm_adds_epi16(t1, k_64); + const __m128i t4 = _mm_srai_epi16(t2, 7); + const __m128i t5 = _mm_srai_epi16(t3, 7); + return _mm_packus_epi16(t4, t5); +} + +static void scale_plane_2_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[2], d[2]; + + // Horizontal + // Even rows + s[0] = _mm_loadu_si128((const __m128i *)(src + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + 16)); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + // odd rows + s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + d[1] = scale_plane_bilinear_kernel(s, c0c1); + + // Vertical + s[0] = _mm_unpacklo_epi8(d[0], d[1]); + s[1] = _mm_unpackhi_epi8(d[0], d[1]); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[8], d[8]; + + // Note: Using _mm_packus_epi32() in SSE4.1 could be faster. + // Here we tried to not use shuffle instructions which would be slow + // on some x86 CPUs. 
+ + // Horizontal + // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx + // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx + // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx + // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx + // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx + // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx + // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx + // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx + s[0] = _mm_loadu_si128((const __m128i *)(&src[0])); + s[1] = _mm_loadu_si128((const __m128i *)(&src[16])); + s[2] = _mm_loadu_si128((const __m128i *)(&src[32])); + s[3] = _mm_loadu_si128((const __m128i *)(&src[48])); + s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32)); + s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48)); + + // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx + // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx + // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx + // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx + // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx + // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx + // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx + // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx + d[0] = _mm_unpacklo_epi16(s[0], s[4]); + d[1] = _mm_unpackhi_epi16(s[0], s[4]); + d[2] = _mm_unpacklo_epi16(s[1], s[5]); + d[3] = _mm_unpackhi_epi16(s[1], s[5]); + d[4] = _mm_unpacklo_epi16(s[2], s[6]); + d[5] = _mm_unpackhi_epi16(s[2], s[6]); + d[6] = _mm_unpacklo_epi16(s[3], s[7]); + d[7] = _mm_unpackhi_epi16(s[3], s[7]); + + // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx + // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx + // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx 
+ // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx + // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx + // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx + // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx + // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx + s[0] = _mm_unpacklo_epi32(d[0], d[1]); + s[1] = _mm_unpackhi_epi32(d[0], d[1]); + s[2] = _mm_unpacklo_epi32(d[2], d[3]); + s[3] = _mm_unpackhi_epi32(d[2], d[3]); + s[4] = _mm_unpacklo_epi32(d[4], d[5]); + s[5] = _mm_unpackhi_epi32(d[4], d[5]); + s[6] = _mm_unpacklo_epi32(d[6], d[7]); + s[7] = _mm_unpackhi_epi32(d[6], d[7]); + + // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D + // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D + // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D + // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D + d[0] = _mm_unpacklo_epi32(s[0], s[1]); + d[1] = _mm_unpacklo_epi32(s[2], s[3]); + d[2] = _mm_unpacklo_epi32(s[4], s[5]); + d[3] = _mm_unpacklo_epi32(s[6], s[7]); + + d[0] = scale_plane_bilinear_kernel(&d[0], c0c1); + d[1] = scale_plane_bilinear_kernel(&d[2], c0c1); + + // Vertical + d[0] = scale_plane_bilinear_kernel(d, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * 
src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + do { + load_8bit_8x8(src + 4, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped) + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[2]); + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + transpose_16bit_4x8(&s[2], &s[2]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71 + + // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + d[0] = _mm_packus_epi16(d[0], d[0]); + d[1] = _mm_packus_epi16(d[1], d[1]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + d[0] = _mm_unpacklo_epi16(d[0], d[1]); + store_8bit_4x4_sse2(d[0], t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + + t += 4; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + t += 4 * width_hor; + y = height_ver; + + do { + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[2]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 
06 07 + d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + d[0] = _mm_packus_epi16(d[0], d[1]); + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + + s[0] = s[4]; + s[1] = s[5]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + do { + load_8bit_8x8(src + 2, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[3]); + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + transpose_16bit_4x8(&s[3], &s[3]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71 + d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72 + d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 
43 53 63 73 + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + d[0] = _mm_packus_epi16(d[0], d[2]); + d[1] = _mm_packus_epi16(d[1], d[3]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + d[2] = _mm_unpacklo_epi16(d[0], d[1]); + d[3] = _mm_unpackhi_epi16(d[0], d[1]); + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + d[0] = _mm_unpacklo_epi32(d[2], d[3]); + d[1] = _mm_unpackhi_epi32(d[2], d[3]); + store_8bit_8x4_from_16x2(d, t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + t += 8; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor)); + t += 6 * width_hor; + y = height_ver; + + do { + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[3]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17 + d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27 + d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[1] = _mm_packus_epi16(d[2], d[3]); + 
store_8bit_8x4_from_16x2(d, dst, dst_stride); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +typedef void (*shuffle_filter_funcs)(const int16_t *const filter, + __m128i *const f); + +typedef __m128i (*convolve8_funcs)(const __m128i *const s, + const __m128i *const f); + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[12], d[6], dd[4]; + __m128i f0[4], f1[5], f2[5]; + // The offset of the first row is always less than 1 pixel. + const int offset1_q4 = phase + 1 * step_q4; + const int offset2_q4 = phase + 2 * step_q4; + // offset_idxx indicates the pixel offset is even (0) or odd (1). + // It's used to choose the src offset and filter coefficient offset. 
+ const int offset_idx1 = (offset1_q4 >> 4) & 1; + const int offset_idx2 = (offset2_q4 >> 4) & 1; + static const shuffle_filter_funcs shuffle_filter_func_list[2] = { + shuffle_filter_ssse3, shuffle_filter_odd_ssse3 + }; + static const convolve8_funcs convolve8_func_list[2] = { + convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 + }; + + assert(w && h); + + shuffle_filter_ssse3(coef[(phase + 0 * step_q4) & SUBPEL_MASK], f0); + shuffle_filter_func_list[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + shuffle_filter_func_list[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + + // Sub 64 to avoid overflow. + // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. + // Coef 128 is in either fx[1] or fx[2] depending on the phase idx. + // When filter phase idx is 1, the two biggest coefficients are shuffled + // together, and the sum of them are always no less than 128. Sub 64 here. + // After the subtraction, when the sum of all positive coefficients are no + // larger than 128, and the sum of all negative coefficients are no + // less than -128, there will be no overflow in the convolve8 functions. 
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64)); + f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64)); + f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64)); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1; + + // horizontal 6x8 + do { + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[4]); + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // OC 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F + transpose_16bit_4x8(&s[4], &s[4]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + dd[0] = _mm_packus_epi16(d[0], d[2]); + dd[1] = _mm_packus_epi16(d[1], d[3]); + dd[2] = _mm_packus_epi16(d[4], d[4]); + dd[3] = _mm_packus_epi16(d[5], d[5]); + + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + // 04 14 05 15 24 34 25 35 44 54 45 55 64 
74 65 75 + d[0] = _mm_unpacklo_epi16(dd[0], dd[1]); + d[1] = _mm_unpackhi_epi16(dd[0], dd[1]); + d[2] = _mm_unpacklo_epi16(dd[2], dd[3]); + + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx + // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx + dd[0] = _mm_unpacklo_epi32(d[0], d[1]); + dd[1] = _mm_unpackhi_epi32(d[0], d[1]); + dd[2] = _mm_unpacklo_epi32(d[2], d[2]); + dd[3] = _mm_unpackhi_epi32(d[2], d[2]); + + // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx + // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx + // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx + // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx + d[0] = _mm_unpacklo_epi64(dd[0], dd[2]); + d[1] = _mm_unpackhi_epi64(dd[0], dd[2]); + d[2] = _mm_unpacklo_epi64(dd[1], dd[3]); + d[3] = _mm_unpackhi_epi64(dd[1], dd[3]); + + // store 4 extra pixels + storeu_8bit_16x4(d, t, stride_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + t += 12; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 3 * stride_hor + 4; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + loadu_8bit_16x4(t, stride_hor, s); + y = height_ver; + + do { + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7 + // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7 + t += 4 * stride_hor; + loadu_8bit_16x4(t, stride_hor, &s[4]); + + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = 
convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[2] = _mm_packus_epi16(d[2], d[3]); + d[4] = _mm_packus_epi16(d[4], d[5]); + + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]); + _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]); + _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * 2 * height_ver / 3; + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, + const __m128i *const f) { + __m128i ss[4], temp; + + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + temp = convolve8_8_ssse3(ss, f); + return _mm_packus_epi16(temp, temp); +} + +// Only calculate odd columns since even columns are just src pixels' copies. 
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst, + const int w, const __m128i *const f) { + int x = w; + + do { + __m128i s[8], temp; + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0)); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 1)); + s[2] = _mm_loadl_epi64((const __m128i *)(src + 2)); + s[3] = _mm_loadl_epi64((const __m128i *)(src + 3)); + s[4] = _mm_loadl_epi64((const __m128i *)(src + 4)); + s[5] = _mm_loadl_epi64((const __m128i *)(src + 5)); + s[6] = _mm_loadl_epi64((const __m128i *)(src + 6)); + s[7] = _mm_loadl_epi64((const __m128i *)(src + 7)); + temp = scale_1_to_2_phase_0_kernel(s, f); + _mm_storel_epi64((__m128i *)dst, temp); + src += 8; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_1_to_2_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int src_w, const int src_h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + int max_width; + int y; + uint8_t *tmp[9]; + __m128i f[4]; + + max_width = (src_w + 7) & ~7; + tmp[0] = temp_buffer + 0 * max_width; + tmp[1] = temp_buffer + 1 * max_width; + tmp[2] = temp_buffer + 2 * max_width; + tmp[3] = temp_buffer + 3 * max_width; + tmp[4] = temp_buffer + 4 * max_width; + tmp[5] = temp_buffer + 5 * max_width; + tmp[6] = temp_buffer + 6 * max_width; + tmp[7] = temp_buffer + 7 * max_width; + + shuffle_filter_ssse3(coef, f); + + scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f); + scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f); + scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f); + scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f); + scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f); + scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f); + scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f); + + y = src_h; + do { + int x; + 
scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f); + for (x = 0; x < max_width; x += 8) { + __m128i s[8], C, D, CD; + + // Even rows + const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x)); + const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + const __m128i ab = _mm_unpacklo_epi8(a, b); + _mm_storeu_si128((__m128i *)(dst + 2 * x), ab); + + // Odd rows + // Even columns + load_8bit_8x8(src + x - 3 * src_stride, src_stride, s); + C = scale_1_to_2_phase_0_kernel(s, f); + + // Odd columns + s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x)); + s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x)); + s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x)); + s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x)); + s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x)); + s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x)); + s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x)); + D = scale_1_to_2_phase_0_kernel(s, f); + + CD = _mm_unpacklo_epi8(C, D); + _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD); + } + + src += src_stride; + dst += 2 * dst_stride; + tmp[8] = tmp[0]; + tmp[0] = tmp[1]; + tmp[1] = tmp[2]; + tmp[2] = tmp[3]; + tmp[3] = tmp[4]; + tmp[4] = tmp[5]; + tmp[5] = tmp[6]; + tmp[6] = tmp[7]; + tmp[7] = tmp[8]; + } while (--y); +} + +// There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling and 2x upscaling +// in SSSE3. 
+static INLINE bool has_normative_scaler_ssse3(const int src_width, + const int src_height, + const int dst_width, + const int dst_height) { + const bool has_normative_scaler = + (2 * dst_width == src_width && 2 * dst_height == src_height) || + (4 * dst_width == src_width && 4 * dst_height == src_height) || + (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) || + (dst_width == src_width * 2 && dst_height == src_height * 2); + + return has_normative_scaler; +} + +void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + const InterpFilter filter, + const int phase, const int num_planes) { + bool has_normative_scaler = + has_normative_scaler_ssse3(src->y_crop_width, src->y_crop_height, + dst->y_crop_width, dst->y_crop_height); + + if (num_planes > 1) { + has_normative_scaler = + has_normative_scaler && + has_normative_scaler_ssse3(src->uv_crop_width, src->uv_crop_height, + dst->uv_crop_width, dst->uv_crop_height); + } + + if (!has_normative_scaler) { + av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); + return; + } + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. 
+ int malloc_failed = 0; + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; + const int src_w = src->crop_widths[is_uv]; + const int src_h = src->crop_heights[is_uv]; + const int src_y_w = (src->crop_widths[0] + 1) & ~1; + const int dst_w = dst->crop_widths[is_uv]; + const int dst_h = dst->crop_heights[is_uv]; + const int dst_y_w = (dst->crop_widths[0] + 1) & ~1; + const int dst_y_h = (dst->crop_heights[0] + 1) & ~1; + + if (2 * dst_w == src_w && 2 * dst_h == src_h) { + // 2 to 1 + if (phase == 0) { + scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h); + } else if (filter == BILINEAR) { + const int16_t c0 = av1_bilinear_filters[phase][3]; + const int16_t c1 = av1_bilinear_filters[phase][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, c0c1); + } else { + const int buffer_stride = (dst_y_w + 3) & ~3; + const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel[phase], temp_buffer); + free(temp_buffer); + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + if (phase == 0) { + scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h); + } else if (filter == BILINEAR) { + const int16_t c0 = av1_bilinear_filters[phase][3]; + const int16_t c1 = av1_bilinear_filters[phase][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and 
c1 >= 0 + scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, c0c1); + } else { + const int buffer_stride = (dst_y_w + 1) & ~1; + const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; + // When dst_w is 1 or 2, we need extra padding to avoid heap read + // overflow + const int extra_padding = 16; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel[phase], temp_buffer); + free(temp_buffer); + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride_hor = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2; + const int buffer_stride_ver = (dst_y_w + 7) & ~7; + const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + // When the vertical filter reads more pixels than the horizontal filter + // generated in each row, we need extra padding to avoid heap read + // overflow. For example, the horizontal filter generates 18 pixels but + // the vertical filter reads 24 pixels in a row. The difference is + // multiplied by 2 since two rows are interlaced together in the + // optimization. + const int extra_padding = + (buffer_stride_ver > buffer_stride_hor) + ? 
2 * (buffer_stride_ver - buffer_stride_hor) + : 0; + const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; + uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel, phase, temp_buffer); + free(temp_buffer); + } else { + assert(dst_w == src_w * 2 && dst_h == src_h * 2); + // 1 to 2 + uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7)); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], src_w, + src_h, interp_kernel[8], temp_buffer); + free(temp_buffer); + } + } + + if (malloc_failed) { + av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); + } else { + aom_extend_frame_borders(dst, num_planes); + } +} diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c new file mode 100644 index 0000000000..5ab6c46f8a --- /dev/null +++ b/third_party/aom/av1/common/x86/selfguided_avx2.c @@ -0,0 +1,724 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/restoration.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to +// 32-bit precision and return them in an AVX2 register. +static __m256i yy256_load_extend_8_32(const void *p) { + return _mm256_cvtepu8_epi32(xx_loadl_64(p)); +} + +// Load 8 halfwords from the possibly-misaligned pointer p, extend each +// halfword to 32-bit precision and return them in an AVX2 register. +static __m256i yy256_load_extend_16_32(const void *p) { + return _mm256_cvtepu16_epi32(xx_loadu_128(p)); +} + +// Compute the scan of an AVX2 register holding 8 32-bit integers. If the +// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ..., +// x0+x1+...+x7 +// +// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers +// (assumed small enough to be able to add them without overflow). +// +// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a. +// +// x = [h g f e][d c b a] +// x01 = [g f e 0][c b a 0] +// x02 = [g+h f+g e+f e][c+d b+c a+b a] +// x03 = [e+f e 0 0][a+b a 0 0] +// x04 = [e->h e->g e->f e][a->d a->c a->b a] +// s = a->d +// s01 = [a->d a->d a->d a->d] +// s02 = [a->d a->d a->d a->d][0 0 0 0] +// ret = [a->h a->g a->f a->e][a->d a->c a->b a] +static __m256i scan_32(__m256i x) { + const __m256i x01 = _mm256_slli_si256(x, 4); + const __m256i x02 = _mm256_add_epi32(x, x01); + const __m256i x03 = _mm256_slli_si256(x02, 8); + const __m256i x04 = _mm256_add_epi32(x02, x03); + const int32_t s = _mm256_extract_epi32(x04, 3); + const __m128i s01 = _mm_set1_epi32(s); + const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1); + return _mm256_add_epi32(x04, s02); +} + +// Compute two integral images from src. B sums elements; A sums their +// squares. 
The images are offset by one pixel, so will have width and height +// equal to width + 1, height + 1 and the first row and column will be zero. +// +// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple +// of 8. + +static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) { + unsigned int i = 0; + for (i = 0; i < (count & 0xffffffe0); i += 32) { + _mm256_storeu_si256((__m256i *)(dest + i), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero); + } + for (; i < (count & 0xfffffff8); i += 8) { + _mm256_storeu_si256((__m256i *)(dest + i), *zero); + } + for (; i < count; i++) { + dest[i] = 0; + } + return dest; +} + +static void integral_images(const uint8_t *src, int src_stride, int width, + int height, int32_t *A, int32_t *B, + int buf_stride) { + const __m256i zero = _mm256_setzero_si256(); + // Write out the zero top row + memset_zero_avx(A, &zero, (width + 8)); + memset_zero_avx(B, &zero, (width + 8)); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the eight lanes. 
+ __m256i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 8) { + const int ABj = 1 + j; + + const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); + const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); + + const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride); + const __m256i x2 = _mm256_madd_epi16(x1, x1); + + const __m256i sc1 = scan_32(x1); + const __m256i sc2 = scan_32(x2); + + const __m256i row1 = + _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); + const __m256i row2 = + _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); + + yy_store_256(B + ABj + (i + 1) * buf_stride, row1); + yy_store_256(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); + ldiff2 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); + } + } +} + +// Compute two integral images from src. B sums elements; A sums their squares +// +// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8. +static void integral_images_highbd(const uint16_t *src, int src_stride, + int width, int height, int32_t *A, + int32_t *B, int buf_stride) { + const __m256i zero = _mm256_setzero_si256(); + // Write out the zero top row + memset_zero_avx(A, &zero, (width + 8)); + memset_zero_avx(B, &zero, (width + 8)); + + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the eight lanes. 
+ __m256i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 8) { + const int ABj = 1 + j; + + const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); + const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); + + const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride); + const __m256i x2 = _mm256_madd_epi16(x1, x1); + + const __m256i sc1 = scan_32(x1); + const __m256i sc2 = scan_32(x2); + + const __m256i row1 = + _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); + const __m256i row2 = + _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); + + yy_store_256(B + ABj + (i + 1) * buf_stride, row1); + yy_store_256(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); + ldiff2 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); + } + } +} + +// Compute 8 values of boxsum from the given integral image. ii should point +// at the middle of the box (for the first value). r is the box radius. 
+static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) { + const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride); + const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride); + const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride); + const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride); + const __m256i u = _mm256_sub_epi32(tr, tl); + const __m256i v = _mm256_sub_epi32(br, bl); + return _mm256_sub_epi32(v, u); +} + +static __m256i round_for_shift(unsigned shift) { + return _mm256_set1_epi32((1 << shift) >> 1); +} + +static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) { + __m256i an, bb; + if (bit_depth > 8) { + const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8)); + const __m256i rounding_b = round_for_shift(bit_depth - 8); + const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); + const __m256i a = + _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a); + const __m256i b = + _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b); + // b < 2^14, so we can use a 16-bit madd rather than a 32-bit + // mullo to square it + bb = _mm256_madd_epi16(b, b); + an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb); + } else { + bb = _mm256_madd_epi16(sum1, sum1); + an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n)); + } + return _mm256_sub_epi32(an, bb); +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). 
//
// For every row i in [-1, height] and every column j in [-1, width], taken
// eight columns at a time, this writes
//   A[i][j] = av1_x_by_xplus1[z], with z derived from the box statistics and
//             clamped to 255, and
//   B[i][j] = ((SGRPROJ_SGR - A[i][j]) * av1_one_by_x[n - 1] * boxsum
//              + rounding) >> SGRPROJ_RECIP_BITS,
// where the box has radius params->r[radius_idx] and n pixels.
// Each store writes a full vector, so the last group in a row may also write
// columns beyond width (computed from zeroed sums).
static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
                    int width, int height, int buf_stride, int bit_depth,
                    int sgr_params_idx, int radius_idx) {
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  const int r = params->r[radius_idx];
  // Number of pixels in a (2r + 1) x (2r + 1) box.
  const int n = (2 * r + 1) * (2 * r + 1);
  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
  // av1_one_by_x[n-1] is 2^12/n, so easily fits in an int16
  const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);

  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);

  // Set up masks
  // mask[idx] has all bits set in its lowest idx lanes and is zero in the
  // rest: ones32 holds 0xff in its eight lowest bytes, the shift keeps the
  // lowest idx of them, and each byte is then sign-extended to a 32-bit lane.
  const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
  __m256i mask[8];
  for (int idx = 0; idx < 8; idx++) {
    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
  }

  for (int i = -1; i < height + 1; ++i) {
    for (int j = -1; j < width + 1; j += 8) {
      const int32_t *Cij = C + i * buf_stride + j;
      const int32_t *Dij = D + i * buf_stride + j;

      // sum1: box sums of the samples (from D); sum2: box sums of the squared
      // samples (from C).
      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);

      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
      // some uninitialised data in their upper words. We use a mask to
      // ensure that these bits are set to 0.
      // idx is the number of lanes that still fall inside [-1, width].
      int idx = AOMMIN(8, width + 1 - j);
      assert(idx >= 1);

      if (idx < 8) {
        sum1 = _mm256_and_si256(mask[idx], sum1);
        sum2 = _mm256_and_si256(mask[idx], sum2);
      }

      const __m256i p = compute_p(sum1, sum2, bit_depth, n);

      // z = min((p * s + rounding) >> SGRPROJ_MTABLE_BITS, 255); the clamp
      // keeps z usable as an index for the gather below.
      const __m256i z = _mm256_min_epi32(
          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
                            SGRPROJ_MTABLE_BITS),
          _mm256_set1_epi32(255));

      // Table lookup av1_x_by_xplus1[z] for all eight lanes (scale 4 = bytes
      // per table entry).
      const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);

      yy_storeu_256(A + i * buf_stride + j, a_res);

      const __m256i a_complement =
          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);

      // sum1 might have lanes greater than 2^15, so we can't use madd to do
      // multiplication involving sum1. However, a_complement and one_over_n
      // are both less than 256, so we can multiply them first.
      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
                                              SGRPROJ_RECIP_BITS);

      yy_storeu_256(B + i * buf_stride + j, b_res);
    }
  }
}

// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
// where the outer four corners have weight 3 and all other pixels have weight
// 4.
//
// Pixels are indexed as follows:
// xtl  xt   xtr
// xl    x   xr
// xbl  xb   xbr
//
// buf points to x
//
// fours = xl + xt + xr + xb + x
// threes = xtl + xtr + xbr + xbl
// cross_sum = 4 * fours + 3 * threes
//           = 4 * (fours + threes) - threes
//           = (fours + threes) << 2 - threes
//
// The nine weights add up to 32 (5 * 4 + 4 * 3).
static INLINE __m256i cross_sum(const int32_t *buf, int stride) {
  const __m256i xtl = yy_loadu_256(buf - 1 - stride);
  const __m256i xt = yy_loadu_256(buf - stride);
  const __m256i xtr = yy_loadu_256(buf + 1 - stride);
  const __m256i xl = yy_loadu_256(buf - 1);
  const __m256i x = yy_loadu_256(buf);
  const __m256i xr = yy_loadu_256(buf + 1);
  const __m256i xbl = yy_loadu_256(buf - 1 + stride);
  const __m256i xb = yy_loadu_256(buf + stride);
  const __m256i xbr = yy_loadu_256(buf + 1 + stride);

  const __m256i fours = _mm256_add_epi32(
      xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
  const __m256i threes =
      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));

  return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),
                          threes);
}

// The final filter for self-guided restoration. Computes a weighted average
// across A, B with "cross sums" (see cross_sum implementation above).
//
// For each position, dst = (cross_sum(A) * src + cross_sum(B) + rounding) >>
// (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS), where src is the source sample
// at that position and nb = 5 matches the cross_sum weights adding up to 2^5.
// Eight outputs are produced per iteration, so when width is not a multiple
// of 8 the last load/store of a row runs past column width - 1.
// NOTE(review): presumably dst and the source buffer are padded for that -
// confirm against the callers.
static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
                         const int32_t *B, int buf_stride, const void *dgd8,
                         int dgd_stride, int width, int height, int highbd) {
  const int nb = 5;
  const __m256i rounding =
      round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  // Byte pointer to the real samples; high-bit-depth input goes through
  // CONVERT_TO_SHORTPTR first.
  const uint8_t *dgd_real =
      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;

  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; j += 8) {
      const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
      const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);

      // "<< highbd" turns the sample index into a byte offset (two bytes per
      // sample when highbd is 1).
      const __m128i raw =
          xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
      const __m256i src =
          highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);

      // madd acts as a per-lane multiply here because the upper 16 bits of
      // every src lane are zero; it needs each lane of a to fit in a signed
      // 16-bit value.
      __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
      __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
                                    SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);

      yy_storeu_256(dst + i * dst_stride + j, w);
    }
  }
}

// Assumes that C, D are integral images for the original buffer which has been
// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
// on the sides. A, B, C, D point at logical position (0, 0).
//
// Same computation as calc_ab, but only every second row is produced: i takes
// the values -1, 1, 3, ... and the rows in between are left untouched.
static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
                         const int32_t *D, int width, int height,
                         int buf_stride, int bit_depth, int sgr_params_idx,
                         int radius_idx) {
  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  const int r = params->r[radius_idx];
  // Number of pixels in a (2r + 1) x (2r + 1) box.
  const int n = (2 * r + 1) * (2 * r + 1);
  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
  // av1_one_by_x[n-1] is 2^12/n, so easily fits in an int16
  const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]);

  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);

  // Set up masks
  // mask[idx] has all bits set in its lowest idx lanes and is zero in the
  // rest: ones32 holds 0xff in its eight lowest bytes, the shift keeps the
  // lowest idx of them, and each byte is then sign-extended to a 32-bit lane.
  const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0);
  __m256i mask[8];
  for (int idx = 0; idx < 8; idx++) {
    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
  }

  for (int i = -1; i < height + 1; i += 2) {
    for (int j = -1; j < width + 1; j += 8) {
      const int32_t *Cij = C + i * buf_stride + j;
      const int32_t *Dij = D + i * buf_stride + j;

      // sum1: box sums of the samples (from D); sum2: box sums of the squared
      // samples (from C).
      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);

      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
      // some uninitialised data in their upper words. We use a mask to
      // ensure that these bits are set to 0.
      // idx is the number of lanes that still fall inside [-1, width].
      int idx = AOMMIN(8, width + 1 - j);
      assert(idx >= 1);

      if (idx < 8) {
        sum1 = _mm256_and_si256(mask[idx], sum1);
        sum2 = _mm256_and_si256(mask[idx], sum2);
      }

      const __m256i p = compute_p(sum1, sum2, bit_depth, n);

      // z = min((p * s + rounding) >> SGRPROJ_MTABLE_BITS, 255); the clamp
      // keeps z usable as an index for the gather below.
      const __m256i z = _mm256_min_epi32(
          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
                            SGRPROJ_MTABLE_BITS),
          _mm256_set1_epi32(255));

      // Table lookup av1_x_by_xplus1[z] for all eight lanes (scale 4 = bytes
      // per table entry).
      const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4);

      yy_storeu_256(A + i * buf_stride + j, a_res);

      const __m256i a_complement =
          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);

      // sum1 might have lanes greater than 2^15, so we can't use madd to do
      // multiplication involving sum1. However, a_complement and one_over_n
      // are both less than 256, so we can multiply them first.
      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
                                              SGRPROJ_RECIP_BITS);

      yy_storeu_256(B + i * buf_stride + j, b_res);
    }
  }
}

// Calculate 8 values of the "cross sum" starting at buf.
+// +// Pixels are indexed like this: +// xtl xt xtr +// - buf - +// xbl xb xbr +// +// Pixels are weighted like this: +// 5 6 5 +// 0 0 0 +// 5 6 5 +// +// fives = xtl + xtr + xbl + xbr +// sixes = xt + xb +// cross_sum = 6 * sixes + 5 * fives +// = 5 * (fives + sixes) - sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) { + const __m256i xtl = yy_loadu_256(buf - 1 - stride); + const __m256i xt = yy_loadu_256(buf - stride); + const __m256i xtr = yy_loadu_256(buf + 1 - stride); + const __m256i xbl = yy_loadu_256(buf - 1 + stride); + const __m256i xb = yy_loadu_256(buf + stride); + const __m256i xbr = yy_loadu_256(buf + 1 + stride); + + const __m256i fives = + _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); + const __m256i sixes = _mm256_add_epi32(xt, xb); + const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); + + return _mm256_add_epi32( + _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), + fives_plus_sixes), + sixes); +} + +// Calculate 8 values of the "cross sum" starting at buf. +// +// Pixels are indexed like this: +// xl x xr +// +// Pixels are weighted like this: +// 5 6 5 +// +// buf points to x +// +// fives = xl + xr +// sixes = x +// cross_sum = 5 * fives + 6 * sixes +// = 4 * (fives + sixes) + (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) { + const __m256i xl = yy_loadu_256(buf - 1); + const __m256i x = yy_loadu_256(buf); + const __m256i xr = yy_loadu_256(buf + 1); + + const __m256i fives = _mm256_add_epi32(xl, xr); + const __m256i sixes = x; + + const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); + + return _mm256_add_epi32( + _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), + fives_plus_sixes), + sixes); +} + +// The final filter for the self-guided restoration. 
// Computes a
// weighted average across A, B with "cross sums" (see cross_sum_...
// implementations above).
//
// Even rows combine the rows above and below (six weights adding up to 32,
// hence nb0 = 5); odd rows use only their own row (three weights adding up to
// 16, hence nb1 = 4). In both cases
//   dst = (cross_sum(A) * src + cross_sum(B) + rounding) >>
//         (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS).
// Eight outputs are produced per iteration, so when width is not a multiple
// of 8 the last load/store of a row runs past column width - 1.
// NOTE(review): presumably dst and the source buffer are padded for that -
// confirm against the callers.
static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
                              const int32_t *B, int buf_stride,
                              const void *dgd8, int dgd_stride, int width,
                              int height, int highbd) {
  const int nb0 = 5;
  const int nb1 = 4;

  const __m256i rounding0 =
      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
  const __m256i rounding1 =
      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);

  // Byte pointer to the real samples; high-bit-depth input goes through
  // CONVERT_TO_SHORTPTR first.
  const uint8_t *dgd_real =
      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;

  for (int i = 0; i < height; ++i) {
    if (!(i & 1)) {  // even row
      for (int j = 0; j < width; j += 8) {
        const __m256i a =
            cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
        const __m256i b =
            cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);

        // "<< highbd" turns the sample index into a byte offset.
        const __m128i raw =
            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
        const __m256i src =
            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);

        // madd acts as a per-lane multiply because the upper 16 bits of every
        // src lane are zero.
        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
        __m256i w =
            _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
                              SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);

        yy_storeu_256(dst + i * dst_stride + j, w);
      }
    } else {  // odd row
      for (int j = 0; j < width; j += 8) {
        const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
        const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);

        const __m128i raw =
            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
        const __m256i src =
            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);

        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
        __m256i w =
            _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
                              SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);

        yy_storeu_256(dst + i * dst_stride + j, w);
      }
    }
  }
}

// Runs the self-guided filter passes selected by
// av1_sgr_params[sgr_params_idx] on the width x height block at dgd8.
//
// flt0 is written (calc_ab_fast + final_filter_fast) only when r[0] > 0, and
// flt1 (calc_ab + final_filter) only when r[1] > 0; both use flt_stride as
// their row stride. The source is read from SGRPROJ_BORDER_VERT rows above
// and SGRPROJ_BORDER_HORZ columns left of dgd8 onwards, so it must be
// extended by that border.
//
// Returns 0 on success, or -1 if the scratch buffer cannot be allocated.
int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
                                    int dgd_stride, int32_t *flt0,
                                    int32_t *flt1, int flt_stride,
                                    int sgr_params_idx, int bit_depth,
                                    int highbd) {
  // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
  // Ctl and Dtl is 32-byte aligned.
  const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);

  // One allocation holding four planes (A, B, C, D) of buf_elts int32 each.
  int32_t *buf = aom_memalign(
      32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3));
  if (!buf) return -1;

  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;

  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 32 bytes for efficiency.
  int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);

  // The "tl" pointers point at the top-left of the initialised data for the
  // array.
  // The + 7 offset puts column 1 (tl + 1) on an 8-element boundary from buf.
  int32_t *Atl = buf + 0 * buf_elts + 7;
  int32_t *Btl = buf + 1 * buf_elts + 7;
  int32_t *Ctl = buf + 2 * buf_elts + 7;
  int32_t *Dtl = buf + 3 * buf_elts + 7;

  // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
  // there's a zero row and column in A, B (integral images), so we move down
  // and right one for them.
  const int buf_diag_border =
      SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;

  int32_t *A0 = Atl + 1 + buf_stride;
  int32_t *B0 = Btl + 1 + buf_stride;
  int32_t *C0 = Ctl + 1 + buf_stride;
  int32_t *D0 = Dtl + 1 + buf_stride;

  // Finally, A, B, C, D point at position (0, 0).
  int32_t *A = A0 + buf_diag_border;
  int32_t *B = B0 + buf_diag_border;
  int32_t *C = C0 + buf_diag_border;
  int32_t *D = D0 + buf_diag_border;

  // Top-left of the border-extended source block.
  const int dgd_diag_border =
      SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
  const uint8_t *dgd0 = dgd8 - dgd_diag_border;

  // Generate integral images from the input. C will contain sums of squares; D
  // will contain just sums
  if (highbd)
    integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
                           height_ext, Ctl, Dtl, buf_stride);
  else
    integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
                    buf_stride);

  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
  // Write to flt0 and flt1
  // If params->r == 0 we skip the corresponding filter. We only allow one of
  // the radii to be 0, as having both equal to 0 would be equivalent to
  // skipping SGR entirely.
  assert(!(params->r[0] == 0 && params->r[1] == 0));
  assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
  assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));

  // Both passes reuse the same A and B planes; the second pass overwrites
  // what the first one left there.
  if (params->r[0] > 0) {
    calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
                 sgr_params_idx, 0);
    final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
                      width, height, highbd);
  }

  if (params->r[1] > 0) {
    calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
            1);
    final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
                 height, highbd);
  }
  aom_free(buf);
  return 0;
}

// Applies self-guided restoration to the width x height block at dat8 and
// writes the result to dst8.
//
// tmpbuf provides the two intermediate filter planes: flt0 at tmpbuf and flt1
// at tmpbuf + RESTORATION_UNITPELS_MAX, both with a row stride of width. eps
// indexes av1_sgr_params, and xqd is decoded into the two projection weights
// by av1_decode_xq.
//
// Returns 0 on success, or the non-zero result of
// av1_selfguided_restoration_avx2 if that call fails.
int av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
                                          int height, int stride, int eps,
                                          const int *xqd, uint8_t *dst8,
                                          int dst_stride, int32_t *tmpbuf,
                                          int bit_depth, int highbd) {
  int32_t *flt0 = tmpbuf;
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
  assert(width * height <= RESTORATION_UNITPELS_MAX);
  const int ret = av1_selfguided_restoration_avx2(
      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
  if (ret != 0) return ret;
  const sgr_params_type *const params = &av1_sgr_params[eps];
  int xq[2];
  av1_decode_xq(xqd, xq, params);

  __m256i xq0 = _mm256_set1_epi32(xq[0]);
  __m256i xq1 = _mm256_set1_epi32(xq[1]);

  for (int i = 0; i < height; ++i) {
    // Calculate output in batches of 16 pixels
    // NOTE(review): a full batch is loaded and stored even when fewer than 16
    // columns remain; presumably the buffers are padded for that - confirm
    // against the callers.
    for (int j = 0; j < width; j += 16) {
      const int k = i * width + j;       // index into flt0/flt1
      const int m = i * dst_stride + j;  // index into dst8

      const uint8_t *dat8ij = dat8 + i * stride + j;
      __m256i ep_0, ep_1;
      __m128i src_0, src_1;
      if (highbd) {
        src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
        src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
        ep_0 = _mm256_cvtepu16_epi32(src_0);
        ep_1 = _mm256_cvtepu16_epi32(src_1);
      } else {
        // One 16-byte load covers the whole batch; the upper eight bytes are
        // shifted down to form the second half.
        src_0 = xx_loadu_128(dat8ij);
        ep_0 = _mm256_cvtepu8_epi32(src_0);
        ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
      }

      // u: source sample scaled by 2^SGRPROJ_RST_BITS.
      const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
      const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);

      // v starts as u scaled by 2^SGRPROJ_PRJ_BITS; each enabled filter then
      // adds its weight times (filtered - u).
      __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
      __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);

      if (params->r[0] > 0) {
        const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0);
        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));

        const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1);
        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
      }

      if (params->r[1] > 0) {
        const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0);
        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));

        const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1);
        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
      }

      // Rounding shift back down to sample scale.
      const __m256i rounding =
          round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      const __m256i w_0 = _mm256_srai_epi32(
          _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      const __m256i w_1 = _mm256_srai_epi32(
          _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);

      if (highbd) {
        // Pack into 16 bits and clamp to [0, 2^bit_depth)
        // Note that packing into 16 bits messes up the order of the bits,
        // so we use a permute function to correct this
        // (the pack works per 128-bit half; permute 0xd8 swaps the two middle
        // 64-bit quarters back into order).
        const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
        const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
        // NOTE(review): this is a signed 16-bit min, so a packed value of
        // 0x8000 or above would compare as negative and escape the clamp;
        // presumably such values cannot occur here - confirm.
        const __m256i res = _mm256_min_epi16(tmp2, max);
        yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
      } else {
        // Pack into 8 bits and clamp to [0, 256)
        // Note that each pack messes up the order of the bits,
        // so we use a permute function to correct this
        const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
        const __m256i res =
            _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
        const __m128i res2 =
            _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
        xx_storeu_128(dst8 + m, res2);
      }
    }
  }
  return 0;
}
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
new file mode 100644
index 0000000000..ac850f5691
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -0,0 +1,662 @@
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/restoration.h" +#include "aom_dsp/x86/synonyms.h" + +// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to +// 32-bit precision and return them in an SSE register. +static __m128i xx_load_extend_8_32(const void *p) { + return _mm_cvtepu8_epi32(xx_loadl_32(p)); +} + +// Load 4 halfwords from the possibly-misaligned pointer p, extend each +// halfword to 32-bit precision and return them in an SSE register. +static __m128i xx_load_extend_16_32(const void *p) { + return _mm_cvtepu16_epi32(xx_loadl_64(p)); +} + +// Compute the scan of an SSE register holding 4 32-bit integers. If the +// register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2, +// x0+x1+x2+x3 +static __m128i scan_32(__m128i x) { + const __m128i x01 = _mm_add_epi32(x, _mm_slli_si128(x, 4)); + return _mm_add_epi32(x01, _mm_slli_si128(x01, 8)); +} + +// Compute two integral images from src. B sums elements; A sums their +// squares. The images are offset by one pixel, so will have width and height +// equal to width + 1, height + 1 and the first row and column will be zero. +// +// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple +// of 4. +static void integral_images(const uint8_t *src, int src_stride, int width, + int height, int32_t *A, int32_t *B, + int buf_stride) { + // Write out the zero top row + memset(A, 0, sizeof(*A) * (width + 1)); + memset(B, 0, sizeof(*B) * (width + 1)); + + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the four lanes. 
+ __m128i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 4) { + const int ABj = 1 + j; + + const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); + const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); + + const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride); + const __m128i x2 = _mm_madd_epi16(x1, x1); + + const __m128i sc1 = scan_32(x1); + const __m128i sc2 = scan_32(x2); + + const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); + const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); + + xx_store_128(B + ABj + (i + 1) * buf_stride, row1); + xx_store_128(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); + ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); + } + } +} + +// Compute two integral images from src. B sums elements; A sums their squares +// +// A and B should be aligned to 16 bytes. buf_stride should be a multiple of 4. +static void integral_images_highbd(const uint16_t *src, int src_stride, + int width, int height, int32_t *A, + int32_t *B, int buf_stride) { + // Write out the zero top row + memset(A, 0, sizeof(*A) * (width + 1)); + memset(B, 0, sizeof(*B) * (width + 1)); + + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the four lanes. 
+ __m128i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 4) { + const int ABj = 1 + j; + + const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); + const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); + + const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride); + const __m128i x2 = _mm_madd_epi16(x1, x1); + + const __m128i sc1 = scan_32(x1); + const __m128i sc2 = scan_32(x2); + + const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); + const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); + + xx_store_128(B + ABj + (i + 1) * buf_stride, row1); + xx_store_128(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); + ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); + } + } +} + +// Compute 4 values of boxsum from the given integral image. ii should point +// at the middle of the box (for the first value). r is the box radius. 
+static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) { + const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride); + const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride); + const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride); + const __m128i br = xx_loadu_128(ii + (r + 0) + r * stride); + const __m128i u = _mm_sub_epi32(tr, tl); + const __m128i v = _mm_sub_epi32(br, bl); + return _mm_sub_epi32(v, u); +} + +static __m128i round_for_shift(unsigned shift) { + return _mm_set1_epi32((1 << shift) >> 1); +} + +static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) { + __m128i an, bb; + if (bit_depth > 8) { + const __m128i rounding_a = round_for_shift(2 * (bit_depth - 8)); + const __m128i rounding_b = round_for_shift(bit_depth - 8); + const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); + const __m128i a = _mm_srl_epi32(_mm_add_epi32(sum2, rounding_a), shift_a); + const __m128i b = _mm_srl_epi32(_mm_add_epi32(sum1, rounding_b), shift_b); + // b < 2^14, so we can use a 16-bit madd rather than a 32-bit + // mullo to square it + bb = _mm_madd_epi16(b, b); + an = _mm_max_epi32(_mm_mullo_epi32(a, _mm_set1_epi32(n)), bb); + } else { + bb = _mm_madd_epi16(sum1, sum1); + an = _mm_mullo_epi32(sum2, _mm_set1_epi32(n)); + } + return _mm_sub_epi32(an, bb); +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). 
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, + int width, int height, int buf_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m128i s = _mm_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); + + const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0); + __m128i mask[4]; + for (int idx = 0; idx < 4; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); + mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; ++i) { + for (int j = -1; j < width + 1; j += 4) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(4, width + 1 - j); + assert(idx >= 1); + + if (idx < 4) { + sum1 = _mm_and_si128(mask[idx], sum1); + sum2 = _mm_and_si128(mask[idx], sum2); + } + + const __m128i p = compute_p(sum1, sum2, bit_depth, n); + + const __m128i z = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm_set1_epi32(255)); + + // 'Gather' type instructions are not available pre-AVX2, so synthesize a + // gather using scalar loads. 
+ const __m128i a_res = + _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], + av1_x_by_xplus1[_mm_extract_epi32(z, 2)], + av1_x_by_xplus1[_mm_extract_epi32(z, 1)], + av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); + + xx_storeu_128(A + i * buf_stride + j, a_res); + + const __m128i a_complement = + _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); + const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); + const __m128i b_res = + _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); + + xx_storeu_128(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter +// where the outer four corners have weight 3 and all other pixels have weight +// 4. 
+//
+// Pixels are indexed like this:
+// xtl  xt   xtr
+// xl    x   xr
+// xbl  xb   xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+//           = 4 * (fours + threes) - threes
+//           = (fours + threes) << 2 - threes
+//
+// Each load fetches four consecutive int32 values, so lane k of the result is
+// the cross sum centred on buf[k]. All arithmetic is 32-bit and wraps.
+static INLINE __m128i cross_sum(const int32_t *buf, int stride) {
+  const __m128i xtl = xx_loadu_128(buf - 1 - stride);
+  const __m128i xt = xx_loadu_128(buf - stride);
+  const __m128i xtr = xx_loadu_128(buf + 1 - stride);
+  const __m128i xl = xx_loadu_128(buf - 1);
+  const __m128i x = xx_loadu_128(buf);
+  const __m128i xr = xx_loadu_128(buf + 1);
+  const __m128i xbl = xx_loadu_128(buf - 1 + stride);
+  const __m128i xb = xx_loadu_128(buf + stride);
+  const __m128i xbr = xx_loadu_128(buf + 1 + stride);
+
+  const __m128i fours = _mm_add_epi32(
+      xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x))));
+  const __m128i threes =
+      _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
+
+  return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes);
+}
+
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
+//
+// For every row i in [0, height) and every group of four columns starting at
+// j = 0, 4, 8, ... < width this writes
+//   dst[i * dst_stride + j + k] =
+//       (a * src + b + rounding) >> (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)
+// where a and b are the cross sums of A and B at (i, j + k) and src is the
+// source pixel at the same position. Four values are stored per iteration, so
+// when width is not a multiple of 4 the last store of a row covers columns
+// past width - 1; dst, A, B and dgd8 must all be readable/writable there.
+//
+// dgd8 is a plain uint8_t buffer when highbd == 0; otherwise it is a pointer
+// that CONVERT_TO_SHORTPTR turns into the real 16-bit sample buffer.
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+                         const int32_t *B, int buf_stride, const void *dgd8,
+                         int dgd_stride, int width, int height, int highbd) {
+  const int nb = 5;
+  const __m128i rounding =
+      round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; j += 4) {
+      const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride);
+      const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride);
+      // "<< highbd" turns the sample index into a byte offset (1 or 2 bytes
+      // per sample).
+      const __m128i raw =
+          xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+      const __m128i src =
+          highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+      // src is zero-extended, so the upper 16 bits of each lane are 0 and
+      // madd reduces to (low 16 bits of a) * (low 16 bits of src) per lane.
+      __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+      __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
+                                 SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+
+      xx_storeu_128(dst + i * dst_stride + j, w);
+    }
+  }
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+//
+// Fills A and B on every second row (i = -1, 1, 3, ... < height + 1) and on
+// columns -1 .. width, four columns per iteration. The box radius r and the
+// scale s are taken from av1_sgr_params[sgr_params_idx] at radius_idx.
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+                         const int32_t *D, int width, int height,
+                         int buf_stride, int bit_depth, int sgr_params_idx,
+                         int radius_idx) {
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int n = (2 * r + 1) * (2 * r + 1);
+  const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+  // av1_one_by_x[n-1] is 2^12/n, so easily fits in an int16
+  const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]);
+
+  const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+  const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+  // Set up masks: mask[idx] has its low idx 32-bit lanes set to all-ones and
+  // the remaining lanes cleared.
+  //
+  // The seed must contain exactly 32 one-bits (four 0xff bytes). Shifting it
+  // right by 8 * (4 - idx) bits leaves idx bytes of 0xff in the low dword, and
+  // _mm_cvtepi8_epi32 sign-extends each of those four bytes into one lane.
+  // Seeding with 64 one-bits would leave at least five 0xff bytes after every
+  // shift used here, i.e. an all-ones mask that clears nothing.
+  const __m128i ones32 = _mm_set_epi32(0, 0, 0, ~0);
+  __m128i mask[4];
+  for (int idx = 0; idx < 4; idx++) {
+    const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
+    mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+  }
+
+  for (int i = -1; i < height + 1; i += 2) {
+    for (int j = -1; j < width + 1; j += 4) {
+      const int32_t *Cij = C + i * buf_stride + j;
+      const int32_t *Dij = D + i * buf_stride + j;
+
+      __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+      __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+      // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+      // some uninitialised data in their upper words. We use a mask to
+      // ensure that these bits are set to 0.
+      // idx is the number of lanes that still lie inside columns -1 .. width.
+      int idx = AOMMIN(4, width + 1 - j);
+      assert(idx >= 1);
+
+      if (idx < 4) {
+        sum1 = _mm_and_si128(mask[idx], sum1);
+        sum2 = _mm_and_si128(mask[idx], sum2);
+      }
+
+      const __m128i p = compute_p(sum1, sum2, bit_depth, n);
+
+      // z = min(255, round(p * s / 2^SGRPROJ_MTABLE_BITS)); the clamp keeps z
+      // usable as an index into av1_x_by_xplus1 below.
+      const __m128i z = _mm_min_epi32(
+          _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+                         SGRPROJ_MTABLE_BITS),
+          _mm_set1_epi32(255));
+
+      // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+      // gather using scalar loads.
+      const __m128i a_res =
+          _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 2)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 1)],
+                        av1_x_by_xplus1[_mm_extract_epi32(z, 0)]);
+
+      xx_storeu_128(A + i * buf_stride + j, a_res);
+
+      const __m128i a_complement =
+          _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
+
+      // sum1 might have lanes greater than 2^15, so we can't use madd to do
+      // multiplication involving sum1. However, a_complement and one_over_n
+      // are both less than 256, so we can multiply them first.
+      const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+      const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+      const __m128i b_res =
+          _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
+
+      xx_storeu_128(B + i * buf_stride + j, b_res);
+    }
+  }
+}
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Used on even rows of the fast filter, where only the rows directly above
+// and below buf contribute. Neighbours are named
+//   xtl  xt   xtr
+//    -   buf   -
+//   xbl  xb   xbr
+// and carry the weights
+//    5    6    5
+//    0    0    0
+//    5    6    5
+//
+// With fives = xtl + xtr + xbl + xbr and sixes = xt + xb:
+// cross_sum = 5 * fives + 6 * sixes
+//           = 5 * (fives + sixes) + sixes
+//           = ((fives + sixes) << 2) + (fives + sixes) + sixes
+//
+// Lane k of the result is the cross sum centred on buf[k].
+static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+  const int32_t *const row_above = buf - stride;
+  const int32_t *const row_below = buf + stride;
+
+  // Weight-6 taps: straight above and straight below.
+  const __m128i top = xx_loadu_128(row_above);
+  const __m128i bottom = xx_loadu_128(row_below);
+  const __m128i sixes = _mm_add_epi32(top, bottom);
+
+  // Weight-5 taps: the four diagonal neighbours.
+  const __m128i upper_corners = _mm_add_epi32(xx_loadu_128(row_above - 1),
+                                              xx_loadu_128(row_above + 1));
+  const __m128i lower_corners = _mm_add_epi32(xx_loadu_128(row_below - 1),
+                                              xx_loadu_128(row_below + 1));
+  const __m128i fives = _mm_add_epi32(upper_corners, lower_corners);
+
+  // Multiply by 5 as (t << 2) + t, then add the extra copy of sixes.
+  const __m128i total = _mm_add_epi32(fives, sixes);
+  const __m128i total_x5 = _mm_add_epi32(_mm_slli_epi32(total, 2), total);
+  return _mm_add_epi32(total_x5, sixes);
+}
+
+// Calculate 4 values of the "cross sum" starting at buf, for odd rows of the
+// fast filter, where only the current row contributes.
+//
+// Neighbours xl, x, xr (buf points to x) carry the weights 5, 6, 5.
+//
+// With fives = xl + xr and sixes = x:
+// cross_sum = 5 * fives + 6 * sixes
+//           = 5 * (fives + sixes) + sixes
+//           = ((fives + sixes) << 2) + (fives + sixes) + sixes
+static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) {
+  const __m128i sixes = xx_loadu_128(buf);
+  const __m128i fives =
+      _mm_add_epi32(xx_loadu_128(buf - 1), xx_loadu_128(buf + 1));
+
+  const __m128i total = _mm_add_epi32(fives, sixes);
+  const __m128i total_x5 = _mm_add_epi32(_mm_slli_epi32(total, 2), total);
+  return _mm_add_epi32(total_x5, sixes);
+}
+
+// The final filter for the self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
+//
+// Even rows use cross_sum_fast_even_row with a shift based on nb0 = 5; odd
+// rows use cross_sum_fast_odd_row with a shift based on nb1 = 4. Four int32
+// results are stored per iteration, so the last store of a row may cover
+// columns past width - 1 when width is not a multiple of 4.
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+                              const int32_t *B, int buf_stride,
+                              const void *dgd8, int dgd_stride, int width,
+                              int height, int highbd) {
+  const int nb0 = 5;
+  const int nb1 = 4;
+
+  const __m128i rounding0 =
+      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+  const __m128i rounding1 =
+      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+  const uint8_t *dgd_real =
+      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+  for (int i = 0; i < height; ++i) {
+    if (!(i & 1)) {  // even row
+      for (int j = 0; j < width; j += 4) {
+        const __m128i a =
+            cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
+        const __m128i b =
+            cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
+        const __m128i raw =
+            xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+        const __m128i src =
+            highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+        __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+        __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0),
+                                   SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+        xx_storeu_128(dst + i * dst_stride + j, w);
+      }
+    } else {  // odd row
+      for (int j = 0; j < width; j += 4) {
+        const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+        const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
+        const __m128i raw =
+            xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+        const __m128i src =
+            highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+        __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+        __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
+                                   SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+        xx_storeu_128(dst + i * dst_stride + j, w);
+      }
+    }
+  }
+}
+
+// Runs the self-guided filters selected by av1_sgr_params[sgr_params_idx] over
+// a width x height region of dgd8 and writes the results to flt0 (radius 0,
+// fast filter) and/or flt1 (radius 1, full filter), both with row stride
+// flt_stride. A filter whose radius is 0 is skipped and its output buffer is
+// left untouched.
+//
+// Returns 0 on success, or -1 if the temporary buffer cannot be allocated.
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
+                                      int height, int dgd_stride, int32_t *flt0,
+                                      int32_t *flt1, int flt_stride,
+                                      int sgr_params_idx, int bit_depth,
+                                      int highbd) {
+  int32_t *buf = (int32_t *)aom_memalign(
+      16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
+  if (!buf) return -1;
+  memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
+
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+  // Adjusting the stride of A and B here appears to avoid bad cache effects,
+  // leading to a significant speed improvement.
+  // We also align the stride to a multiple of 16 bytes for efficiency.
+  int buf_stride = ((width_ext + 3) & ~3) + 16;
+
+  // The "tl" pointers point at the top-left of the initialised data for the
+  // array. Adding 3 here ensures that column 1 is 16-byte aligned.
+  int32_t *Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3;
+
+  // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
+  // there's a zero row and column in A, B (integral images), so we move down
+  // and right one for them.
+  const int buf_diag_border =
+      SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+
+  int32_t *A0 = Atl + 1 + buf_stride;
+  int32_t *B0 = Btl + 1 + buf_stride;
+  int32_t *C0 = Ctl + 1 + buf_stride;
+  int32_t *D0 = Dtl + 1 + buf_stride;
+
+  // Finally, A, B, C, D point at position (0, 0).
+  int32_t *A = A0 + buf_diag_border;
+  int32_t *B = B0 + buf_diag_border;
+  int32_t *C = C0 + buf_diag_border;
+  int32_t *D = D0 + buf_diag_border;
+
+  const int dgd_diag_border =
+      SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+  const uint8_t *dgd0 = dgd8 - dgd_diag_border;
+
+  // Generate integral images from the input. C will contain sums of squares; D
+  // will contain just sums
+  if (highbd)
+    integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+                           height_ext, Ctl, Dtl, buf_stride);
+  else
+    integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+                    buf_stride);
+
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+  // Write to flt0 and flt1
+  // If params->r == 0 we skip the corresponding filter. We only allow one of
+  // the radii to be 0, as having both equal to 0 would be equivalent to
+  // skipping SGR entirely.
+  assert(!(params->r[0] == 0 && params->r[1] == 0));
+  assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+  assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+  if (params->r[0] > 0) {
+    calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+                 sgr_params_idx, 0);
+    final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+                      width, height, highbd);
+  }
+
+  if (params->r[1] > 0) {
+    calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+            1);
+    final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+                 height, highbd);
+  }
+  aom_free(buf);
+  return 0;
+}
+
+// Applies self-guided restoration to a width x height region of dat8 and
+// writes the restored pixels to dst8.
+//
+// tmpbuf provides scratch space for the two filter outputs: flt0 starts at
+// tmpbuf and flt1 at tmpbuf + RESTORATION_UNITPELS_MAX; both are stored with a
+// row stride of width. eps selects the entry of av1_sgr_params, and xqd holds
+// the coded projection coefficients that av1_decode_xq expands into xq[0..1].
+//
+// Pixels are produced eight at a time, so loads from dat8/flt0/flt1 and stores
+// to dst8 may cover up to 7 columns past width - 1 on each row.
+//
+// Returns 0 on success, or the non-zero value returned by
+// av1_selfguided_restoration_sse4_1 on failure.
+int av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
+                                            int height, int stride, int eps,
+                                            const int *xqd, uint8_t *dst8,
+                                            int dst_stride, int32_t *tmpbuf,
+                                            int bit_depth, int highbd) {
+  int32_t *flt0 = tmpbuf;
+  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+  assert(width * height <= RESTORATION_UNITPELS_MAX);
+  const int ret = av1_selfguided_restoration_sse4_1(
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+  if (ret != 0) return ret;
+  const sgr_params_type *const params = &av1_sgr_params[eps];
+  int xq[2];
+  av1_decode_xq(xqd, xq, params);
+
+  __m128i xq0 = _mm_set1_epi32(xq[0]);
+  __m128i xq1 = _mm_set1_epi32(xq[1]);
+
+  // Loop-invariant rounding term for the final shift.
+  const __m128i rounding =
+      round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+  for (int i = 0; i < height; ++i) {
+    // Calculate output in batches of 8 pixels
+    for (int j = 0; j < width; j += 8) {
+      const int k = i * width + j;
+      const int m = i * dst_stride + j;
+
+      const uint8_t *dat8ij = dat8 + i * stride + j;
+      __m128i src;
+      if (highbd) {
+        src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+      } else {
+        src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij));
+      }
+
+      // u = src << SGRPROJ_RST_BITS, widened to two vectors of four int32.
+      const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS);
+      const __m128i u_0 = _mm_cvtepu16_epi32(u);
+      const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8));
+
+      __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+      __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+
+      if (params->r[0] > 0) {
+        const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0);
+        v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0));
+
+        const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1);
+        v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1));
+      }
+
+      if (params->r[1] > 0) {
+        const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0);
+        v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0));
+
+        const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1);
+        v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1));
+      }
+
+      const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
+                                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+      const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
+                                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+      if (highbd) {
+        // Pack into 16 bits and clamp to [0, 2^bit_depth)
+        // _mm_packus_epi32 saturates to the unsigned range [0, 65535], so the
+        // upper clamp must be an unsigned compare: a signed min would treat
+        // lanes >= 0x8000 as negative and let them through unclamped.
+        const __m128i tmp = _mm_packus_epi32(w_0, w_1);
+        const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+        const __m128i res = _mm_min_epu16(tmp, max);
+        xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res);
+      } else {
+        // Pack into 8 bits and clamp to [0, 256)
+        const __m128i tmp = _mm_packs_epi32(w_0, w_1);
+        const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
+        xx_storel_64(dst8 + m, res);
+      }
+    }
+  }
+  return 0;
+}
diff --git a/third_party/aom/av1/common/x86/warp_plane_avx2.c b/third_party/aom/av1/common/x86/warp_plane_avx2.c
new file mode 100644
index 0000000000..663b8cde93
--- /dev/null
+++ b/third_party/aom/av1/common/x86/warp_plane_avx2.c
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Byte-shuffle patterns that replicate one 16-bit pair of an 8-byte filter
+// (bytes 0-1, 2-3, 4-5 or 6-7) across a whole vector. Used by
+// prepare_horizontal_filter_coeff_alpha0_avx2.
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = {
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = {
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = {
+  4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+  4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = {
+  6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+  6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7
+};
+
+// Byte-shuffle patterns that replicate one 32-bit group of a 16-byte filter
+// (bytes 0-3, 4-7, 8-11 or 12-15) across a whole vector. Used by
+// prepare_vertical_filter_coeffs_gamma0_avx2.
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = {
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = {
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = {
+  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
+  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = {
+  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15,
+  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+// Source-pixel byte-shuffle patterns; the same 16 indices are repeated in both
+// 128-bit halves.
+// NOTE(review): presumably these become the shuffle_src[] masks consumed by
+// filter_src_pixels_avx2 - confirm at the load site.
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3,
+                                      5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6,
+                                      6, 8, 1, 3, 3, 5, 5, 7, 7, 9 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src1[32]) = { 4,  6,  6,  8,  8,  10, 10, 12, 5,  7, 7,
+                                      9,  9,  11, 11, 13, 4,  6,  6,  8,  8, 10,
+                                      10, 12, 5,  7,  7,  9,  9,  11, 11, 13 };
+
+// Source-pixel byte-shuffle patterns (continued); the same 16 indices are
+// repeated in both 128-bit halves of each table.
+// NOTE(review): presumably these become the shuffle_src[] masks consumed by
+// filter_src_pixels_avx2 - confirm at the load site.
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4,
+                                      6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7,
+                                      7, 9, 2, 4, 4, 6, 6, 8, 8, 10 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src3[32]) = { 5,  7,  7,  9,  9,  11, 11, 13, 6,  8, 8,
+                                      10, 10, 12, 12, 14, 5,  7,  7,  9,  9, 11,
+                                      11, 13, 6,  8,  8,  10, 10, 12, 12, 14 };
+
+// Horizontal filter for the source bytes held in src.
+//
+// src is permuted with each of the four shuffle_src masks, every permutation
+// is multiplied against the matching coeff[] vector with
+// _mm256_maddubs_epi16, and the four partial sums plus *round_const are
+// accumulated with 16-bit adds. The total is shifted right (logical) by
+// *shift and stored in horz_out[row].
+static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out,
+                                          __m256i *coeff,
+                                          const __m256i *shuffle_src,
+                                          const __m256i *round_const,
+                                          const __m128i *shift, int row) {
+  const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]);
+  const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]);
+  const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]);
+  const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]);
+
+  const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]);
+  const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]);
+  const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]);
+  const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]);
+
+  const __m256i res_even = _mm256_add_epi16(res_02, res_46);
+  const __m256i res_odd = _mm256_add_epi16(res_13, res_57);
+  const __m256i res =
+      _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const);
+  horz_out[row] = _mm256_srl_epi16(res, *shift);
+}
+
+// Builds coeff[0..3] for two rows at once (general alpha, general beta).
+//
+// For k = 0..7 an 8-byte filter is loaded from av1_filter_8bit at index
+// (unsigned)(sx + k * alpha) >> WARPEDDIFF_PREC_BITS into the low 128-bit
+// lane, and at index (unsigned)((sx + beta) + k * alpha) >>
+// WARPEDDIFF_PREC_BITS into the high lane. The unpack sequence at the end
+// interleaves the eight filters of each lane at 16-, 32- and then 64-bit
+// granularity to form the four coefficient vectors.
+static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta,
+                                                        int sx,
+                                                        __m256i *coeff) {
+  // Low lane: filters for the row that starts at sx.
+  __m128i tmp_0 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_1 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_2 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_3 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+
+  __m128i tmp_4 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_5 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_6 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_7 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >>
+                                  WARPEDDIFF_PREC_BITS]);
+
+  __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0);
+  __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2);
+  __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1);
+  __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3);
+
+  __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4);
+  __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6);
+  __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5);
+  __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7);
+
+  // High lane: filters for the row that starts at sx + beta.
+  __m128i tmp_8 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1);
+
+  __m128i tmp_9 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1);
+
+  __m128i tmp_10 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1);
+
+  __m128i tmp_11 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1);
+
+  // tmp_2, tmp_3, tmp_6 and tmp_7 are reused as scratch from here on; their
+  // earlier values already live in tmp2_256/tmp3_256/tmp6_256/tmp7_256.
+  tmp_2 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1);
+
+  tmp_3 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1);
+
+  tmp_6 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1);
+
+  tmp_7 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >>
+                                  WARPEDDIFF_PREC_BITS]);
+  tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1);
+
+  const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256);
+  const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256);
+  const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256);
+  const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256);
+
+  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
+  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
+  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
+  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
+
+  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
+  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
+  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
+  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
+}
+
+// Variant of prepare_horizontal_filter_coeff_avx2 that takes no beta: one set
+// of eight filters (index (sx + k * alpha) >> WARPEDDIFF_PREC_BITS, k = 0..7)
+// is loaded and broadcast to both 128-bit lanes before the same 32-/64-bit
+// interleave. Unlike the general version, the index is not cast to unsigned
+// before the shift.
+static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx,
+                                                              __m256i *coeff) {
+  __m128i tmp_0 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_1 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_2 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_3 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_4 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_5 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_6 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  __m128i tmp_7 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+  tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+  tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+  tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+  tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+  const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0);
+  const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1);
+  const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4);
+  const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5);
+
+  const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14);
+  const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14);
+  const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15);
+  const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15);
+
+  coeff[0] = _mm256_unpacklo_epi64(res_0, res_2);
+  coeff[1] = _mm256_unpackhi_epi64(res_0, res_2);
+  coeff[2] = _mm256_unpacklo_epi64(res_1, res_3);
+  coeff[3] = _mm256_unpackhi_epi64(res_1, res_3);
+}
+
+// Variant that takes no alpha: a single 8-byte filter is loaded per row (index
+// sx >> WARPEDDIFF_PREC_BITS for the low lane, (sx + beta) >>
+// WARPEDDIFF_PREC_BITS for the high lane). coeff[0..3] are formed by
+// replicating byte pairs (0,1), (2,3), (4,5) and (6,7) of that filter across
+// each lane with the shuffle_alpha0_mask*_avx2 tables.
+static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx,
+                                                               __m256i *coeff) {
+  const __m128i tmp_0 =
+      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_1 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]);
+
+  const __m256i res_0 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1);
+
+  coeff[0] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2));
+  coeff[1] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2));
+  coeff[2] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2));
+  coeff[3] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2));
+}
+
+// Filters the two rows packed in src: builds the general-case coefficients
+// for (sx, alpha, beta) and then runs filter_src_pixels_avx2, which writes
+// horz_out[row].
+static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out,
+                                          int sx, int alpha, int beta, int row,
+                                          const __m256i *shuffle_src,
+                                          const __m256i *round_const,
+                                          const __m128i *shift) {
+  __m256i coeff[4];
+  prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff);
+  filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift,
+                         row);
+}
+// Single-row coefficient setup: the eight filters for sx + k * alpha
+// (k = 0..7) are interleaved with 128-bit operations and placed in the low
+// lane of coeff[0..3] via _mm256_castsi128_si256, so the upper 128 bits of
+// each coeff[] entry are not defined by this function.
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                   __m256i *coeff) {
+  const __m128i tmp_0 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_1 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_2 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_3 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_4 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_5 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_6 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+  const __m128i tmp_7 = _mm_loadl_epi64(
+      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
+  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
+  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
+  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
+
+  coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14));
+  coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14));
+  coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15));
+  coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15));
+}
+
+// Horizontal pass for one block, general alpha and beta.
+//
+// Rows are taken at vertical offsets k = -7, -5, ... relative to iy4 and are
+// processed two at a time (rows k and k + 1 share one 256-bit vector) while
+// k <= AOMMIN(8, p_height - i) - 2. Row indices are clamped to
+// [0, height - 1] and 16 bytes are loaded starting at column ix4 - 7. After
+// the loop one more row is filtered on its own, with only the low lane
+// populated. Results go to horz_out[0], horz_out[1], ...
+static INLINE void warp_horizontal_filter_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  int k, iy, sx, row = 0;
+  __m256i coeff[4];
+  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+    iy = iy4 + k;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_0 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    iy = iy4 + k + 1;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_1 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m256i src_01 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+    sx = sx4 + beta * (k + 4);
+    horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src,
+                           round_const, shift);
+    row += 1;
+  }
+  // Remaining single row: k keeps the value it had when the loop exited.
+  iy = iy4 + k;
+  iy = clamp(iy, 0, height - 1);
+  const __m256i src_01 = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+  sx = sx4 + beta * (k + 4);
+  prepare_horizontal_filter_coeff(alpha, sx, coeff);
+  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                         shift, row);
+}
+
+// Same row walk as warp_horizontal_filter_avx2, but alpha is ignored: the
+// coefficients for every row pair (and for the final single row) come from
+// prepare_horizontal_filter_coeff_alpha0_avx2, recomputed per iteration
+// because sx still advances with beta.
+static INLINE void warp_horizontal_filter_alpha0_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  (void)alpha;
+  int k, iy, sx, row = 0;
+  __m256i coeff[4];
+  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+    iy = iy4 + k;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_0 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    iy = iy4 + k + 1;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_1 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m256i src_01 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+    sx = sx4 + beta * (k + 4);
+    prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
+    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                           shift, row);
+    row += 1;
+  }
+  iy = iy4 + k;
+  iy = clamp(iy, 0, height - 1);
+  const __m256i src_01 = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+  sx = sx4 + beta * (k + 4);
+  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff);
+  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                         shift, row);
+}
+
+// Same row walk as warp_horizontal_filter_avx2, but beta is ignored: the
+// coefficients are computed once from (alpha, sx4) before the loop and reused
+// for every row.
+static INLINE void warp_horizontal_filter_beta0_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  (void)beta;
+  int k, iy, row = 0;
+  __m256i coeff[4];
+  prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff);
+  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+    iy = iy4 + k;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_0 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    iy = iy4 + k + 1;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src_1 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m256i src_01 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1);
+    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                           shift, row);
+    row += 1;
+  }
+  iy = iy4 + k;
+  iy = clamp(iy, 0, height - 1);
+  const __m256i src_01 = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                         shift, row);
+}
+
+// Same row walk as warp_horizontal_filter_avx2 with the coefficients computed
+// once, before the loop, by prepare_horizontal_filter_coeff_alpha0_avx2(beta,
+// sx4, ...). alpha is ignored; beta is still forwarded to that helper.
+// NOTE(review): the name implies beta == 0 here, which would make both lanes
+// use the filter at sx4 - confirm against the dispatching caller.
+static INLINE void warp_horizontal_filter_alpha0_beta0_avx2(
+    const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const __m256i *round_const, const __m128i *shift,
+    const __m256i *shuffle_src) {
+  (void)alpha;
+  int k, iy, row = 0;
+  __m256i coeff[4];
+  prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff);
+  for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) {
+    iy = iy4 + k;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src0 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    iy = iy4 + k + 1;
+    iy = clamp(iy, 0, height - 1);
+    const __m128i src1 =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    const __m256i src_01 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1);
+    filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                           shift, row);
+    row += 1;
+  }
+  iy = iy4 + k;
+  iy = clamp(iy, 0, height - 1);
+  const __m256i src_01 = _mm256_castsi128_si256(
+      _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)));
+  filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const,
+                         shift, row);
+}
+
+// Prepares the 16-bit constants used when combining compound predictions:
+//   *res_sub_const    = -(1 << (offset_bits - round_1))
+//                       - (1 << (offset_bits - round_1 - 1))
+//   *round_bits_const = (1 << round_bits) >> 1
+//   *wt               = conv_params->fwd_offset and conv_params->bck_offset
+//                       interleaved as 16-bit pairs
+// where round_1 is conv_params->round_1.
+static INLINE void unpack_weights_and_set_round_const_avx2(
+    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+    __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) {
+  *res_sub_const =
+      _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+                        (1 << (offset_bits - conv_params->round_1 - 1)));
+  *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1));
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi16((short)w0);
+  const __m256i wt1 = _mm256_set1_epi16((short)w1);
+  *wt = _mm256_unpacklo_epi16(wt0, wt1);
+}
+
+// Builds the eight vertical coefficient vectors for two output rows at once
+// (general gamma, general delta).
+//
+// 128-bit filters are loaded from av1_warped_filter at row
+// (sy + k * gamma) >> WARPEDDIFF_PREC_BITS for the low lane and
+// ((sy + delta) + k * gamma) >> WARPEDDIFF_PREC_BITS for the high lane.
+// Even k (0, 2, 4, 6) are interleaved into coeffs[0..3] and odd k
+// (1, 3, 5, 7) into coeffs[4..7].
+static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta,
+                                                       int sy,
+                                                       __m256i *coeffs) {
+  __m128i filt_00 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_01 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_02 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_03 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  __m128i filt_10 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_11 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_12 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_13 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  __m256i filt_0 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
+  __m256i filt_1 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
+  __m256i filt_2 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
+  __m256i filt_3 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
+
+  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
+  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
+  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
+  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
+
+  // Second half: the odd multiples of gamma.
+  filt_00 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_01 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_02 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_03 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  filt_10 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_11 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_12 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_13 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter +
+                  (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  filt_0 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1);
+  filt_1 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1);
+  filt_2 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1);
+  filt_3 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1);
+
+  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
+  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
+  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
+  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
+}
+
+// Variant of prepare_vertical_filter_coeffs_avx2 that takes no delta: each
+// filter is loaded once and broadcast to both 128-bit lanes, so both rows use
+// identical coefficients.
+static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy,
+                                                              __m256i *coeffs) {
+  __m128i filt_00 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_01 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_02 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  __m128i filt_03 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00);
+  __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01);
+  __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02);
+  __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03);
+
+  __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+  __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+  __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+  __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+  coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1);
+  coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1);
+  coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3);
+  coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3);
+
+  // Second half: the odd multiples of gamma.
+  filt_00 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_01 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_02 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+  filt_03 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+  filt_0 = _mm256_broadcastsi128_si256(filt_00);
+  filt_1 = _mm256_broadcastsi128_si256(filt_01);
+  filt_2 = _mm256_broadcastsi128_si256(filt_02);
+  filt_3 = _mm256_broadcastsi128_si256(filt_03);
+
+  res_0 = _mm256_unpacklo_epi32(filt_0, filt_1);
+  res_1 = _mm256_unpacklo_epi32(filt_2, filt_3);
+  res_2 = _mm256_unpackhi_epi32(filt_0, filt_1);
+  res_3 = _mm256_unpackhi_epi32(filt_2, filt_3);
+
+  coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1);
+  coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1);
+  coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3);
+  coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3);
+}
+
+// Variant that takes no gamma: one 128-bit filter is loaded per row (sy for
+// the low lane, sy + delta for the high lane). coeffs[0..3] replicate its four
+// 32-bit groups across each lane with the shuffle_gamma0_mask*_avx2 tables,
+// and coeffs[4..7] are copies of coeffs[0..3].
+static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy,
+                                                              __m256i *coeffs) {
+  const __m128i filt_0 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+  const __m128i filt_1 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS)));
+
+  __m256i res_0 =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1);
+
+  coeffs[0] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2));
+  coeffs[1] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2));
+  coeffs[2] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2));
+  coeffs[3] = _mm256_shuffle_epi8(
+      res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2));
+
+  coeffs[4] = coeffs[0];
+  coeffs[5] = coeffs[1];
+  coeffs[6] = coeffs[2];
+  coeffs[7] = coeffs[3];
+}
+
+static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out,
+                                                   __m256i *src,
+                                                   __m256i *coeffs,
+                                                   __m256i *res_lo,
+                                                   __m256i *res_hi, int row) {
+  const __m256i src_6 = horz_out[row + 3];
+  const __m256i src_7 =
+      _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21);
+
+  src[6] = _mm256_unpacklo_epi16(src_6, src_7);
+
+  const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]);
+  const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]);
+  const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]);
+  const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]);
+
+  const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2),
+                                            _mm256_add_epi32(res_4, res_6));
+
+  src[7] = _mm256_unpackhi_epi16(src_6, src_7);
+
+  const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]);
+  const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]);
+  const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]);
+  const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]);
+
+  const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3),
+                                           _mm256_add_epi32(res_5, res_7));
+
+  // Rearrange pixels back into the order 0 ...
7 + *res_lo = _mm256_unpacklo_epi32(res_even, res_odd); + *res_hi = _mm256_unpackhi_epi32(res_even, res_odd); +} + +static INLINE void store_vertical_filter_output_avx2( + const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const, + const __m256i *wt, const __m256i *res_sub_const, + const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params, + int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width, + const int round_bits) { + __m256i res_lo_1 = *res_lo; + __m256i res_hi_1 = *res_hi; + + if (conv_params->is_compound) { + __m128i *const p_0 = + (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&conv_params + ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j]; + + res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const), + reduce_bits_vert); + + const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1); + __m256i res_lo_16; + if (conv_params->do_average) { + __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const dst8_1 = + (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; + const __m128i p_16_0 = _mm_loadl_epi64(p_0); + const __m128i p_16_1 = _mm_loadl_epi64(p_1); + const __m256i p_16 = + _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1); + if (conv_params->use_dist_wtd_comp_avg) { + const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16); + const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt); + const __m256i shifted_32 = + _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32); + } else { + res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1); + } + res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const); + res_lo_16 = _mm256_srai_epi16( + _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits); + const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16); + const __m128i res_8_lo_0 = 
_mm256_castsi256_si128(res_8_lo); + const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1); + *(int *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0); + *(int *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1); + } else { + const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16); + const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1); + _mm_storel_epi64(p_0, temp_lo_16_0); + _mm_storel_epi64(p_1, temp_lo_16_1); + } + if (p_width > 4) { + __m128i *const p4_0 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + __m128i *const p4_1 = + (__m128i *)&conv_params + ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4]; + res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const), + reduce_bits_vert); + const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1); + __m256i res_hi_16; + if (conv_params->do_average) { + __m128i *const dst8_4_0 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + __m128i *const dst8_4_1 = + (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4]; + const __m128i p4_16_0 = _mm_loadl_epi64(p4_0); + const __m128i p4_16_1 = _mm_loadl_epi64(p4_1); + const __m256i p4_16 = _mm256_inserti128_si256( + _mm256_castsi128_si256(p4_16_0), p4_16_1, 1); + if (conv_params->use_dist_wtd_comp_avg) { + const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16); + const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt); + const __m256i shifted_32 = + _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32); + } else { + res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1); + } + res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const); + res_hi_16 = _mm256_srai_epi16( + _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits); + __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16); + const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi); + const __m128i res_8_hi_1 = 
_mm256_extracti128_si256(res_8_hi, 1); + *(int *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0); + *(int *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1); + } else { + const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16); + const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1); + _mm_storel_epi64(p4_0, temp_hi_16_0); + _mm_storel_epi64(p4_1, temp_hi_16_1); + } + } + } else { + const __m256i res_lo_round = _mm256_srai_epi32( + _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); + const __m256i res_hi_round = _mm256_srai_epi32( + _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); + + const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round); + const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit); + const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit); + const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; + + if (p_width == 4) { + *(int *)p = _mm_cvtsi128_si32(res_8bit0); + *(int *)p1 = _mm_cvtsi128_si32(res_8bit1); + } else { + _mm_storel_epi64(p, res_8bit0); + _mm_storel_epi64(p1, res_8bit1); + } + } +} + +static INLINE void warp_vertical_filter_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + int k, row = 0; + __m256i src[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const 
__m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + int sy = sy4 + delta * (k + 4); + __m256i coeffs[8]; + prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs); + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + + row += 1; + } +} + +static INLINE void warp_vertical_filter_gamma0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)gamma; + int k, row = 0; + __m256i src[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = 
_mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + int sy = sy4 + delta * (k + 4); + __m256i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs); + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void warp_vertical_filter_delta0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)delta; + int k, row = 0; + __m256i src[8], coeffs[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + __m256i res_lo, res_hi; + 
filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void warp_vertical_filter_gamma0_delta0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)gamma; + int k, row = 0; + __m256i src[8], coeffs[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = 
src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void prepare_warp_vertical_filter_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + if (gamma == 0 && delta == 0) + warp_vertical_filter_gamma0_delta0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, + round_bits_const, wt); + else if (gamma == 0 && delta != 0) + warp_vertical_filter_gamma0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, + round_bits_const, wt); + else if (gamma != 0 && delta == 0) + warp_vertical_filter_delta0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, + round_bits_const, wt); + else + warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta, + p_height, p_stride, p_width, i, j, sy4, + reduce_bits_vert, res_add_const, round_bits, + res_sub_const, round_bits_const, wt); +} + +static INLINE void prepare_warp_horizontal_filter_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + if (alpha == 0 && beta == 0) + warp_horizontal_filter_alpha0_beta0_avx2( + ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else if (alpha == 0 && beta != 0) + warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, 
iy4, sx4, + alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else if (alpha != 0 && beta == 0) + warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4, + alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else + warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, round_const, shift, + shuffle_src); +} + +void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m256i horz_out[8]; + int i, j, k; + const int bd = 8; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m256i reduce_bits_vert_const = + _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + const __m256i round_const = _mm256_set1_epi16( + (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); + const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz); + + __m256i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, + &wt); + + __m256i res_add_const_1; + if (conv_params->is_compound == 1) { + 
res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } + const int32_t const1 = alpha * (-4) + beta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const2 = gamma * (-4) + delta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1); + const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)); + const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz)); + + __m256i shuffle_src[4]; + shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0); + shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1); + shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2); + shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3); + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += const1; + sy4 += const2; + + sx4 &= ~const3; + sy4 &= ~const3; + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the 
expensive horizontal filter. + + if (ix4 <= -7) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + } else if (ix4 >= width + 6) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = + _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5); + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + int iy, sx, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + __m128i src1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_left); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + 
const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_right); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); + horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, + shuffle_src, &round_const, &shift); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src = _mm_shuffle_epi8(src, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = + _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]); + src = _mm_shuffle_epi8(src, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = _mm256_castsi128_si256(src); + __m256i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, + &round_const, &shift, row); + } else { + prepare_warp_horizontal_filter_avx2( + ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, + i, &round_const, &shift, shuffle_src); + } + + // Vertical filter + prepare_warp_vertical_filter_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, + p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, + &res_sub_const, &round_bits_const, &wt); + } + } +} diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c new file mode 100644 index 0000000000..4c05555ff7 --- /dev/null +++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c @@ -0,0 +1,908 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> +#include <smmintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/warped_motion.h" + +/* This is a modified version of 'av1_warped_filter' from warped_motion.c: + * Each coefficient is stored in 8 bits instead of 16 bits + * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 + + This is done in order to avoid overflow: Since the tap with the largest + coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation + order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular + convolve functions. + + Instead, we use the summation order + ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). + The rearrangement of coefficients in this table is so that we can get the + coefficients into the correct order more quickly. 
+*/ +/* clang-format off */ +DECLARE_ALIGNED(8, const int8_t, + av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, + { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, + { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, + { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, + { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, + { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, + { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, + { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, + { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, + { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, + { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, + { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, + { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, + { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0}, + { 2, 22, -11, 0, -7, 120, 
2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, + { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, + { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, + { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, + {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, + {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, + {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, + {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, + {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, + {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, + {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, + {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, + {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, + {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, + {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, + {-1, 
-13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, + {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, + {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, + {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, + { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, + { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, + { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, + { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, + { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, + { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, + { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, + { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, + { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, + { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, + { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, + { 0, 3, 52, -17, 0, 
-14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, + { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, + { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, + { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, + { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, + { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, + // dummy (replicate row index 191) + { 0, 0, 2, -1, 0, 0, 127, 0}, +}; +/* clang-format on */ + +// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15 +// in an SSE register into two sequences: +// 0, 2, 2, 4, ..., 12, 12, 14, +// 1, 3, 3, 5, ..., 13, 13, 15, +DECLARE_ALIGNED(16, static const uint8_t, + even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9, + 9, 11, 11, 13, 13, 15, 15, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7, 
+ 4, 5, 6, 7, 4, 5, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15, 12, 13, 14, 15 }; + +static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, + const int offset_bits_horiz, + const int reduce_bits_horiz, int k) { + const __m128i src_even = + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask)); + const __m128i src_odd = + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask)); + // The pixel order we need for 'src' is: + // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 + const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); + const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]); + // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 + const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), + _mm_srli_si128(src_odd, 4)); + const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]); + // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 + const __m128i src_13 = + _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); + const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]); + // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 + const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4), + _mm_srli_si128(src_even, 6)); + const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]); + + const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + // Note: The values res_02 + res_46 and res_13 + res_57 both + // fit into int16s at this point, but their sum may be too wide to fit + // into an int16. However, once we also add round_const, the sum of + // all of these fits into a uint16. + // + // The wrapping behaviour of _mm_add_* is used here to make sure we + // get the correct result despite converting between different + // (implicit) types. 
+ const __m128i res_even = _mm_add_epi16(res_02, res_46); + const __m128i res_odd = _mm_add_epi16(res_13, res_57); + const __m128i res = + _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); + tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); +} + +static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2 + const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3 + const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6 + const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7 + const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6 + const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); + // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6 + const __m128i 
tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); + // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7 + const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); + // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7 + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); + + // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 + coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14); + // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 + coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14); + // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 + coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15); + // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 + coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = + _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); + + // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 + coeff[0] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01)); + // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 + coeff[1] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23)); + // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 + coeff[2] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45)); + // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 + coeff[3] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67)); +} + +static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, + int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); +} + +static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp, + int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, + int p_height, int height, int i, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + int k; + 
for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void warp_horizontal_filter_alpha0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + __m128i coeff[4]; + prepare_horizontal_filter_coeff_alpha0(sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_alpha0_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + 
const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[4]; + prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void unpack_weights_and_set_round_const( + ConvolveParams *conv_params, const int round_bits, const int offset_bits, + __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) { + *res_sub_const = + _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16((int16_t)w0); + const __m128i wt1 = _mm_set1_epi16((int16_t)w1); + *wt = _mm_unpacklo_epi16(wt0, wt1); +} + +static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, + __m128i *coeffs) { + const __m128i tmp_0 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + // even coeffs + coeffs[0] = 
_mm_unpacklo_epi64(tmp_8, tmp_10); + coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10); + coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14); + coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i tmp_1 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + // odd coeffs + coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11); + coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11); + coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy, + __m128i *coeffs) { + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + // even coeffs + coeffs[0] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0)); + coeffs[1] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1)); + coeffs[2] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2)); + coeffs[3] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3)); + + // odd coeffs + coeffs[4] = coeffs[0]; + coeffs[5] = coeffs[1]; + coeffs[6] = coeffs[2]; + coeffs[7] = coeffs[3]; +} + +static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs, + __m128i *res_lo, __m128i *res_hi, + int k) { + // Load from tmp and rearrange pairs of 
consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); + const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); + const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); + const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); + + const __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]); + const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]); + const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]); + const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]); + + const __m128i res_odd = + _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + *res_lo = _mm_unpacklo_epi32(res_even, res_odd); + *res_hi = _mm_unpackhi_epi32(res_even, res_odd); +} + +static INLINE void store_vertical_filter_output( + __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const, + const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const, + uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k, + const int reduce_bits_vert, int p_stride, int p_width, + const int round_bits) { + __m128i res_lo_1 = *res_lo; + __m128i res_hi_1 = *res_hi; + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1); + __m128i res_lo_16; + if (conv_params->do_average) { + __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + const __m128i p_16 = _mm_loadl_epi64(p); + + if (conv_params->use_dist_wtd_comp_avg) { + const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); + const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1); + } + + res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const); + + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const), + round_bits); + __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16); + *(int *)dst8 = _mm_cvtsi128_si32(res_8_lo); + } else { + _mm_storel_epi64(p, temp_lo_16); + } + if (p_width > 4) { + __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1); + __m128i res_hi_16; + + if (conv_params->do_average) { + 
__m128i *const dst8_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + const __m128i p4_16 = _mm_loadl_epi64(p4); + + if (conv_params->use_dist_wtd_comp_avg) { + const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); + const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1); + } + res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const); + + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const), + round_bits); + __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16); + *(int *)dst8_4 = _mm_cvtsi128_si32(res_8_hi); + + } else { + _mm_storel_epi64(p4, temp_hi_16); + } + } + } else { + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. 
+ if (p_width == 4) { + *(int *)p = _mm_cvtsi128_si32(res_8bit); + } else { + _mm_storel_epi64(p, res_8bit); + } + } +} + +static INLINE void warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + (void)gamma; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + 
conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + (void)gamma; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, 
round_bits); + } +} + +static INLINE void prepare_warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + if (gamma == 0 && delta == 0) + warp_vertical_filter_gamma0_delta0( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, + sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits); + else if (gamma == 0 && delta != 0) + warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else if (gamma != 0 && delta == 0) + warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else + warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); +} + +static INLINE void prepare_warp_horizontal_filter( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + else if (alpha == 0 && beta != 0) + warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else if (alpha != 0 && beta == 0) + warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else + warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, 
alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); +} + +void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int bd = 8; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m128i reduce_bits_vert_const = + _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. 
+ */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + __m128i res_add_const_1; + if (conv_params->is_compound == 1) { + res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
+ if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src = _mm_shuffle_epi8(src, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src = _mm_shuffle_epi8(src, shuffle_reg_right); + } + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } + } else { + prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + } + + // Vertical filter + prepare_warp_vertical_filter( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, + j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits); + } + } +} diff --git 
a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c new file mode 100644 index 0000000000..3de630f203 --- /dev/null +++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// 128-bit xmmwords are written as [ ... ] with the MSB on the left. +// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB +// on the left. +// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be +// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ]. + +// Exploiting the range of wiener filter coefficients, +// horizontal filtering can be done in 16-bit intermediate precision. +// The details are as follows: +// Consider the horizontal wiener filter coefficients of the following form: +// [C0, C1, C2, 2^(FILTER_BITS) - 2 * (C0 + C1 + C2), C2, C1, C0] +// Subtracting 2^(FILTER_BITS) from the centre tap we get the following: +// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0] +// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 - 2 * (C0 + C1 + C2) * p3 +// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16-bit +// precision.
Finally, after rounding the above result by round_0, we multiply +// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the +// horizontal filter output. + +/* + * AVX2 Wiener convolution with the source pixel added back in, for 8-bit + * input (bd is fixed at 8). The block is processed in columns of 8 pixels: + * each column is filtered horizontally into the 16-bit scratch buffer + * im_block and then filtered vertically into dst. + * + * src, src_stride: source pixels; reads start center_tap rows above and + * center_tap columns to the left of src. + * dst, dst_stride: destination for the 8-bit output block. + * filter_x, filter_y: 16-bit filter taps; eight values are loaded from each. + * x_step_q4, y_step_q4: must both be 16 (asserted); not used otherwise. + * w, h: block width and height; w must be a multiple of 8 (asserted). + * conv_params: supplies round_0 (horizontal shift, asserted to be > 0) and + * round_1 (vertical shift). + */ +void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const WienerConvolveParams *conv_params) { + const int bd = 8; + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]); + int im_h = h + SUBPEL_TAPS - 2; + int im_stride = 8; + /* Zero MAX_SB_SIZE bytes starting at row im_h, the first row beyond the + * im_h rows of horizontal output; the two-row loads of the vertical pass + * can reach this row. NOTE(review): the length is a byte count although + * im_block holds int16_t; confirm the intended extent. */ + memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE); + int i, j; + const int center_tap = (SUBPEL_TAPS - 1) / 2; + const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; + + __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center; + + assert(conv_params->round_0 > 0); + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2); + + /* The horizontal taps are used as loaded (no offset is added to them); the + * centre pixel is added separately inside the horizontal loop. */ + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x); + const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_h[0] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_h[1] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_h[2] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_h[3] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu)); + + const __m256i round_const_h = + _mm256_set1_epi16((1 << 
(conv_params->round_0 - 1))); + const __m256i round_const_horz = + _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1))); + const __m256i clamp_low = _mm256_setzero_si256(); + const __m256i clamp_high = + _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0); + + // Add an offset to account for the "add_src" part of the convolve function. + /* Here 1 << FILTER_BITS is added to element 3 of the vertical taps. */ + const __m128i zero_128 = _mm_setzero_si128(); + const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0); + + const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff); + + /* NOTE(review): the subtracted 1 << (bd + round_1 - 1) presumably removes + * the offset that round_const_horz adds in the horizontal pass; confirm. */ + const __m256i round_const_v = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + /* Process the block in columns that are 8 pixels wide. */ + for (j = 0; j < w; j += 8) { + /* Horizontal pass: two source rows per iteration, row i in the low + * 128-bit lane and row i + 1 (when it exists) in the high lane. */ + for (i = 0; i < im_h; i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line + if (i + 1 < im_h) + data = _mm256_inserti128_si256( + data, + _mm_loadu_si128( + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), + 1); + + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); + + res = + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); + + __m256i data_0 = _mm256_shuffle_epi8(data, filt_center); + + // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to + // the result + data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0); + 
res = _mm256_add_epi16(res, data_0); + res = _mm256_add_epi16(res, round_const_horz); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped); /* one 256-bit store fills rows i and i + 1 of im_block */ + } + + /* Vertical filter: with im_stride == 8, every 256-bit load below spans two + * consecutive im_block rows, so each pass of the main loop yields two + * output rows, one per 128-bit lane. */ + { + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + __m256i s[8]; + s[0] = _mm256_unpacklo_epi16(src_0, src_1); + s[1] = _mm256_unpacklo_epi16(src_2, src_3); + s[2] = _mm256_unpacklo_epi16(src_4, src_5); + + s[4] = _mm256_unpackhi_epi16(src_0, src_1); + s[5] = _mm256_unpackhi_epi16(src_2, src_3); + s[6] = _mm256_unpackhi_epi16(src_4, src_5); + + for (i = 0; i < h - 1; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + __m256i res_a = convolve(s, coeffs_v); + __m256i res_b = convolve(s + 4, coeffs_v); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + // 8 bit conversion and saturation to uint8 + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = 
_mm256_extracti128_si256(res_8b, 1); + + // Store values into the destination buffer + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; + + _mm_storel_epi64(p_0, res_0); + _mm_storel_epi64(p_1, res_1); + + /* Slide the window of interleaved row pairs down by two rows. */ + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + /* Odd h: one output row is still pending after the two-row loop. */ + if (h - i) { + /* Gather the low lanes of the unpacklo (s[0..2]) and unpackhi (s[4..6]) + * registers so that one convolve() call covers all 8 columns. */ + s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20); + s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20); + s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20); + + const int16_t *data = &im_block[i * im_stride]; + const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride)); + const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); + + __m128i s3 = _mm_unpacklo_epi16(s6_, s7_); + __m128i s7 = _mm_unpackhi_epi16(s6_, s7_); + + s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1); + __m256i convolveres = convolve(s, coeffs_v); + + const __m256i res_round = _mm256_sra_epi32( + _mm256_add_epi32(convolveres, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + __m128i reslo = _mm256_castsi256_si128(res_round); + __m128i reshi = _mm256_extracti128_si256(res_round, 1); + const __m128i res_16bit = _mm_packus_epi32(reslo, reshi); + + // 8 bit conversion and saturation to uint8 + const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit); + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p_0, res_8b); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c new file mode 100644 index 0000000000..1c039e80c6 --- /dev/null +++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +/* + * SSE2 Wiener convolution with the source pixel added back in, for 8-bit + * input (bd is fixed at 8). A horizontal pass writes 16-bit intermediate + * rows into temp for the whole block; a vertical pass then produces dst. + * + * src, src_stride: source pixels; reads start center_tap rows above and + * center_tap columns to the left of src. + * dst, dst_stride: destination for the 8-bit output block. + * filter_x, filter_y: 16-bit filter taps; eight values are loaded from each + * and 1 << FILTER_BITS is added to element 3 of both. + * x_step_q4, y_step_q4: must both be 16 (asserted); not used otherwise. + * w, h: block width and height; w must be a multiple of 8 (asserted). + * conv_params: supplies round_0 (horizontal shift) and round_1 (vertical + * shift). + */ +void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const WienerConvolveParams *conv_params) { + const int bd = 8; + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + DECLARE_ALIGNED(16, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 2; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); /* Zeroes MAX_SB_SIZE bytes at the start of row intermediate_height, which + * the vertical pass reads as its eighth row for the last output row. + * NOTE(review): the length is a byte count although temp holds uint16_t; + * confirm the intended extent. */ + int i, j; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero = _mm_setzero_si128(); + // Add an offset to account for the "add_src" part of the convolve function. 
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); + + /* Horizontal filter: writes intermediate_height rows of clamped 16-bit + * results into temp, MAX_SB_SIZE elements per row. Within each group of 8 + * the columns are stored in the order 0, 2, 4, 6, 1, 3, 5, 7. */ + { + const __m128i coeffs_x = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + /* 1 << (round_0 - 1) is the rounding term for the shift by round_0. + * NOTE(review): the extra 1 << (bd + FILTER_BITS - 1) presumably keeps + * the intermediate values non-negative; confirm. */ + const __m128i round_const = _mm_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + conv_params->round_0); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 
= _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + conv_params->round_0); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_min_epi16( + _mm_max_epi16(res, zero), + _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1)); + _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); + } + } + } + + /* Vertical filter: output row i combines temp rows i .. i + 7. Because temp + * keeps each group of 8 columns in the order 0, 2, 4, 6, 1, 3, 5, 7, the + * low half of each 8-element load holds the even columns and the high half + * holds the odd columns. */ + { + const __m128i coeffs_y = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + /* NOTE(review): the subtracted 1 << (bd + round_1 - 1) presumably removes + * the offset added by the horizontal pass; confirm. */ + const __m128i round_const = + _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_4 = + 
_mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), conv_params->round_1); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), conv_params->round_1); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p, res_8bit); /* writes the low 8 bytes: the 8 output pixels of this group */ + } + } + } +} -- cgit v1.2.3