From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 02:47:55 +0200 Subject: Adding upstream version 124.0.1. Signed-off-by: Daniel Baumann --- .../aom/av1/encoder/arm/neon/encodetxb_neon.c | 646 +++++++++++++++++++++ 1 file changed, 646 insertions(+) create mode 100644 third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c (limited to 'third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c') diff --git a/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c new file mode 100644 index 0000000000..582863a27c --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c @@ -0,0 +1,646 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "av1/common/txb_common.h" +#include "av1/encoder/encodetxb.h" + +void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = height + TX_PAD_HOR; + memset(levels - TX_PAD_TOP * stride, 0, + sizeof(*levels) * TX_PAD_TOP * stride); + memset(levels + stride * width, 0, + sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); + + const int32x4_t zeros = vdupq_n_s32(0); + int i = 0; + uint8_t *ls = levels; + const tran_low_t *cf = coeff; + if (height == 4) { + do { + const int32x4_t coeffA = vld1q_s32(cf); + const int32x4_t coeffB = vld1q_s32(cf + height); + const int16x8_t coeffAB = + vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB)); + const int16x8_t absAB = vqabsq_s16(coeffAB); + const int8x8_t absABs = vqmovn_s16(absAB); +#if AOM_ARCH_AARCH64 + const int8x16_t absAB8 = + vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros))); + const uint8x16_t lsAB = + vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros)); +#else + const int32x2x2_t absAB8 = + vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros)); + const uint8x16_t lsAB = + vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1])); +#endif + vst1q_u8(ls, lsAB); + ls += (stride << 1); + cf += (height << 1); + i += 2; + } while (i < width); + } else if (height == 8) { + do { + const int16x8_t coeffAB = load_tran_low_to_s16q(cf); + const int16x8_t absAB = vqabsq_s16(coeffAB); + const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8( + vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros)))); + vst1q_u8(ls, absAB8); + ls += stride; + cf += height; + i += 1; + } while (i < width); + } else { + do { + int j = 0; + do { + const int16x8_t coeffAB = load_tran_low_to_s16q(cf); + const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8); + const int16x8_t absAB = vqabsq_s16(coeffAB); + const int16x8_t absCD = vqabsq_s16(coeffCD); + const uint8x16_t absABCD = vreinterpretq_u8_s8( + vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD))); + vst1q_u8((ls + j), absABCD); + j += 16; + cf += 16; + } while (j < height); + *(int32_t *)(ls + height) = 0; + ls += stride; + i += 1; + } while (i < width); + } +} + +// get_4_nz_map_contexts_2d coefficients: +static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = { + { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 }, + { 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21 } +}; + +// get_4_nz_map_contexts_hor coefficients: +/* clang-format off */ +#define SIG_COEF_CONTEXTS_2D_X4_051010 \ + (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \ + ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24)) +/* clang-format on */ + +// get_4_nz_map_contexts_ver coefficients: +static const DECLARE_ALIGNED(16, uint8_t, c_4_po_hor[16]) = { + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 +}; + +// get_8_coeff_contexts_2d coefficients: +// if (width == 8) +static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = { + { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 }, + { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 } +}; +// if (width < 8) +static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = { + { 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21 }, + { 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21 } +}; + +// if (width > 8) +static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = { + { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }, + { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 } +}; + +// get_4_nz_map_contexts_ver coefficients: +static const DECLARE_ALIGNED(16, uint8_t, c_8_po_ver[16]) = { + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 +}; + +// get_16n_coeff_contexts_2d coefficients: +// real_width == real_height +static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = { + { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } +}; + +// real_width < real_height +static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = { + { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } +}; + +// real_width > real_height +static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = { + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }, + { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } +}; + +// get_16n_coeff_contexts_hor coefficients: +static const DECLARE_ALIGNED(16, uint8_t, c_16_po_ver[16]) = { + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 +}; + +// end of coefficients declaration area + +static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src, + const int byte_stride) { +#if AOM_ARCH_AARCH64 + uint32x4_t v_data = vld1q_u32((uint32_t *)src); + v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1); + v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2); + v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3); + + return vreinterpretq_u8_u32(v_data); +#else + return load_unaligned_u8q(src, byte_stride); +#endif +} + +static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src, + const int byte_stride) { +#if AOM_ARCH_AARCH64 + uint64x2_t v_data = vld1q_u64((uint64_t *)src); + v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1); + + return vreinterpretq_u8_u64(v_data); +#else + uint8x8_t v_data_low = vld1_u8(src); + uint8x8_t v_data_high = vld1_u8(src + byte_stride); + + return vcombine_u8(v_data_low, v_data_high); +#endif +} + +static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src, + const int byte_stride) { + (void)byte_stride; + return vld1q_u8(src); +} + +static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride, + const ptrdiff_t *const offsets, + uint8x16_t *const level) { + level[0] = load_8bit_4x4_to_1_reg(&src[1], stride); + level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride); + level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride); + level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride); + level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride); +} + +static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride, + const ptrdiff_t *const offsets, + uint8x16_t *const level) { + level[0] = load_8bit_8x2_to_1_reg(&src[1], stride); + level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride); + level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride); + level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride); + level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride); +} + +static INLINE void load_levels_16x1x5(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + uint8x16_t *const level) { + level[0] = load_8bit_16x1_to_1_reg(&src[1], stride); + level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride); + level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride); + level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride); + level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride); +} + +static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) { + const uint8x16_t const_3 = vdupq_n_u8(3); + const uint8x16_t const_4 = vdupq_n_u8(4); + uint8x16_t count; + + count = vminq_u8(level[0], const_3); + level[1] = vminq_u8(level[1], const_3); + level[2] = vminq_u8(level[2], const_3); + level[3] = vminq_u8(level[3], const_3); + level[4] = vminq_u8(level[4], const_3); + count = vaddq_u8(count, level[1]); + count = vaddq_u8(count, level[2]); + count = vaddq_u8(count, level[3]); + count = vaddq_u8(count, level[4]); + + count = vrshrq_n_u8(count, 1); + count = vminq_u8(count, const_4); + return count; +} + +static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *const coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const uint8x16_t pos_to_offset_large = vdupq_n_u8(21); + + uint8x16_t pos_to_offset = + (width == 4) ? vld1q_u8(c_4_po_2d[0]) : vld1q_u8(c_4_po_2d[1]); + + uint8x16_t count; + uint8x16_t level[5]; + uint8_t *cc = coeff_contexts; + + assert(!(width % 4)); + + int col = width; + do { + load_levels_4x4x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(cc, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + cc += 16; + col -= 4; + } while (col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + + const uint8x16_t pos_to_offset = + vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010)); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 4)); + + int col = width; + do { + load_levels_4x4x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + levels += 4 * stride; + coeff_contexts += 16; + col -= 4; + } while (col); +} + +static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + uint8x16_t pos_to_offset = vld1q_u8(c_4_po_hor); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 4)); + + int col = width; + do { + load_levels_4x4x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + coeff_contexts += 16; + col -= 4; + } while (col); +} + +static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + uint8_t *cc = coeff_contexts; + uint8x16_t count; + uint8x16_t level[5]; + uint8x16_t pos_to_offset[3]; + + assert(!(width % 2)); + + if (width == 8) { + pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]); + pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]); + } else if (width < 8) { + pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]); + pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]); + } else { + pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]); + pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]); + } + pos_to_offset[2] = vdupq_n_u8(21); + + int col = width; + do { + load_levels_8x2x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset[0]); + vst1q_u8(cc, count); + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += 2 * stride; + cc += 16; + col -= 2; + } while (col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + + const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_ver); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 2)); + + int col = width; + do { + load_levels_8x2x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + levels += 2 * stride; + coeff_contexts += 16; + col -= 2; + } while (col); +} + +static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0), + vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5)); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 2)); + + int col = width; + do { + load_levels_8x2x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 2 * stride; + coeff_contexts += 16; + col -= 2; + } while (col); +} + +static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, + const int real_width, + const int real_height, + const int width, const int height, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + uint8_t *cc = coeff_contexts; + int col = width; + uint8x16_t pos_to_offset[5]; + uint8x16_t pos_to_offset_large[3]; + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(height % 16)); + + pos_to_offset_large[2] = vdupq_n_u8(21); + if (real_width == real_height) { + pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]); + pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]); + pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]); + pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]); + pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = + pos_to_offset_large[2]; + } else if (real_width < real_height) { + pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]); + pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]); + pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = + vld1q_u8(c_16_po_2d_g[2]); + pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; + } else { // real_width > real_height + pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]); + pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]); + pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]); + pos_to_offset[4] = pos_to_offset_large[2]; + pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(16); + } + + do { + int h = height; + + do { + load_levels_16x1x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset[0]); + vst1q_u8(cc, count); + levels += 16; + cc += 16; + h -= 16; + pos_to_offset[0] = pos_to_offset_large[0]; + } while (h); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + pos_to_offset[2] = pos_to_offset[3]; + pos_to_offset[3] = pos_to_offset[4]; + pos_to_offset_large[0] = pos_to_offset_large[1]; + pos_to_offset_large[1] = pos_to_offset_large[2]; + levels += TX_PAD_HOR; + } while (--col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + + const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(height % 16)); + + int col = width; + do { + uint8x16_t pos_to_offset = vld1q_u8(c_16_po_ver); + + int h = height; + do { + load_levels_16x1x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 16; + coeff_contexts += 16; + h -= 16; + } while (h); + + levels += TX_PAD_HOR; + } while (--col); +} + +static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + + uint8x16_t pos_to_offset[3]; + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(height % 16)); + + pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0); + pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5); + pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + int col = width; + do { + int h = height; + do { + load_levels_16x1x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset[0]); + vst1q_u8(coeff_contexts, count); + levels += 16; + coeff_contexts += 16; + h -= 16; + } while (h); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += TX_PAD_HOR; + } while (--col); +} + +// Note: levels[] must be in the range [0, 127], inclusive. +void av1_get_nz_map_contexts_neon(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, + const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int last_idx = eob - 1; + if (!last_idx) { + coeff_contexts[0] = 0; + return; + } + + uint8_t *const coefficients = (uint8_t *const)coeff_contexts; + + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const int stride = height + TX_PAD_HOR; + ptrdiff_t offsets[3]; + + /* coeff_contexts must be 16 byte aligned. */ + assert(!((intptr_t)coeff_contexts & 0xf)); + + if (tx_class == TX_CLASS_2D) { + offsets[0] = 0 * stride + 2; + offsets[1] = 1 * stride + 1; + offsets[2] = 2 * stride + 0; + + if (height == 4) { + get_4_nz_map_contexts_2d(levels, width, offsets, coefficients); + } else if (height == 8) { + get_8_coeff_contexts_2d(levels, width, offsets, coefficients); + } else { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coefficients); + } + } else if (tx_class == TX_CLASS_HORIZ) { + offsets[0] = 2 * stride; + offsets[1] = 3 * stride; + offsets[2] = 4 * stride; + if (height == 4) { + get_4_nz_map_contexts_hor(levels, width, offsets, coefficients); + } else if (height == 8) { + get_8_coeff_contexts_hor(levels, width, offsets, coefficients); + } else { + get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients); + } + } else { // TX_CLASS_VERT + offsets[0] = 2; + offsets[1] = 3; + offsets[2] = 4; + if (height == 4) { + get_4_nz_map_contexts_ver(levels, width, offsets, coefficients); + } else if (height == 8) { + get_8_coeff_contexts_ver(levels, width, offsets, coefficients); + } else { + get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients); + } + } + + const int bhl = get_txb_bhl(tx_size); + const int pos = scan[last_idx]; + if (last_idx <= (width << bhl) / 8) + coeff_contexts[pos] = 1; + else if (last_idx <= (width << bhl) / 4) + coeff_contexts[pos] = 2; + else + coeff_contexts[pos] = 3; +} -- cgit v1.2.3