Diffstat (limited to 'third_party/aom/av1/common')
-rw-r--r-- | third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c | 55
-rw-r--r-- | third_party/aom/av1/common/arm/convolve_neon_dotprod.c | 49
-rw-r--r-- | third_party/aom/av1/common/av1_rtcd_defs.pl | 7
-rw-r--r-- | third_party/aom/av1/common/resize.c | 58
-rw-r--r-- | third_party/aom/av1/common/resize.h | 10
-rw-r--r-- | third_party/aom/av1/common/x86/resize_avx2.c | 411
6 files changed, 525 insertions, 65 deletions
diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c index 3aeffbb0e6..40befdf44e 100644 --- a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c +++ b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c @@ -80,17 +80,15 @@ static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; - const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2)); // Dot product constants and other shims. const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - // Fold horiz_const into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) - const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const + - (1 << ((ROUND0_BITS - 1) - 1))); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Halve the total because we will halve the filter values. + const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8_t *src_ptr = src; @@ -334,15 +332,14 @@ static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the filter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; @@ -455,15 +452,14 @@ static INLINE void dist_wtd_convolve_x_avg_neon_dotprod( // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. 
+ // Halve the total because we will halve the filter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; @@ -574,15 +570,14 @@ static INLINE void dist_wtd_convolve_x_neon_dotprod( // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the vilter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; diff --git a/third_party/aom/av1/common/arm/convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c index c29229eb09..132da2442b 100644 --- a/third_party/aom/av1/common/arm/convolve_neon_dotprod.c +++ b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c @@ -102,14 +102,12 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod( const int8x16_t filter = vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); - const int32_t correction_s32 = - vaddvq_s32(vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, FILTER_BITS)), - vpaddlq_s16(vshlq_n_s16(filter_8_15, FILTER_BITS)))); - // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right - // shift by FILTER_BITS - instead of a first rounding right shift by + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - // ROUND0_BITS. - int32x4_t correction = vdupq_n_s32(correction_s32 + (1 << (ROUND0_BITS - 1))); + int32x4_t correction = + vdupq_n_s32((128 << FILTER_BITS) + (1 << (ROUND0_BITS - 1))); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); @@ -274,16 +272,13 @@ void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, } const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - // Dot product constants. - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - // This shim of (1 << ((ROUND0_BITS - 1) - 1) enables us to use a single - // rounding right shift by FILTER_BITS - instead of a first rounding right - // shift by ROUND0_BITS, followed by second rounding right shift by - // FILTER_BITS - ROUND0_BITS. - // The outermost -1 is needed because we will halve the filter values. 
+ // Dot product constants: + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we will halve the filter values. const int32x4_t correction = - vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); if (w <= 4) { @@ -465,16 +460,13 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), vmovn_s16(x_filter_s16.val[1])); - // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - // - which are generally faster than rounding shifts on modern CPUs. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Dot product constants. - const int32x4_t correct_tmp = - vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)), - vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7))); const int32x4_t correction = - vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const); + vdupq_n_s32((128 << FILTER_BITS) + horiz_const); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); @@ -621,16 +613,15 @@ static INLINE void convolve_2d_sr_horiz_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { const int bd = 8; - // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding - // shifts - which are generally faster than rounding shifts on modern CPUs. - // The outermost -1 is needed because we halved the filter values. - const int32_t horiz_const = - ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); // Dot product constants. const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const); + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Halve the total because we will halve the filter values. 
+ const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8_t *src_ptr = src; diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl index c0831330d1..6a0043c761 100644 --- a/third_party/aom/av1/common/av1_rtcd_defs.pl +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -458,7 +458,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats"; - specialize qw/av1_compute_stats sse4_1 avx2 neon/; + specialize qw/av1_compute_stats sse4_1 avx2 neon sve/; add_proto qw/void av1_calc_proj_params/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params"; specialize qw/av1_calc_proj_params sse4_1 avx2 neon/; add_proto qw/int64_t av1_lowbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; @@ -469,7 +469,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2 neon/; add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/; - add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; + add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/; } } @@ -554,6 +554,9 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/; } +add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col"; +specialize qw/resize_vert_dir avx2/; + add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/; diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c index 441323ab1f..2b48b9fff4 100644 --- a/third_party/aom/av1/common/resize.c +++ b/third_party/aom/av1/common/resize.c @@ -18,6 +18,7 @@ #include <string.h> #include "config/aom_config.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include 
"aom_dsp/flow_estimation/corner_detect.h" @@ -216,10 +217,6 @@ const int16_t av1_resize_filter_normative[( // Filters for interpolation (full-band) - no filtering for integer pixels #define filteredinterp_filters1000 av1_resize_filter_normative -// Filters for factor of 2 downsampling. -static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; -static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; - static const InterpKernel *choose_interp_filter(int in_length, int out_length) { int out_length16 = out_length * 16; if (out_length16 >= in_length * 16) @@ -524,6 +521,59 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { } } +bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int width2, int start_col) { + bool mem_status = true; + uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height); + uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2); + if (arrbuf == NULL || arrbuf2 == NULL) { + mem_status = false; + goto Error; + } + + for (int i = start_col; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + down2_symeven(arrbuf, height, arrbuf2); + fill_arr_to_col(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(arrbuf); + aom_free(arrbuf2); + return mem_status; +} + +void resize_horz_dir(const uint8_t *const input, int in_stride, uint8_t *intbuf, + int height, int filtered_length, int width2) { + for (int i = 0; i < height; ++i) + down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i); +} + +bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride) { + uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(*intbuf) * width2 * height); + if (intbuf == NULL) { + return false; + } + + // Resize in the horizontal direction + resize_horz_dir(input, in_stride, intbuf, height, width, width2); + // Resize in the vertical direction + bool mem_status = resize_vert_dir(intbuf, output, out_stride, height, height2, + width2, 0 /*start_col*/); + aom_free(intbuf); + return mem_status; +} + +// Check if both the output width and height are half of input width and +// height respectively. +bool should_resize_by_half(int height, int width, int height2, int width2) { + const bool is_width_by_2 = get_down2_length(width, 1) == width2; + const bool is_height_by_2 = get_down2_length(height, 1) == height2; + return (is_width_by_2 && is_height_by_2); +} + bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h index d573a538bf..de71f5d539 100644 --- a/third_party/aom/av1/common/resize.h +++ b/third_party/aom/av1/common/resize.h @@ -20,6 +20,10 @@ extern "C" { #endif +// Filters for factor of 2 downsampling. 
+static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; +static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; + bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); @@ -93,6 +97,12 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom); void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, bool alloc_pyramid); +bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride); + +bool should_resize_by_half(int height, int width, int height2, int width2); + // Returns 1 if a superres upscaled frame is scaled and 0 otherwise. static INLINE int av1_superres_scaled(const AV1_COMMON *cm) { // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling diff --git a/third_party/aom/av1/common/x86/resize_avx2.c b/third_party/aom/av1/common/x86/resize_avx2.c new file mode 100644 index 0000000000..c44edb88d9 --- /dev/null +++ b/third_party/aom/av1/common/x86/resize_avx2.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> +#include <string.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/resize.h" + +#include "aom_dsp/x86/synonyms.h" + +#define CAST_HI(x) _mm256_castsi128_si256(x) +#define CAST_LOW(x) _mm256_castsi256_si128(x) + +#define PROCESS_RESIZE_Y_WD16 \ + const int idx1 = AOMMIN(height - 1, i + 5); \ + const int idx2 = AOMMIN(height - 1, i + 6); \ + l6 = l10; \ + l7 = l11; \ + l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride)); \ + l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride)); \ + \ + /* g0... g15 | i0... i15 */ \ + const __m256i s68 = \ + _mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20); \ + /* h0... h15 | j0... j15 */ \ + const __m256i s79 = \ + _mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20); \ + \ + /* g0h0... g7g7 | i0j0... i7j */ \ + s[3] = _mm256_unpacklo_epi8(s68, s79); \ + /* g8h8... g15g15 | i8j8... i15j15 */ \ + s[8] = _mm256_unpackhi_epi8(s68, s79); \ + \ + __m256i res_out[2] = { 0 }; \ + resize_y_convolve(s, coeffs_y, res_out); \ + \ + /* r00... r07 */ \ + __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ + /* r20... r27 */ \ + __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \ + \ + res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \ + res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ + \ + __m256i res_out_b[2] = { 0 }; \ + resize_y_convolve(s + 5, coeffs_y, res_out_b); \ + \ + /* r08... r015 */ \ + __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \ + /* r28... r215 */ \ + __m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \ + res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits); \ + res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits); \ + \ + /* r00... r03 r20... r23 | r04... r07 r24... 
r27 */ \ + __m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \ + /* r08... r012 r28... r212 | r013... r015 r213... r215 */ \ + __m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2); \ + /* r00... r07 | r20... r27 */ \ + res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8); \ + /* r08... r015 | r28... r215 */ \ + res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8); \ + /* r00... r015 | r20... r215 */ \ + res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1); \ + res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel); \ + res_8bit0 = _mm256_max_epu8(res_8bit0, zero); + +#define PROCESS_RESIZE_Y_WD8 \ + const int idx1 = AOMMIN(height - 1, i + 5); \ + const int idx2 = AOMMIN(height - 1, i + 6); \ + l6 = l10; \ + l7 = l11; \ + l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride)); \ + l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride)); \ + \ + /* g0h0... g7h7 */ \ + s67 = _mm_unpacklo_epi8(l6, l7); \ + /* i0j0...i7j7 */ \ + __m128i s89 = _mm_unpacklo_epi8(l8, l9); \ + \ + /* g0h0...g7g7 | i0j0...i7j7 */ \ + s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \ + \ + __m256i res_out[2] = { 0 }; \ + resize_y_convolve(s, coeffs_y, res_out); \ + \ + /* r00... r07 */ \ + __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ + /* r20...r27 */ \ + __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \ + res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \ + res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ + \ + /* r00...r03 r20...r23 | r04...r07 r24...r27 */ \ + res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \ + /* r00...r07 | r20...r27 */ \ + res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8); \ + res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1); \ + res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \ + res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero); + +static INLINE void resize_y_convolve(const __m256i *const s, + const __m256i *const coeffs, + __m256i *res_out) { + const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]); + const __m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]); + + const __m256i dst_0 = _mm256_add_epi16(res_0, res_1); + const __m256i dst_1 = _mm256_add_epi16(res_2, res_3); + // The sum of convolve operation crosses signed 16bit. Hence, the addition + // should happen in 32bit. + const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0)); + const __m256i dst_01 = + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1)); + const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1)); + const __m256i dst_11 = + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1)); + + res_out[0] = _mm256_add_epi32(dst_00, dst_10); + res_out[1] = _mm256_add_epi32(dst_01, dst_11); +} + +static INLINE void prepare_filter_coeffs(const int16_t *filter, + __m256i *const coeffs /* [4] */) { + // f0 f1 f2 f3 x x x x + const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter); + // f0 f1 f2 f3 f0 f1 f2 f3 + const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44); + // f0 f1 f2 f3 f1 f0 f3 f2 + const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1); + + const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1); + + // f0 f1 f0 f1 .. + coeffs[2] = _mm256_broadcastw_epi16(filter_8bit); + // f2 f3 f2 f3 .. 
+ coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2)); + // f3 f2 f3 f2 .. + coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6)); + // f1 f0 f1 f0 .. + coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4)); +} + +bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int stride, int start_col) { + assert(start_col <= stride); + // For the GM tool, the input layer height or width is assured to be an even + // number. Hence the function 'down2_symodd()' is not invoked and SIMD + // optimization of the same is not implemented. + // When the input height is less than 8 and even, the potential input + // heights are limited to 2, 4, or 6. These scenarios require seperate + // handling due to padding requirements. Invoking the C function here will + // eliminate the need for conditional statements within the subsequent SIMD + // code to manage these cases. + if (height & 1 || height < 8) { + return resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, start_col); + } + + __m256i s[10], coeffs_y[4]; + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const uint8_t max_pixel = 255; + const __m256i clip_pixel = _mm256_set1_epi8(max_pixel); + const __m256i zero = _mm256_setzero_si256(); + + prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y); + + const int num_col16 = stride / 16; + int remain_col = stride % 16; + // The core vertical SIMD processes 4 input rows simultaneously to generate + // output corresponding to 2 rows. To streamline the core loop and eliminate + // the need for conditional checks, the remaining rows (4 or 6) are processed + // separately. + const int remain_row = (height % 4 == 0) ? 4 : 6; + + for (int j = start_col; j < stride - remain_col; j += 16) { + const uint8_t *data = &intbuf[j]; + const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride)); + // Padding top 3 rows with the last available row at the top. 
+ const __m128i l0 = l3; + const __m128i l1 = l3; + const __m128i l2 = l3; + const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride)); + + __m128i l6, l7, l8, l9; + __m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride)); + __m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride)); + __m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride)); + + // a0...a15 | c0...c15 + const __m256i s02 = + _mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20); + // b0...b15 | d0...d15 + const __m256i s13 = + _mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20); + // c0...c15 | e0...e15 + const __m256i s24 = + _mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20); + // d0...d15 | f0...f15 + const __m256i s35 = + _mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20); + // e0...e15 | g0...g15 + const __m256i s46 = + _mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20); + // f0...f15 | h0...h15 + const __m256i s57 = + _mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20); + + // a0b0...a7b7 | c0d0...c7d7 + s[0] = _mm256_unpacklo_epi8(s02, s13); + // c0d0...c7d7 | e0f0...e7f7 + s[1] = _mm256_unpacklo_epi8(s24, s35); + // e0f0...e7f7 | g0h0...g7h7 + s[2] = _mm256_unpacklo_epi8(s46, s57); + + // a8b8...a15b15 | c8d8...c15d15 + s[5] = _mm256_unpackhi_epi8(s02, s13); + // c8d8...c15d15 | e8f8...e15f15 + s[6] = _mm256_unpackhi_epi8(s24, s35); + // e8f8...e15f15 | g8h8...g15h15 + s[7] = _mm256_unpackhi_epi8(s46, s57); + + // height to be processed here + const int process_ht = height - remain_row; + for (int i = 0; i < process_ht; i += 4) { + PROCESS_RESIZE_Y_WD16 + + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + CAST_LOW(res_8bit0)); + + _mm_storeu_si128( + (__m128i *)&output[(i / 2) * out_stride + j + out_stride], + _mm256_extracti128_si256(res_8bit0, 1)); + + // Load the required data for processing of next 4 input rows. + const int idx7 = AOMMIN(height - 1, i + 7); + const int idx8 = AOMMIN(height - 1, i + 8); + l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride)); + l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride)); + + const __m256i s810 = + _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); + const __m256i s911 = + _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_unpacklo_epi8(s810, s911); + // i8j8... i15j15 | k8l8... k15l15 + s[9] = _mm256_unpackhi_epi8(s810, s911); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + + s[5] = s[7]; + s[6] = s[8]; + s[7] = s[9]; + } + + // Process the remaining last 4 or 6 rows here. + int i = process_ht; + while (i < height - 1) { + PROCESS_RESIZE_Y_WD16 + + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + CAST_LOW(res_8bit0)); + i += 2; + + const int is_store_valid = (i < height - 1); + if (is_store_valid) + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + _mm256_extracti128_si256(res_8bit0, 1)); + i += 2; + + // Check if there is any remaining height to process. If so, perform the + // necessary data loading for processing the next row. + if (i < height - 1) { + l10 = l11 = l9; + const __m256i s810 = + _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); + const __m256i s911 = + _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_unpacklo_epi8(s810, s911); + // i8j8... i15j15 | k8l8... 
k15l15 + s[9] = _mm256_unpackhi_epi8(s810, s911); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + + s[5] = s[7]; + s[6] = s[8]; + s[7] = s[9]; + } + } + } + + if (remain_col > 7) { + const int processed_wd = num_col16 * 16; + remain_col = stride % 8; + + const uint8_t *data = &intbuf[processed_wd]; + + const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride)); + // Padding top 3 rows with available top-most row. + const __m128i l0 = l3; + const __m128i l1 = l3; + const __m128i l2 = l3; + const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride)); + + __m128i l6, l7, l8, l9; + __m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride)); + __m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride)); + __m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride)); + + // a0b0...a7b7 + const __m128i s01 = _mm_unpacklo_epi8(l0, l1); + // c0d0...c7d7 + const __m128i s23 = _mm_unpacklo_epi8(l2, l3); + // e0f0...e7f7 + const __m128i s45 = _mm_unpacklo_epi8(l4, l5); + // g0h0...g7h7 + __m128i s67 = _mm_unpacklo_epi8(l10, l11); + + // a0b0...a7b7 | c0d0...c7d7 + s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20); + // c0d0...c7d7 | e0f0...e7f7 + s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20); + // e0f0...e7f7 | g0h0...g7h7 + s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20); + + // height to be processed here + const int process_ht = height - remain_row; + for (int i = 0; i < process_ht; i += 4) { + PROCESS_RESIZE_Y_WD8 + + _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd], + CAST_LOW(res_a_round_1)); + + _mm_storel_epi64( + (__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride], + _mm256_extracti128_si256(res_a_round_1, 1)); + + const int idx7 = AOMMIN(height - 1, i + 7); + const int idx8 = AOMMIN(height - 1, i + 8); + l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride)); + l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride)); + + // k0l0... k7l7 + const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + } + + // Process the remaining last 4 or 6 rows here. + int i = process_ht; + while (i < height - 1) { + PROCESS_RESIZE_Y_WD8 + + _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd], + CAST_LOW(res_a_round_1)); + + i += 2; + + const int is_store_valid = (i < height - 1); + if (is_store_valid) + _mm_storel_epi64( + (__m128i *)&output[(i / 2) * out_stride + processed_wd], + _mm256_extracti128_si256(res_a_round_1, 1)); + i += 2; + + // Check rows are still remaining for processing. If yes do the required + // load of data for the next iteration. + if (i < height - 1) { + l10 = l11 = l9; + // k0l0... k7l7 + const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + } + } + } + + if (remain_col) + return resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, stride - remain_col); + + return true; +} |
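
The convolve changes above replace a per-call vector reduction (vaddlvq_s16 of the shifted taps) with the compile-time constant (128 << FILTER_BITS). The reason this works: the dotprod kernels subtract 128 from each unsigned sample so it fits the signed dot-product instruction, and every AV1 interpolation filter's taps sum to 1 << FILTER_BITS (128), so the value that has to be added back is always 128 * sum(taps) = 128 << FILTER_BITS. A minimal scalar sketch of that identity (not part of the patch), using a hypothetical 8-tap filter whose taps sum to 128:

#include <assert.h>
#include <stdint.h>

int main(void) {
  // Hypothetical 8-tap filter; like every AV1 interpolation filter, its taps
  // sum to 128 (1 << FILTER_BITS with FILTER_BITS == 7).
  const int16_t taps[8] = { -1, 3, -10, 35, 114, -13, 1, -1 };
  const uint8_t src[8] = { 12, 250, 3, 77, 129, 200, 64, 255 };

  int32_t ref = 0, range_shifted = 0, tap_sum = 0;
  for (int i = 0; i < 8; ++i) {
    ref += src[i] * taps[i];                    // plain convolution
    range_shifted += (src[i] - 128) * taps[i];  // what the dot product sees on (src - 128)
    tap_sum += taps[i];
  }
  // Adding back 128 * sum(taps) == 128 << FILTER_BITS restores the result.
  assert(tap_sum == 128);
  assert(ref == range_shifted + 128 * tap_sum);
  return 0;
}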
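The rewritten correction constants fold the same three terms as before (tap correction, bias, rounding shim), just expressed at full precision and halved once at the end instead of pre-halving each term. A small self-check (assuming the libaom lowbd values FILTER_BITS == 7, ROUND0_BITS == 3 and bd == 8) that the old and new expressions from dist_wtd_convolve_2d_horiz_neon_dotprod and the dist_wtd x kernels evaluate to the same numbers:

#include <assert.h>

#define FILTER_BITS 7
#define ROUND0_BITS 3

int main(void) {
  const int bd = 8;
  // The old runtime reduction vaddlvq_s16(vshlq_n_s16(filter, FILTER_BITS - 1))
  // always evaluated to 128 << (FILTER_BITS - 1), since the taps sum to 128.
  const int correction_s32 = 128 << (FILTER_BITS - 1);

  // 2D horizontal pass: old constant vs new ((128 << FILTER_BITS) + horiz_const) / 2.
  const int old_2d = correction_s32 + (1 << (bd + FILTER_BITS - 2)) +
                     (1 << ((ROUND0_BITS - 1) - 1));
  const int horiz_const =
      (1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1));
  const int new_2d = ((128 << FILTER_BITS) + horiz_const) / 2;
  assert(old_2d == new_2d);

  // dist_wtd x kernels: the same holds for any round_offset.
  for (int round_offset = 0; round_offset < 1024; ++round_offset) {
    const int old_x = correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
                      (1 << ((ROUND0_BITS - 1) - 1));
    const int new_x = ((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) +
                       (1 << (ROUND0_BITS - 1))) / 2;
    assert(old_x == new_x);
  }
  return 0;
}

The single division by two exists because the kernels halve the int8 filter taps; halving the folded constant keeps the pre-shift sums identical.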
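resize.c gains a dedicated half-resolution path: resize_horz_dir() runs down2_symeven() across each row, then resize_vert_dir() (now behind the RTCD entry added in av1_rtcd_defs.pl, with the new AVX2 specialization) runs it down each column. A hedged usage sketch; resize_for_gm() is a hypothetical wrapper, not something the patch adds:

#include <stdbool.h>
#include <stdint.h>

#include "av1/common/resize.h"

// Hypothetical caller: prefer the specialized half-resize path when the
// destination is exactly half the source in both dimensions, otherwise fall
// back to the generic scaler. Both resize functions return false on
// allocation failure.
bool resize_for_gm(const uint8_t *src, int width, int height, int src_stride,
                   uint8_t *dst, int width2, int height2, int dst_stride) {
  if (should_resize_by_half(height, width, height2, width2))
    return av1_resize_plane_to_half(src, height, width, src_stride, dst,
                                    height2, width2, dst_stride);
  return av1_resize_plane(src, height, width, src_stride, dst, height2, width2,
                          dst_stride);
}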
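av1_down2_symeven_half_filter = { 56, 12, -3, -1 } moves into resize.h because the new AVX2 file needs it: it is one half of a symmetric 8-tap filter whose doubled taps sum to 128, so results normalize with a single shift by FILTER_BITS. A scalar sketch of the interior (non-boundary) computation that down2_symeven() performs per output sample, assuming FILTER_BITS == 7; libaom's boundary handling and clip_pixel() are simplified here:

#include <stdint.h>

// One interior output sample of a factor-of-2 downscale centred at even input
// position i: taps are applied symmetrically to in[i - j] and in[i + 1 + j].
static uint8_t down2_symeven_interior(const uint8_t *in, int i) {
  static const int16_t half_filter[4] = { 56, 12, -3, -1 };
  int32_t sum = 1 << 6;  // rounding term: 1 << (FILTER_BITS - 1)
  for (int j = 0; j < 4; ++j)
    sum += (in[i - j] + in[i + 1 + j]) * half_filter[j];
  sum >>= 7;  // FILTER_BITS; the doubled taps sum to 128
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}

int main(void) {
  const uint8_t row[12] = { 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120 };
  // Between samples 50 and 60 the downscaled value lands on 55.
  return down2_symeven_interior(row, 4) == 55 ? 0 : 1;
}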
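resize_vert_dir_avx2() pairs input rows so that _mm256_maddubs_epi16 can apply two taps at a time (hence the f3f2 / f1f0 / f0f1 / f2f3 broadcasts built by prepare_filter_coeffs), widens to 32 bits because the paired 16-bit products can overflow, and emits two output rows per iteration. Frame edges are handled by replicating the first and last rows (the l0 = l1 = l2 = l3 padding and the AOMMIN(height - 1, ...) clamps). A scalar model of what it computes for one output pixel, under the same assumptions (FILTER_BITS == 7):

#include <stdint.h>

static uint8_t vert_down2_pixel(const uint8_t *buf, int stride, int height,
                                int col, int out_row) {
  static const int16_t half_filter[4] = { 56, 12, -3, -1 };
  const int i = 2 * out_row;
  int32_t sum = 1 << 6;  // 1 << (FILTER_BITS - 1)
  for (int j = 0; j < 4; ++j) {
    const int above = (i - j < 0) ? 0 : i - j;                            // replicate top row
    const int below = (i + 1 + j > height - 1) ? height - 1 : i + 1 + j;  // replicate bottom row
    sum += (buf[above * stride + col] + buf[below * stride + col]) * half_filter[j];
  }
  sum >>= 7;  // FILTER_BITS
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}

int main(void) {
  (void)vert_down2_pixel;
  return 0;
}

The AVX2 loops compute exactly this for 16 (or 8) columns at a time; odd heights, heights below 8, and the final stride % 8 columns still go through resize_vert_dir_c().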