diff options
Diffstat (limited to '')
43 files changed, 2581 insertions, 819 deletions
diff --git a/third_party/aom/aom/aom_image.h b/third_party/aom/aom/aom_image.h index d5f0c087e6..68fb312222 100644 --- a/third_party/aom/aom/aom_image.h +++ b/third_party/aom/aom/aom_image.h @@ -103,7 +103,8 @@ typedef enum aom_transfer_characteristics { AOM_CICP_TC_SMPTE_428 = 17, /**< SMPTE ST 428 */ AOM_CICP_TC_HLG = 18, /**< BT.2100 HLG, ARIB STD-B67 */ AOM_CICP_TC_RESERVED_19 = 19 /**< For future use (values 19-255) */ -} aom_transfer_characteristics_t; /**< alias for enum aom_transfer_function */ +} aom_transfer_characteristics_t; /**< alias for enum + aom_transfer_characteristics */ /*!\brief List of supported matrix coefficients */ typedef enum aom_matrix_coefficients { @@ -125,7 +126,7 @@ typedef enum aom_matrix_coefficients { AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */ AOM_CICP_MC_ICTCP = 14, /**< BT.2100 ICtCp */ AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255) */ -} aom_matrix_coefficients_t; +} aom_matrix_coefficients_t; /**< alias for enum aom_matrix_coefficients */ /*!\brief List of supported color range */ typedef enum aom_color_range { @@ -144,7 +145,8 @@ typedef enum aom_chroma_sample_position { /**< sample, between two vertical samples */ AOM_CSP_COLOCATED = 2, /**< Co-located with luma(0, 0) sample */ AOM_CSP_RESERVED = 3 /**< Reserved value */ -} aom_chroma_sample_position_t; /**< alias for enum aom_transfer_function */ +} aom_chroma_sample_position_t; /**< alias for enum aom_chroma_sample_position + */ /*!\brief List of insert flags for Metadata * @@ -244,10 +246,13 @@ typedef struct aom_image { * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image (stride). + * each row in the image (stride). Must not exceed + * 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be @@ -267,10 +272,12 @@ aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt, * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of each row in the image - * (stride). + * (stride). Must not exceed 65536. * \param[in] img_data Storage to use for the image * * \return Returns a pointer to the initialized image descriptor. If the img @@ -291,12 +298,17 @@ aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image (stride). + * each row in the image (stride). Must not exceed + * 65536. * \param[in] size_align Alignment, in pixels, of the image width and height. + * Must not exceed 65536. * \param[in] border A border that is padded on four sides of the image. + * Must not exceed 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be diff --git a/third_party/aom/aom/src/aom_image.c b/third_party/aom/aom/src/aom_image.c index 3b1c33d056..1d3b7df245 100644 --- a/third_party/aom/aom/src/aom_image.c +++ b/third_party/aom/aom/src/aom_image.c @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <assert.h> #include <limits.h> #include <stdlib.h> #include <string.h> @@ -36,13 +37,20 @@ static aom_image_t *img_alloc_helper( /* NOTE: In this function, bit_depth is either 8 or 16 (if * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12. */ - unsigned int h, w, s, xcs, ycs, bps, bit_depth; - unsigned int stride_in_bytes; + unsigned int xcs, ycs, bps, bit_depth; if (img != NULL) memset(img, 0, sizeof(aom_image_t)); if (fmt == AOM_IMG_FMT_NONE) goto fail; + /* Impose maximum values on input parameters so that this function can + * perform arithmetic operations without worrying about overflows. + */ + if (d_w > 0x08000000 || d_h > 0x08000000 || buf_align > 65536 || + stride_align > 65536 || size_align > 65536 || border > 65536) { + goto fail; + } + /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; @@ -105,12 +113,17 @@ static aom_image_t *img_alloc_helper( } /* Calculate storage sizes given the chroma subsampling */ - w = align_image_dimension(d_w, xcs, size_align); - h = align_image_dimension(d_h, ycs, size_align); - - s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / bit_depth; - s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1); - stride_in_bytes = s * bit_depth / 8; + const unsigned int w = align_image_dimension(d_w, xcs, size_align); + assert(d_w <= w); + const unsigned int h = align_image_dimension(d_h, ycs, size_align); + assert(d_h <= h); + + uint64_t s = (uint64_t)w + 2 * border; + s = (fmt & AOM_IMG_FMT_PLANAR) ? s : s * bps / bit_depth; + s = s * bit_depth / 8; + s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1); + if (s > INT_MAX) goto fail; + const int stride_in_bytes = (int)s; /* Allocate the new image */ if (!img) { @@ -232,7 +245,7 @@ int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, img->planes[AOM_PLANE_Y] = data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y]; - data += (img->h + 2 * border) * img->stride[AOM_PLANE_Y]; + data += ((size_t)img->h + 2 * border) * img->stride[AOM_PLANE_Y]; unsigned int uv_border_h = border >> img->y_chroma_shift; unsigned int uv_x = x >> img->x_chroma_shift; @@ -244,14 +257,14 @@ int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, } else if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) { img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; - data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_U]; img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; } else { img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; - data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_V]; img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; @@ -291,15 +304,15 @@ void aom_img_free(aom_image_t *img) { } int aom_img_plane_width(const aom_image_t *img, int plane) { - if (plane > 0 && img->x_chroma_shift > 0) - return (img->d_w + 1) >> img->x_chroma_shift; + if (plane > 0) + return (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift; else return img->d_w; } int aom_img_plane_height(const aom_image_t *img, int plane) { - if (plane > 0 && img->y_chroma_shift > 0) - return (img->d_h + 1) >> img->y_chroma_shift; + if (plane > 0) + return (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift; else return img->d_h; } diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake index de987cbd23..27099d36b2 100644 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -205,6 +205,9 @@ if(CONFIG_AV1_ENCODER) list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SVE + "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_sve.c") endif() list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index 7e746e9cb9..b75bdc5a19 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -1799,7 +1799,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_compute_correlation sse4_1 avx2/; add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v"; - specialize qw/aom_compute_flow_at_point sse4_1 avx2 neon/; + specialize qw/aom_compute_flow_at_point sse4_1 avx2 neon sve/; } } # CONFIG_AV1_ENCODER diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c index 7441108b01..6a177b2e6b 100644 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c @@ -20,6 +20,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" @@ -31,14 +32,14 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); - int16x4_t sum; - sum = vmul_lane_s16(s0, filter_lo, 0); + int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); sum = vmla_lane_s16(sum, s1, filter_lo, 1); sum = vmla_lane_s16(sum, s2, filter_lo, 2); sum = vmla_lane_s16(sum, s5, filter_hi, 1); sum = vmla_lane_s16(sum, s6, filter_hi, 2); sum = vmla_lane_s16(sum, s7, filter_hi, 3); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); return sum; @@ -51,65 +52,56 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); - int16x8_t sum; - sum = vmulq_lane_s16(s0, filter_lo, 0); + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); return vqrshrun_n_s16(sum, FILTER_BITS); } -void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { +static INLINE void convolve8_horiz_8tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int w, + int h) { const int16x8_t filter = vld1q_s16(filter_x); - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - - (void)x_step_q4; - (void)filter_y; - (void)y_step_q4; - - src -= ((SUBPEL_TAPS / 2) - 1); - if (h == 4) { - uint8x8_t t0, t1, t2, t3, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3; load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); src += 7; do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); transpose_elems_inplace_u8_4x4(&d01, &d23); @@ -123,39 +115,40 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4; dst += 4; w -= 4; } while (w != 0); } else { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - if (w == 4) { do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3); @@ -169,48 +162,49 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 8; } while (h > 0); } else { - uint8x8_t d4, d5, d6, d7; - int16x8_t s11, s12, s13, s14; - int width; - const uint8_t *s; - uint8_t *d; - do { - load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - width = w; - s = src + 7; - d = dst; + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7; do { load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); + uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); + uint8x8_t d6 = + convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); + uint8x8_t d7 = + convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); @@ -224,6 +218,7 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s12; s5 = s13; s6 = s14; + s += 8; d += 8; width -= 8; @@ -236,6 +231,137 @@ void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t filter) { + int16x4_t sum = vmul_lane_s16(s0, filter, 0); + sum = vmla_lane_s16(sum, s1, filter, 1); + sum = vmla_lane_s16(sum, s2, filter, 2); + sum = vmla_lane_s16(sum, s3, filter, 3); + + return sum; +} + +static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x4_t filter) { + int16x8_t sum = vmulq_lane_s16(s0, filter, 0); + sum = vmlaq_lane_s16(sum, s1, filter, 1); + sum = vmlaq_lane_s16(sum, s2, filter, 2); + sum = vmlaq_lane_s16(sum, s3, filter, 3); + + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int w, + int h) { + // All filter values are even, halve to reduce intermediate precision + // requirements. + const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1); + + if (w == 4) { + do { + int16x8_t t0 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 0 * src_stride))); + int16x8_t t1 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + 1 * src_stride))); + + int16x4_t s0[4], s1[4]; + s0[0] = vget_low_s16(t0); + s0[1] = vget_low_s16(vextq_s16(t0, t0, 1)); + s0[2] = vget_low_s16(vextq_s16(t0, t0, 2)); + s0[3] = vget_low_s16(vextq_s16(t0, t0, 3)); + + s1[0] = vget_low_s16(t1); + s1[1] = vget_low_s16(vextq_s16(t1, t1, 1)); + s1[2] = vget_low_s16(vextq_s16(t1, t1, 2)); + s1[3] = vget_low_s16(vextq_s16(t1, t1, 3)); + + int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter); + int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + int16x8_t t0 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride))); + int16x8_t t1 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride))); + + s += 8; + do { + int16x8_t t2 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 0 * src_stride))); + int16x8_t t3 = + vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 1 * src_stride))); + + int16x8_t s0[4], s1[4]; + s0[0] = t0; + s0[1] = vextq_s16(t0, t2, 1); + s0[2] = vextq_s16(t0, t2, 2); + s0[3] = vextq_s16(t0, t2, 3); + + s1[0] = t1; + s1[1] = vextq_s16(t1, t3, 1); + s1[2] = vextq_s16(t1, t3, 2); + s1[3] = vextq_s16(t1, t3, 3); + + uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter); + uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter); + + store_u8_8x2(d, dst_stride, d0, d1); + + t0 = t2; + t1 = t3; + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } +} + +void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)x_step_q4; + (void)filter_y; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1); + + if (get_filter_taps_convolve8(filter_x) <= 4) { + convolve8_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w, + h); + } else { + convolve8_horiz_8tap_neon(src, src_stride, dst, dst_stride, filter_x, w, h); + } +} + void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -253,33 +379,33 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; if (w == 4) { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); - s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); - s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); src += 7 * src_stride; do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -291,42 +417,40 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int height; - const uint8_t *s; - uint8_t *d; - do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - height = h; - s = src + 7 * src_stride; - d = dst; + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int height = h; + const uint8_t *s = src + 7 * src_stride; + uint8_t *d = dst; do { load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -337,6 +461,7 @@ void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; d += 4 * dst_stride; height -= 4; diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c index c82125ba17..120c479798 100644 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -24,81 +24,72 @@ #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +// Filter values always sum to 128. +#define FILTER_WEIGHT 128 + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples, - const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[2]; - int32x4_t sum; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); - sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1); - - /* Further narrowing and packing is performed by the caller. */ +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1); + + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, - const int8x8_t filter, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); - sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0); - sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -108,10 +99,6 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -123,19 +110,17 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1); if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -145,23 +130,20 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h > 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { - width = w; - s = src; - d = dst; + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -177,83 +159,88 @@ void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, } static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); + int8x8_t a3, int8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0]; + + *b = vreinterpretq_s8_s16(a0123); } static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b0, - int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); + int8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8x2_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)); + + *b0 = vreinterpretq_s8_s16(a0123.val[0]); + *b1 = vreinterpretq_s8_s16(a0123.val[1]); } -static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t filter) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum; +static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vdotq_lane_s32(correction, samples_lo, filter, 0); - sum = vdotq_lane_s32(sum, samples_hi, filter, 1); + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0); + sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, - const int8x16_t samples0_hi, - const int8x16_t samples1_lo, - const int8x16_t samples1_hi, - const int32x4_t correction, - const int8x8_t filter) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0); - sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0); - sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. + + // Accumulate into 128 * FILTER_WEIGHT to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0); + sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0); + sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -263,10 +250,7 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); int8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); @@ -279,62 +263,58 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. int8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s4567, s5678, s6789, s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. samples_LUT.val[0] = s3456; samples_LUT.val[1] = s78910; s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - int16x4_t d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); - int16x4_t d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); - int16x4_t d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); - int16x4_t d3 = - convolve8_4_sdot_partial(s3456, s78910, correction, filter); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -345,8 +325,6 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - do { int height = h; const uint8_t *s = src; @@ -356,44 +334,38 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. samples_LUT.val[0] = s3456_lo; samples_LUT.val[1] = s78910_lo; s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); @@ -406,19 +378,19 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - uint8x8_t d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, - s4567_hi, correction, filter); - uint8x8_t d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, - s5678_hi, correction, filter); - uint8x8_t d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, - s6789_hi, correction, filter); - uint8x8_t d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, - s78910_hi, correction, filter); + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c index df6e4d2ab5..68e031461d 100644 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -23,69 +23,60 @@ #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples, - const int8x8_t filter, - const uint8x16x2_t permute_tbl) { - uint8x16_t permuted_samples[2]; - int32x4_t sum; - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; - sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0); - sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1); + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples, - const int8x8_t filter, - const uint8x16x3_t permute_tbl) { - uint8x16_t permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); - - /* First 4 output values. */ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0); - sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1); - /* Second 4 output values. */ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0); - sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; + + // First 4 output values. + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -95,7 +86,6 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); - uint8x16_t s0, s1, s2, s3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -107,19 +97,17 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1); if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; - + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filter, perm_tbl); - t1 = convolve8_4_usdot(s1, filter, perm_tbl); - t2 = convolve8_4_usdot(s2, filter, perm_tbl); - t3 = convolve8_4_usdot(s3, filter, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -129,23 +117,20 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h > 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { - width = w; - s = src; - d = dst; + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filter, perm_tbl); - d1 = convolve8_8_usdot(s1, filter, perm_tbl); - d2 = convolve8_8_usdot(s2, filter, perm_tbl); - d3 = convolve8_8_usdot(s3, filter, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -162,79 +147,83 @@ void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b = vqtbl2q_u8(samples, permute_tbl); + uint8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0]; + + *b = vreinterpretq_u8_u16(a0123); } static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); + uint8x16_t *b0, uint8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8x2_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)); + + *b0 = vreinterpretq_u8_u16(a0123.val[0]); + *b1 = vreinterpretq_u8_u16(a0123.val[1]); } -static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, - const uint8x16_t samples_hi, - const int8x8_t filter) { - /* Sample permutation is performed by the caller. */ - int32x4_t sum; - - sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0); - sum = vusdotq_lane_s32(sum, samples_hi, filter, 1); +static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - /* Further narrowing and packing is performed by the caller. */ + // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, - const uint8x16_t samples0_hi, - const uint8x16_t samples1_lo, - const uint8x16_t samples1_hi, - const int8x8_t filter) { - /* Sample permutation is performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; - - /* First 4 output values. */ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0); - sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1); - /* Second 4 output values. */ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0); - sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); +static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + + // First 4 output values. + int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } @@ -244,7 +233,7 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); uint8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); @@ -257,47 +246,44 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. uint8x16_t s0123, s1234, s2345, s3456; - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); uint8x16_t s4567, s5678, s6789, s78910; - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. samples_LUT.val[0] = s3456; samples_LUT.val[1] = s78910; s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - int16x4_t d0 = convolve8_4_usdot_partial(s0123, s4567, filter); - int16x4_t d1 = convolve8_4_usdot_partial(s1234, s5678, filter); - int16x4_t d2 = convolve8_4_usdot_partial(s2345, s6789, filter); - int16x4_t d3 = convolve8_4_usdot_partial(s3456, s78910, filter); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -308,8 +294,6 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - do { int height = h; const uint8_t *s = src; @@ -319,19 +303,14 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t s7, s8, s9, s10; @@ -339,10 +318,9 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s78910_lo, s78910_hi; - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ + // Merge new data into block from previous iteration. samples_LUT.val[0] = s3456_lo; samples_LUT.val[1] = s78910_lo; s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); @@ -355,19 +333,19 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - uint8x8_t d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, - s4567_hi, filter); - uint8x8_t d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, - s5678_hi, filter); - uint8x8_t d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, - s6789_hi, filter); - uint8x8_t d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, - s78910_hi, filter); + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; diff --git a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c index 62729133e3..5758d2887f 100644 --- a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c +++ b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c @@ -16,36 +16,10 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/flow_estimation/arm/disflow_neon.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" -static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { - // Check that the fractional position is in range. - // - // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. - // Mathematically, this implies that 0 <= x < 1. However, in practice it is - // possible to have x == 1 due to floating point rounding. This is fine, - // and we still interpolate correctly if we allow x = 1. - assert(0 <= x && x <= 1); - - double x2 = x * x; - double x3 = x2 * x; - kernel[0] = -0.5 * x + x2 - 0.5 * x3; - kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; - kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; - kernel[3] = -0.5 * x2 + 0.5 * x3; -} - -static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { - double kernel_dbl[4]; - get_cubic_kernel_dbl(x, kernel_dbl); - - kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); - kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); - kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); - kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); -} - // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. // This function returns the sum of squared pixel differences between @@ -157,82 +131,6 @@ static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, } } -static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - - // Horizontal filter, using kernel {1, 0, -1}. - const uint8_t *src_start = src - 1 * src_stride - 1; - - for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { - uint8x16_t s = vld1q_u8(src_start + i * src_stride); - uint8x8_t s0 = vget_low_u8(s); - uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); - - // Given that the kernel is {1, 0, -1} the convolution is a simple - // subtraction. - int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2)); - - vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff); - } - - // Vertical filter, using kernel {1, 2, 1}. - // This kernel can be split into two 2-taps kernels of value {1, 1}. - // That way we need only 3 add operations to perform the convolution, one of - // which can be reused for the next line. - int16x8_t s0 = vld1q_s16(tmp); - int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE); - int16x8_t sum01 = vaddq_s16(s0, s1); - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE); - - int16x8_t sum12 = vaddq_s16(s1, s2); - int16x8_t sum = vaddq_s16(sum01, sum12); - - vst1q_s16(dst + i * dst_stride, sum); - - sum01 = sum12; - s1 = s2; - } -} - -static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - - // Horizontal filter, using kernel {1, 2, 1}. - // This kernel can be split into two 2-taps kernels of value {1, 1}. - // That way we need only 3 add operations to perform the convolution. - const uint8_t *src_start = src - 1 * src_stride - 1; - - for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { - uint8x16_t s = vld1q_u8(src_start + i * src_stride); - uint8x8_t s0 = vget_low_u8(s); - uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1)); - uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); - - uint16x8_t sum01 = vaddl_u8(s0, s1); - uint16x8_t sum12 = vaddl_u8(s1, s2); - uint16x8_t sum = vaddq_u16(sum01, sum12); - - vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum)); - } - - // Vertical filter, using kernel {1, 0, -1}. - // Load the whole block at once to avoid redundant loads during convolution. - int16x8_t t[10]; - load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4], - &t[5], &t[6], &t[7], &t[8], &t[9]); - - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - // Given that the kernel is {1, 0, -1} the convolution is a simple - // subtraction. - int16x8_t diff = vsubq_s16(t[i], t[i + 2]); - - vst1q_s16(dst + i * dst_stride, diff); - } -} - // Computes the components of the system of equations used to solve for // a flow vector. // diff --git a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.h b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.h new file mode 100644 index 0000000000..d991a13460 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ +#define AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ + +#include "aom_dsp/flow_estimation/disflow.h" + +#include <arm_neon.h> +#include <math.h> + +#include "aom_dsp/arm/mem_neon.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { + // Check that the fractional position is in range. + // + // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. + // Mathematically, this implies that 0 <= x < 1. However, in practice it is + // possible to have x == 1 due to floating point rounding. This is fine, + // and we still interpolate correctly if we allow x = 1. + assert(0 <= x && x <= 1); + + double x2 = x * x; + double x3 = x2 * x; + kernel[0] = -0.5 * x + x2 - 0.5 * x3; + kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; + kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; + kernel[3] = -0.5 * x2 + 0.5 * x3; +} + +static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { + double kernel_dbl[4]; + get_cubic_kernel_dbl(x, kernel_dbl); + + kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); + kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); + kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); + kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); +} + +static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + + // Horizontal filter, using kernel {1, 0, -1}. + const uint8_t *src_start = src - 1 * src_stride - 1; + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { + uint8x16_t s = vld1q_u8(src_start + i * src_stride); + uint8x8_t s0 = vget_low_u8(s); + uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); + + // Given that the kernel is {1, 0, -1} the convolution is a simple + // subtraction. + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2)); + + vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff); + } + + // Vertical filter, using kernel {1, 2, 1}. + // This kernel can be split into two 2-taps kernels of value {1, 1}. + // That way we need only 3 add operations to perform the convolution, one of + // which can be reused for the next line. + int16x8_t s0 = vld1q_s16(tmp); + int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE); + int16x8_t sum01 = vaddq_s16(s0, s1); + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE); + + int16x8_t sum12 = vaddq_s16(s1, s2); + int16x8_t sum = vaddq_s16(sum01, sum12); + + vst1q_s16(dst + i * dst_stride, sum); + + sum01 = sum12; + s1 = s2; + } +} + +static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + + // Horizontal filter, using kernel {1, 2, 1}. + // This kernel can be split into two 2-taps kernels of value {1, 1}. + // That way we need only 3 add operations to perform the convolution. + const uint8_t *src_start = src - 1 * src_stride - 1; + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { + uint8x16_t s = vld1q_u8(src_start + i * src_stride); + uint8x8_t s0 = vget_low_u8(s); + uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1)); + uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); + + uint16x8_t sum01 = vaddl_u8(s0, s1); + uint16x8_t sum12 = vaddl_u8(s1, s2); + uint16x8_t sum = vaddq_u16(sum01, sum12); + + vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum)); + } + + // Vertical filter, using kernel {1, 0, -1}. + // Load the whole block at once to avoid redundant loads during convolution. + int16x8_t t[10]; + load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4], + &t[5], &t[6], &t[7], &t[8], &t[9]); + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + // Given that the kernel is {1, 0, -1} the convolution is a simple + // subtraction. + int16x8_t diff = vsubq_s16(t[i], t[i + 2]); + + vst1q_s16(dst + i * dst_stride, diff); + } +} + +#endif // AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ diff --git a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_sve.c b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_sve.c new file mode 100644 index 0000000000..7b01e90d12 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_sve.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/flow_estimation/disflow.h" + +#include <arm_neon.h> +#include <arm_sve.h> +#include <math.h> + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/flow_estimation/arm/disflow_neon.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; + +// Compare two regions of width x height pixels, one rooted at position +// (x, y) in src and the other at (x + u, y + v) in ref. +// This function returns the sum of squared pixel differences between +// the two regions. +static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, + int width, int height, int stride, int x, + int y, double u, double v, int16_t *dt) { + // Split offset into integer and fractional parts, and compute cubic + // interpolation kernels + const int u_int = (int)floor(u); + const int v_int = (int)floor(v); + const double u_frac = u - floor(u); + const double v_frac = v - floor(v); + + int h_kernel[4]; + int v_kernel[4]; + get_cubic_kernel_int(u_frac, h_kernel); + get_cubic_kernel_int(v_frac, v_kernel); + + int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; + + // Clamp coordinates so that all pixels we fetch will remain within the + // allocated border region, but allow them to go far enough out that + // the border pixels' values do not change. + // Since we are calculating an 8x8 block, the bottom-right pixel + // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic + // interpolation has 4 taps, meaning that the output of pixel + // (x_w, y_w) depends on the pixels in the range + // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). + // + // Thus the most extreme coordinates which will be fetched are + // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). + const int x0 = clamp(x + u_int, -9, width); + const int y0 = clamp(y + v_int, -9, height); + + // Horizontal convolution. + const uint8_t *ref_start = ref + (y0 - 1) * stride + (x0 - 1); + const int16x4_t h_kernel_s16 = vmovn_s32(vld1q_s32(h_kernel)); + const int16x8_t h_filter = vcombine_s16(h_kernel_s16, vdup_n_s16(0)); + const uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 3; ++i) { + svuint16_t r0 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 0); + svuint16_t r1 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 1); + svuint16_t r2 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 2); + svuint16_t r3 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 3); + + int16x8_t s0 = vreinterpretq_s16_u16(svget_neonq_u16(r0)); + int16x8_t s1 = vreinterpretq_s16_u16(svget_neonq_u16(r1)); + int16x8_t s2 = vreinterpretq_s16_u16(svget_neonq_u16(r2)); + int16x8_t s3 = vreinterpretq_s16_u16(svget_neonq_u16(r3)); + + int64x2_t sum04 = aom_svdot_lane_s16(vdupq_n_s64(0), s0, h_filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(vdupq_n_s64(0), s1, h_filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(vdupq_n_s64(0), s2, h_filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(vdupq_n_s64(0), s3, h_filter, 0); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + // 6 is the maximum allowable number of extra bits which will avoid + // the intermediate values overflowing an int16_t. The most extreme + // intermediate value occurs when: + // * The input pixels are [0, 255, 255, 0] + // * u_frac = 0.5 + // In this case, the un-scaled output is 255 * 1.125 = 286.875. + // As an integer with 6 fractional bits, that is 18360, which fits + // in an int16_t. But with 7 fractional bits it would be 36720, + // which is too large. + int16x8_t res = vcombine_s16(vrshrn_n_s32(res0, DISFLOW_INTERP_BITS - 6), + vrshrn_n_s32(res1, DISFLOW_INTERP_BITS - 6)); + + res = aom_tbl_s16(res, idx); + + vst1q_s16(tmp_ + i * DISFLOW_PATCH_SIZE, res); + } + + // Vertical convolution. + int16x4_t v_filter = vmovn_s32(vld1q_s32(v_kernel)); + int16_t *tmp_start = tmp_ + DISFLOW_PATCH_SIZE; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { + int16x8_t t0 = vld1q_s16(tmp_start + (i - 1) * DISFLOW_PATCH_SIZE); + int16x8_t t1 = vld1q_s16(tmp_start + i * DISFLOW_PATCH_SIZE); + int16x8_t t2 = vld1q_s16(tmp_start + (i + 1) * DISFLOW_PATCH_SIZE); + int16x8_t t3 = vld1q_s16(tmp_start + (i + 2) * DISFLOW_PATCH_SIZE); + + int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(t0), v_filter, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t1), v_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t2), v_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t3), v_filter, 3); + + int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(t0), v_filter, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t1), v_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t2), v_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t3), v_filter, 3); + + uint8x8_t s = vld1_u8(src + (i + y) * stride + x); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, 3)); + + // This time, we have to round off the 6 extra bits which were kept + // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits + // of precision to match the scale of the dx and dy arrays. + sum_lo = vrshrq_n_s32(sum_lo, + DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); + sum_hi = vrshrq_n_s32(sum_hi, + DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); + int32x4_t err_lo = vsubw_s16(sum_lo, vget_low_s16(s_s16)); + int32x4_t err_hi = vsubw_s16(sum_hi, vget_high_s16(s_s16)); + vst1q_s16(dt + i * DISFLOW_PATCH_SIZE, + vcombine_s16(vmovn_s32(err_lo), vmovn_s32(err_hi))); + } +} + +// Computes the components of the system of equations used to solve for +// a flow vector. +// +// The flow equations are a least-squares system, derived as follows: +// +// For each pixel in the patch, we calculate the current error `dt`, +// and the x and y gradients `dx` and `dy` of the source patch. +// This means that, to first order, the squared error for this pixel is +// +// (dt + u * dx + v * dy)^2 +// +// where (u, v) are the incremental changes to the flow vector. +// +// We then want to find the values of u and v which minimize the sum +// of the squared error across all pixels. Conveniently, this fits exactly +// into the form of a least squares problem, with one equation +// +// u * dx + v * dy = -dt +// +// for each pixel. +// +// Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE, +// and absorbing the - sign elsewhere, this results in the least squares system +// +// M = |sum(dx * dx) sum(dx * dy)| +// |sum(dx * dy) sum(dy * dy)| +// +// b = |sum(dx * dt)| +// |sum(dy * dt)| +static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + double *M_inv) { + int64x2_t sum[3] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0) }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t x = vld1q_s16(dx + i * dx_stride); + int16x8_t y = vld1q_s16(dy + i * dy_stride); + + sum[0] = aom_sdotq_s16(sum[0], x, x); + sum[1] = aom_sdotq_s16(sum[1], x, y); + sum[2] = aom_sdotq_s16(sum[2], y, y); + } + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[1], sum[2]); + int32x4_t res = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + + // Apply regularization + // We follow the standard regularization method of adding `k * I` before + // inverting. This ensures that the matrix will be invertible. + // + // Setting the regularization strength k to 1 seems to work well here, as + // typical values coming from the other equations are very large (1e5 to + // 1e6, with an upper limit of around 6e7, at the time of writing). + // It also preserves the property that all matrix values are whole numbers, + // which is convenient for integerized SIMD implementation. + + double M0 = (double)vgetq_lane_s32(res, 0) + 1; + double M1 = (double)vgetq_lane_s32(res, 1); + double M2 = (double)vgetq_lane_s32(res, 2); + double M3 = (double)vgetq_lane_s32(res, 3) + 1; + + // Invert matrix M. + double det = (M0 * M3) - (M1 * M2); + assert(det >= 1); + const double det_inv = 1 / det; + + M_inv[0] = M3 * det_inv; + M_inv[1] = -M1 * det_inv; + M_inv[2] = -M2 * det_inv; + M_inv[3] = M0 * det_inv; +} + +static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + const int16_t *dt, int dt_stride, + int *b) { + int64x2_t b_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t dx16 = vld1q_s16(dx + i * dx_stride); + int16x8_t dy16 = vld1q_s16(dy + i * dy_stride); + int16x8_t dt16 = vld1q_s16(dt + i * dt_stride); + + b_s64[0] = aom_sdotq_s16(b_s64[0], dx16, dt16); + b_s64[1] = aom_sdotq_s16(b_s64[1], dy16, dt16); + } + + b_s64[0] = vpaddq_s64(b_s64[0], b_s64[1]); + vst1_s32(b, vmovn_s64(b_s64[0])); +} + +void aom_compute_flow_at_point_sve(const uint8_t *src, const uint8_t *ref, + int x, int y, int width, int height, + int stride, double *u, double *v) { + double M_inv[4]; + int b[2]; + int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + + // Compute gradients within this patch + const uint8_t *src_patch = &src[y * stride + x]; + sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE); + sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE); + + compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M_inv); + + for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { + compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt); + compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt, + DISFLOW_PATCH_SIZE, b); + + // Solve flow equations to find a better estimate for the flow vector + // at this point + const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; + const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; + *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); + *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); + + if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { + // Stop iteration when we're close to convergence + break; + } + } +} diff --git a/third_party/aom/aom_dsp/pyramid.c b/third_party/aom/aom_dsp/pyramid.c index 5de001dbd5..05ddbb2f5f 100644 --- a/third_party/aom/aom_dsp/pyramid.c +++ b/third_party/aom/aom_dsp/pyramid.c @@ -305,6 +305,7 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, // Fill in the remaining levels through progressive downsampling for (int level = already_filled_levels; level < n_levels; ++level) { + bool mem_status = false; PyramidLayer *prev_layer = &frame_pyr->layers[level - 1]; uint8_t *prev_buffer = prev_layer->buffer; int prev_stride = prev_layer->stride; @@ -315,6 +316,11 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, int this_height = this_layer->height; int this_stride = this_layer->stride; + // The width and height of the previous layer that needs to be considered to + // derive the current layer frame. + const int input_layer_width = this_width << 1; + const int input_layer_height = this_height << 1; + // Compute the this pyramid level by downsampling the current level. // // We downsample by a factor of exactly 2, clipping the rightmost and @@ -329,13 +335,30 @@ static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, // 2) Up/downsampling by a factor of 2 can be implemented much more // efficiently than up/downsampling by a generic ratio. // TODO(rachelbarker): Use optimized downsample-by-2 function - if (!av1_resize_plane(prev_buffer, this_height << 1, this_width << 1, - prev_stride, this_buffer, this_height, this_width, - this_stride)) { - // If we can't allocate memory, we'll have to terminate early + + // SIMD support has been added specifically for cases where the downsample + // factor is exactly 2. In such instances, horizontal and vertical resizing + // is performed utilizing the down2_symeven() function, which considers the + // even dimensions of the input layer. + if (should_resize_by_half(input_layer_height, input_layer_width, + this_height, this_width)) { + assert(input_layer_height % 2 == 0 && input_layer_width % 2 == 0 && + "Input width or height cannot be odd."); + mem_status = av1_resize_plane_to_half( + prev_buffer, input_layer_height, input_layer_width, prev_stride, + this_buffer, this_height, this_width, this_stride); + } else { + mem_status = av1_resize_plane(prev_buffer, input_layer_height, + input_layer_width, prev_stride, this_buffer, + this_height, this_width, this_stride); + } + + // Terminate early in cases of memory allocation failure. + if (!mem_status) { frame_pyr->filled_levels = n_levels; return -1; } + fill_border(this_buffer, this_width, this_height, this_stride); } diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index 74318de2e5..f9bc9ac733 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -46,7 +46,6 @@ static INLINE __m128i xx_loadu_128(const void *a) { return _mm_loadu_si128((const __m128i *)a); } - // _mm_loadu_si64 has been introduced in GCC 9, reimplement the function // manually on older compilers. #if !defined(__clang__) && __GNUC_MAJOR__ < 9 diff --git a/third_party/aom/aom_util/aom_pthread.h b/third_party/aom/aom_util/aom_pthread.h index 99deeb292a..e755487ae3 100644 --- a/third_party/aom/aom_util/aom_pthread.h +++ b/third_party/aom/aom_util/aom_pthread.h @@ -28,6 +28,7 @@ extern "C" { #define NOMINMAX #undef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#include <errno.h> // NOLINT #include <process.h> // NOLINT #include <stddef.h> // NOLINT #include <windows.h> // NOLINT diff --git a/third_party/aom/aom_util/aom_thread.h b/third_party/aom/aom_util/aom_thread.h index 92e162f121..80ed314752 100644 --- a/third_party/aom/aom_util/aom_thread.h +++ b/third_party/aom/aom_util/aom_thread.h @@ -21,8 +21,6 @@ extern "C" { #endif -#define MAX_NUM_THREADS 64 - // State of the worker thread object typedef enum { AVX_WORKER_STATUS_NOT_OK = 0, // object is unusable diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake index 32645f6065..b6cf974aa7 100644 --- a/third_party/aom/av1/av1.cmake +++ b/third_party/aom/av1/av1.cmake @@ -302,6 +302,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c" "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c" "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c" + "${AOM_ROOT}/av1/common/x86/resize_avx2.c" "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c" "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c" "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c") @@ -375,6 +376,7 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD list(APPEND AOM_AV1_ENCODER_INTRIN_SVE "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_sve.c" + "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_sve.c" "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_sve.c") list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32 diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c index 2b6b1504e6..39c03c9ecb 100644 --- a/third_party/aom/av1/av1_cx_iface.c +++ b/third_party/aom/av1/av1_cx_iface.c @@ -32,6 +32,7 @@ #include "av1/common/enums.h" #include "av1/common/scale.h" #include "av1/encoder/bitstream.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" #include "av1/encoder/encoder_utils.h" diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c index 3aeffbb0e6..40befdf44e 100644 --- a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c +++ b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c @@ -80,17 +80,15 @@ static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; - const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2)); // Dot product constants and other shims. const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - // Fold horiz_const into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) - const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const + - (1 << ((ROUND0_BITS - 1) - 1))); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Halve the total because we will halve the filter values. + const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8_t *src_ptr = src; @@ -334,15 +332,14 @@ static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the filter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; @@ -455,15 +452,14 @@ static INLINE void dist_wtd_convolve_x_avg_neon_dotprod( // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the filter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; @@ -574,15 +570,14 @@ static INLINE void dist_wtd_convolve_x_neon_dotprod( // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); // Fold round_offset into the dot-product filter correction constant. The - // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- - // rounding shifts - which are generally faster than rounding shifts on - // modern CPUs. (The extra -1 is needed because we halved the filter values.) + // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // Halve the total because we will halve the vilter values. int32x4_t correction = - vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + - (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + + (1 << (ROUND0_BITS - 1))) / + 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; diff --git a/third_party/aom/av1/common/arm/convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c index c29229eb09..132da2442b 100644 --- a/third_party/aom/av1/common/arm/convolve_neon_dotprod.c +++ b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c @@ -102,14 +102,12 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod( const int8x16_t filter = vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); - const int32_t correction_s32 = - vaddvq_s32(vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, FILTER_BITS)), - vpaddlq_s16(vshlq_n_s16(filter_8_15, FILTER_BITS)))); - // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right - // shift by FILTER_BITS - instead of a first rounding right shift by + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - // ROUND0_BITS. - int32x4_t correction = vdupq_n_s32(correction_s32 + (1 << (ROUND0_BITS - 1))); + int32x4_t correction = + vdupq_n_s32((128 << FILTER_BITS) + (1 << (ROUND0_BITS - 1))); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); @@ -274,16 +272,13 @@ void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, } const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - // Dot product constants. - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - // This shim of (1 << ((ROUND0_BITS - 1) - 1) enables us to use a single - // rounding right shift by FILTER_BITS - instead of a first rounding right - // shift by ROUND0_BITS, followed by second rounding right shift by - // FILTER_BITS - ROUND0_BITS. - // The outermost -1 is needed because we will halve the filter values. + // Dot product constants: + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. Halve the total because we will halve the filter values. const int32x4_t correction = - vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1))); + vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); if (w <= 4) { @@ -465,16 +460,13 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), vmovn_s16(x_filter_s16.val[1])); - // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - // - which are generally faster than rounding shifts on modern CPUs. + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Dot product constants. - const int32x4_t correct_tmp = - vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)), - vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7))); const int32x4_t correction = - vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const); + vdupq_n_s32((128 << FILTER_BITS) + horiz_const); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); @@ -621,16 +613,15 @@ static INLINE void convolve_2d_sr_horiz_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { const int bd = 8; - // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding - // shifts - which are generally faster than rounding shifts on modern CPUs. - // The outermost -1 is needed because we halved the filter values. - const int32_t horiz_const = - ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); // Dot product constants. const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); - const int32_t correction_s32 = - vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); - const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const); + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Halve the total because we will halve the filter values. + const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8_t *src_ptr = src; diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl index c0831330d1..6a0043c761 100644 --- a/third_party/aom/av1/common/av1_rtcd_defs.pl +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -458,7 +458,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats"; - specialize qw/av1_compute_stats sse4_1 avx2 neon/; + specialize qw/av1_compute_stats sse4_1 avx2 neon sve/; add_proto qw/void av1_calc_proj_params/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params"; specialize qw/av1_calc_proj_params sse4_1 avx2 neon/; add_proto qw/int64_t av1_lowbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; @@ -469,7 +469,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2 neon/; add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/; - add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; + add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/; } } @@ -554,6 +554,9 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/; } +add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col"; +specialize qw/resize_vert_dir avx2/; + add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/; diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c index 441323ab1f..2b48b9fff4 100644 --- a/third_party/aom/av1/common/resize.c +++ b/third_party/aom/av1/common/resize.c @@ -18,6 +18,7 @@ #include <string.h> #include "config/aom_config.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" @@ -216,10 +217,6 @@ const int16_t av1_resize_filter_normative[( // Filters for interpolation (full-band) - no filtering for integer pixels #define filteredinterp_filters1000 av1_resize_filter_normative -// Filters for factor of 2 downsampling. -static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; -static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; - static const InterpKernel *choose_interp_filter(int in_length, int out_length) { int out_length16 = out_length * 16; if (out_length16 >= in_length * 16) @@ -524,6 +521,59 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { } } +bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int width2, int start_col) { + bool mem_status = true; + uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height); + uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2); + if (arrbuf == NULL || arrbuf2 == NULL) { + mem_status = false; + goto Error; + } + + for (int i = start_col; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + down2_symeven(arrbuf, height, arrbuf2); + fill_arr_to_col(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(arrbuf); + aom_free(arrbuf2); + return mem_status; +} + +void resize_horz_dir(const uint8_t *const input, int in_stride, uint8_t *intbuf, + int height, int filtered_length, int width2) { + for (int i = 0; i < height; ++i) + down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i); +} + +bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride) { + uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(*intbuf) * width2 * height); + if (intbuf == NULL) { + return false; + } + + // Resize in the horizontal direction + resize_horz_dir(input, in_stride, intbuf, height, width, width2); + // Resize in the vertical direction + bool mem_status = resize_vert_dir(intbuf, output, out_stride, height, height2, + width2, 0 /*start_col*/); + aom_free(intbuf); + return mem_status; +} + +// Check if both the output width and height are half of input width and +// height respectively. +bool should_resize_by_half(int height, int width, int height2, int width2) { + const bool is_width_by_2 = get_down2_length(width, 1) == width2; + const bool is_height_by_2 = get_down2_length(height, 1) == height2; + return (is_width_by_2 && is_height_by_2); +} + bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h index d573a538bf..de71f5d539 100644 --- a/third_party/aom/av1/common/resize.h +++ b/third_party/aom/av1/common/resize.h @@ -20,6 +20,10 @@ extern "C" { #endif +// Filters for factor of 2 downsampling. +static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; +static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; + bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); @@ -93,6 +97,12 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom); void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, bool alloc_pyramid); +bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride); + +bool should_resize_by_half(int height, int width, int height2, int width2); + // Returns 1 if a superres upscaled frame is scaled and 0 otherwise. static INLINE int av1_superres_scaled(const AV1_COMMON *cm) { // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling diff --git a/third_party/aom/av1/common/x86/resize_avx2.c b/third_party/aom/av1/common/x86/resize_avx2.c new file mode 100644 index 0000000000..c44edb88d9 --- /dev/null +++ b/third_party/aom/av1/common/x86/resize_avx2.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> +#include <string.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/resize.h" + +#include "aom_dsp/x86/synonyms.h" + +#define CAST_HI(x) _mm256_castsi128_si256(x) +#define CAST_LOW(x) _mm256_castsi256_si128(x) + +#define PROCESS_RESIZE_Y_WD16 \ + const int idx1 = AOMMIN(height - 1, i + 5); \ + const int idx2 = AOMMIN(height - 1, i + 6); \ + l6 = l10; \ + l7 = l11; \ + l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride)); \ + l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride)); \ + \ + /* g0... g15 | i0... i15 */ \ + const __m256i s68 = \ + _mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20); \ + /* h0... h15 | j0... j15 */ \ + const __m256i s79 = \ + _mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20); \ + \ + /* g0h0... g7g7 | i0j0... i7j */ \ + s[3] = _mm256_unpacklo_epi8(s68, s79); \ + /* g8h8... g15g15 | i8j8... i15j15 */ \ + s[8] = _mm256_unpackhi_epi8(s68, s79); \ + \ + __m256i res_out[2] = { 0 }; \ + resize_y_convolve(s, coeffs_y, res_out); \ + \ + /* r00... r07 */ \ + __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ + /* r20... r27 */ \ + __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \ + \ + res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \ + res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ + \ + __m256i res_out_b[2] = { 0 }; \ + resize_y_convolve(s + 5, coeffs_y, res_out_b); \ + \ + /* r08... r015 */ \ + __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \ + /* r28... r215 */ \ + __m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \ + res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits); \ + res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits); \ + \ + /* r00... r03 r20... r23 | r04... r07 r24... r27 */ \ + __m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \ + /* r08... r012 r28... r212 | r013... r015 r213... r215 */ \ + __m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2); \ + /* r00... r07 | r20... r27 */ \ + res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8); \ + /* r08... r015 | r28... r215 */ \ + res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8); \ + /* r00... r015 | r20... r215 */ \ + res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1); \ + res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel); \ + res_8bit0 = _mm256_max_epu8(res_8bit0, zero); + +#define PROCESS_RESIZE_Y_WD8 \ + const int idx1 = AOMMIN(height - 1, i + 5); \ + const int idx2 = AOMMIN(height - 1, i + 6); \ + l6 = l10; \ + l7 = l11; \ + l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride)); \ + l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride)); \ + \ + /* g0h0... g7h7 */ \ + s67 = _mm_unpacklo_epi8(l6, l7); \ + /* i0j0...i7j7 */ \ + __m128i s89 = _mm_unpacklo_epi8(l8, l9); \ + \ + /* g0h0...g7g7 | i0j0...i7j7 */ \ + s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \ + \ + __m256i res_out[2] = { 0 }; \ + resize_y_convolve(s, coeffs_y, res_out); \ + \ + /* r00... r07 */ \ + __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ + /* r20...r27 */ \ + __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \ + res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \ + res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ + \ + /* r00...r03 r20...r23 | r04...r07 r24...r27 */ \ + res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \ + /* r00...r07 | r20...r27 */ \ + res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8); \ + res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1); \ + res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \ + res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero); + +static INLINE void resize_y_convolve(const __m256i *const s, + const __m256i *const coeffs, + __m256i *res_out) { + const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]); + const __m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]); + + const __m256i dst_0 = _mm256_add_epi16(res_0, res_1); + const __m256i dst_1 = _mm256_add_epi16(res_2, res_3); + // The sum of convolve operation crosses signed 16bit. Hence, the addition + // should happen in 32bit. + const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0)); + const __m256i dst_01 = + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1)); + const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1)); + const __m256i dst_11 = + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1)); + + res_out[0] = _mm256_add_epi32(dst_00, dst_10); + res_out[1] = _mm256_add_epi32(dst_01, dst_11); +} + +static INLINE void prepare_filter_coeffs(const int16_t *filter, + __m256i *const coeffs /* [4] */) { + // f0 f1 f2 f3 x x x x + const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter); + // f0 f1 f2 f3 f0 f1 f2 f3 + const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44); + // f0 f1 f2 f3 f1 f0 f3 f2 + const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1); + + const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1); + + // f0 f1 f0 f1 .. + coeffs[2] = _mm256_broadcastw_epi16(filter_8bit); + // f2 f3 f2 f3 .. + coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2)); + // f3 f2 f3 f2 .. + coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6)); + // f1 f0 f1 f0 .. + coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4)); +} + +bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, + int height, int height2, int stride, int start_col) { + assert(start_col <= stride); + // For the GM tool, the input layer height or width is assured to be an even + // number. Hence the function 'down2_symodd()' is not invoked and SIMD + // optimization of the same is not implemented. + // When the input height is less than 8 and even, the potential input + // heights are limited to 2, 4, or 6. These scenarios require seperate + // handling due to padding requirements. Invoking the C function here will + // eliminate the need for conditional statements within the subsequent SIMD + // code to manage these cases. + if (height & 1 || height < 8) { + return resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, start_col); + } + + __m256i s[10], coeffs_y[4]; + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const uint8_t max_pixel = 255; + const __m256i clip_pixel = _mm256_set1_epi8(max_pixel); + const __m256i zero = _mm256_setzero_si256(); + + prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y); + + const int num_col16 = stride / 16; + int remain_col = stride % 16; + // The core vertical SIMD processes 4 input rows simultaneously to generate + // output corresponding to 2 rows. To streamline the core loop and eliminate + // the need for conditional checks, the remaining rows (4 or 6) are processed + // separately. + const int remain_row = (height % 4 == 0) ? 4 : 6; + + for (int j = start_col; j < stride - remain_col; j += 16) { + const uint8_t *data = &intbuf[j]; + const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride)); + // Padding top 3 rows with the last available row at the top. + const __m128i l0 = l3; + const __m128i l1 = l3; + const __m128i l2 = l3; + const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride)); + + __m128i l6, l7, l8, l9; + __m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride)); + __m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride)); + __m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride)); + + // a0...a15 | c0...c15 + const __m256i s02 = + _mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20); + // b0...b15 | d0...d15 + const __m256i s13 = + _mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20); + // c0...c15 | e0...e15 + const __m256i s24 = + _mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20); + // d0...d15 | f0...f15 + const __m256i s35 = + _mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20); + // e0...e15 | g0...g15 + const __m256i s46 = + _mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20); + // f0...f15 | h0...h15 + const __m256i s57 = + _mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20); + + // a0b0...a7b7 | c0d0...c7d7 + s[0] = _mm256_unpacklo_epi8(s02, s13); + // c0d0...c7d7 | e0f0...e7f7 + s[1] = _mm256_unpacklo_epi8(s24, s35); + // e0f0...e7f7 | g0h0...g7h7 + s[2] = _mm256_unpacklo_epi8(s46, s57); + + // a8b8...a15b15 | c8d8...c15d15 + s[5] = _mm256_unpackhi_epi8(s02, s13); + // c8d8...c15d15 | e8f8...e15f15 + s[6] = _mm256_unpackhi_epi8(s24, s35); + // e8f8...e15f15 | g8h8...g15h15 + s[7] = _mm256_unpackhi_epi8(s46, s57); + + // height to be processed here + const int process_ht = height - remain_row; + for (int i = 0; i < process_ht; i += 4) { + PROCESS_RESIZE_Y_WD16 + + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + CAST_LOW(res_8bit0)); + + _mm_storeu_si128( + (__m128i *)&output[(i / 2) * out_stride + j + out_stride], + _mm256_extracti128_si256(res_8bit0, 1)); + + // Load the required data for processing of next 4 input rows. + const int idx7 = AOMMIN(height - 1, i + 7); + const int idx8 = AOMMIN(height - 1, i + 8); + l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride)); + l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride)); + + const __m256i s810 = + _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); + const __m256i s911 = + _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_unpacklo_epi8(s810, s911); + // i8j8... i15j15 | k8l8... k15l15 + s[9] = _mm256_unpackhi_epi8(s810, s911); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + + s[5] = s[7]; + s[6] = s[8]; + s[7] = s[9]; + } + + // Process the remaining last 4 or 6 rows here. + int i = process_ht; + while (i < height - 1) { + PROCESS_RESIZE_Y_WD16 + + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + CAST_LOW(res_8bit0)); + i += 2; + + const int is_store_valid = (i < height - 1); + if (is_store_valid) + _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], + _mm256_extracti128_si256(res_8bit0, 1)); + i += 2; + + // Check if there is any remaining height to process. If so, perform the + // necessary data loading for processing the next row. + if (i < height - 1) { + l10 = l11 = l9; + const __m256i s810 = + _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); + const __m256i s911 = + _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_unpacklo_epi8(s810, s911); + // i8j8... i15j15 | k8l8... k15l15 + s[9] = _mm256_unpackhi_epi8(s810, s911); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + + s[5] = s[7]; + s[6] = s[8]; + s[7] = s[9]; + } + } + } + + if (remain_col > 7) { + const int processed_wd = num_col16 * 16; + remain_col = stride % 8; + + const uint8_t *data = &intbuf[processed_wd]; + + const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride)); + // Padding top 3 rows with available top-most row. + const __m128i l0 = l3; + const __m128i l1 = l3; + const __m128i l2 = l3; + const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride)); + + __m128i l6, l7, l8, l9; + __m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride)); + __m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride)); + __m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride)); + + // a0b0...a7b7 + const __m128i s01 = _mm_unpacklo_epi8(l0, l1); + // c0d0...c7d7 + const __m128i s23 = _mm_unpacklo_epi8(l2, l3); + // e0f0...e7f7 + const __m128i s45 = _mm_unpacklo_epi8(l4, l5); + // g0h0...g7h7 + __m128i s67 = _mm_unpacklo_epi8(l10, l11); + + // a0b0...a7b7 | c0d0...c7d7 + s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20); + // c0d0...c7d7 | e0f0...e7f7 + s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20); + // e0f0...e7f7 | g0h0...g7h7 + s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20); + + // height to be processed here + const int process_ht = height - remain_row; + for (int i = 0; i < process_ht; i += 4) { + PROCESS_RESIZE_Y_WD8 + + _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd], + CAST_LOW(res_a_round_1)); + + _mm_storel_epi64( + (__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride], + _mm256_extracti128_si256(res_a_round_1, 1)); + + const int idx7 = AOMMIN(height - 1, i + 7); + const int idx8 = AOMMIN(height - 1, i + 8); + l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride)); + l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride)); + + // k0l0... k7l7 + const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + } + + // Process the remaining last 4 or 6 rows here. + int i = process_ht; + while (i < height - 1) { + PROCESS_RESIZE_Y_WD8 + + _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd], + CAST_LOW(res_a_round_1)); + + i += 2; + + const int is_store_valid = (i < height - 1); + if (is_store_valid) + _mm_storel_epi64( + (__m128i *)&output[(i / 2) * out_stride + processed_wd], + _mm256_extracti128_si256(res_a_round_1, 1)); + i += 2; + + // Check rows are still remaining for processing. If yes do the required + // load of data for the next iteration. + if (i < height - 1) { + l10 = l11 = l9; + // k0l0... k7l7 + const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); + // i0j0... i7j7 | k0l0... k7l7 + s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20); + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + } + } + } + + if (remain_col) + return resize_vert_dir_c(intbuf, output, out_stride, height, height2, + stride, stride - remain_col); + + return true; +} diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c index 47b5f5cfb7..8b0d3bcc7e 100644 --- a/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c +++ b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c @@ -1008,10 +1008,13 @@ static uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride, } void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { + (void)dgd_avg; + (void)src_avg; assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED); const int wiener_halfwin = wiener_win >> 1; diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_sve.c b/third_party/aom/av1/encoder/arm/neon/pickrst_sve.c new file mode 100644 index 0000000000..a519ecc5f5 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/pickrst_sve.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <arm_sve.h> +#include <string.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +static INLINE uint8_t find_average_sve(const uint8_t *src, int src_stride, + int width, int height) { + uint32x4_t avg_u32 = vdupq_n_u32(0); + uint8x16_t ones = vdupq_n_u8(1); + + // Use a predicate to compute the last columns. + svbool_t pattern = svwhilelt_b8_u32(0, width % 16); + + int h = height; + do { + int j = width; + const uint8_t *src_ptr = src; + while (j >= 16) { + uint8x16_t s = vld1q_u8(src_ptr); + avg_u32 = vdotq_u32(avg_u32, s, ones); + + j -= 16; + src_ptr += 16; + } + uint8x16_t s_end = svget_neonq_u8(svld1_u8(pattern, src_ptr)); + avg_u32 = vdotq_u32(avg_u32, s_end, ones); + + src += src_stride; + } while (--h != 0); + return (uint8_t)(vaddlvq_u32(avg_u32) / (width * height)); +} + +static INLINE void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg, + int16_t *buf_avg, int buf_avg_stride, + int width, int height, + int downsample_factor) { + uint8x8_t avg_u8 = vdup_n_u8(avg); + + // Use a predicate to compute the last columns. + svbool_t pattern = svwhilelt_b8_u32(0, width % 8); + + uint8x8_t avg_end = vget_low_u8(svget_neonq_u8(svdup_n_u8_z(pattern, avg))); + + do { + int j = width; + const uint8_t *buf_ptr = buf; + int16_t *buf_avg_ptr = buf_avg; + while (j >= 8) { + uint8x8_t d = vld1_u8(buf_ptr); + vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8))); + + j -= 8; + buf_ptr += 8; + buf_avg_ptr += 8; + } + uint8x8_t d_end = vget_low_u8(svget_neonq_u8(svld1_u8(pattern, buf_ptr))); + vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d_end, avg_end))); + + buf += buf_stride; + buf_avg += buf_avg_stride; + height -= downsample_factor; + } while (height > 0); +} + +static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp, + const int wiener_win2, const int scale) { + for (int i = 0; i < wiener_win2 - 2; i = i + 2) { + // Transpose the first 2x2 square. It needs a special case as the element + // of the bottom left is on the diagonal. + int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1); + int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1); + + int64x2_t tr_row = aom_vtrn2q_s64(row0, row1); + + vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0)); + vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row); + + // Transpose and store all the remaining 2x2 squares of the line. + for (int j = i + 3; j < wiener_win2; j = j + 2) { + row0 = vld1q_s64(H_tmp + i * wiener_win2 + j); + row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j); + + int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1); + int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1); + + vst1q_s64(H_tmp + j * wiener_win2 + i, tr_row0); + vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1); + } + } + for (int i = 0; i < wiener_win2 * wiener_win2; i++) { + H[i] += H_tmp[i] * scale; + } +} + +// Transpose the matrix that has just been computed and accumulate it in M. +static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn, + const int wiener_win, int scale) { + for (int i = 0; i < wiener_win; ++i) { + for (int j = 0; j < wiener_win; ++j) { + int tr_idx = j * wiener_win + i; + *M++ += (int64_t)(M_trn[tr_idx] * scale); + } + } +} + +// Swap each half of the dgd vectors so that we can accumulate the result of +// the dot-products directly in the destination matrix. +static INLINE int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) { + int16x8_t dgd_trn0 = vreinterpretq_s16_s64( + vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); + int16x8_t dgd_trn1 = vreinterpretq_s16_s64( + vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); + + return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 }; +} + +static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5], + int64_t *M, int row) { + const int wiener_win = 5; + + int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); + int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); + + int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); + cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 0, cross_corr01); + + int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); + + int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); + cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 2, cross_corr23); + + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]); + M[row * wiener_win + 4] += vaddvq_s64(m4); +} + +static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7], + int64_t *M, int row) { + const int wiener_win = 7; + + int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); + int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); + + int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); + cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 0, cross_corr01); + + int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); + + int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); + cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 2, cross_corr23); + + int64x2_t m45 = vld1q_s64(M + row * wiener_win + 4); + int16x8x2_t dgd45 = transpose_dgd(dgd[4], dgd[5]); + + int64x2_t cross_corr45 = aom_svdot_lane_s16(m45, dgd45.val[0], src, 0); + cross_corr45 = aom_svdot_lane_s16(cross_corr45, dgd45.val[1], src, 1); + vst1q_s64(M + row * wiener_win + 4, cross_corr45); + + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]); + M[row * wiener_win + 6] += vaddvq_s64(m6); +} + +static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, + const int wiener_win, + const int wiener_win2) { + for (int row0 = 0; row0 < wiener_win; row0++) { + for (int row1 = row0; row1 < wiener_win; row1++) { + int auto_cov_idx = + (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; + + int64x2_t auto_cov = aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]); + H[auto_cov_idx] += vaddvq_s64(auto_cov); + } + } +} + +static INLINE void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1, + int row0, int row1, int64_t *H) { + for (int col0 = 0; col0 < 5; col0++) { + int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5); + + int64x2_t h01 = vld1q_s64(H + auto_cov_idx); + int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); + + int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); + auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx, auto_cov01); + + int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); + + int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); + auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 2, auto_cov23); + + int64x2_t auto_cov4 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]); + H[auto_cov_idx + 4] += vaddvq_s64(auto_cov4); + } +} + +static INLINE void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1, + int row0, int row1, int64_t *H) { + for (int col0 = 0; col0 < 7; col0++) { + int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7); + + int64x2_t h01 = vld1q_s64(H + auto_cov_idx); + int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); + + int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); + auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx, auto_cov01); + + int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); + int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); + + int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); + auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 2, auto_cov23); + + int64x2_t h45 = vld1q_s64(H + auto_cov_idx + 4); + int16x8x2_t dgd45 = transpose_dgd(dgd1[4], dgd1[5]); + + int64x2_t auto_cov45 = aom_svdot_lane_s16(h45, dgd45.val[0], dgd0[col0], 0); + auto_cov45 = aom_svdot_lane_s16(auto_cov45, dgd45.val[1], dgd0[col0], 1); + vst1q_s64(H + auto_cov_idx + 4, auto_cov45); + + int64x2_t auto_cov6 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]); + H[auto_cov_idx + 6] += vaddvq_s64(auto_cov6); + } +} + +// This function computes two matrices: the cross-correlation between the src +// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). +// +// M is of size 7 * 7. It needs to be filled such that multiplying one element +// from src with each element of a row of the wiener window will fill one +// column of M. However this is not very convenient in terms of memory +// accesses, as it means we do contiguous loads of dgd but strided stores to M. +// As a result, we use an intermediate matrix M_trn which is instead filled +// such that one row of the wiener window gives one row of M_trn. Once fully +// computed, M_trn is then transposed to return M. +// +// H is of size 49 * 49. It is filled by multiplying every pair of elements of +// the wiener window together. Since it is a symmetric matrix, we only compute +// the upper triangle, and then copy it down to the lower one. Here we fill it +// by taking each different pair of columns, and multiplying all the elements of +// the first one with all the elements of the second one, with a special case +// when multiplying a column by itself. +static INLINE void compute_stats_win7_sve(int16_t *dgd_avg, int dgd_avg_stride, + int16_t *src_avg, int src_avg_stride, + int width, int height, int64_t *M, + int64_t *H, int downsample_factor) { + const int wiener_win = 7; + const int wiener_win2 = wiener_win * wiener_win; + + // Use a predicate to compute the last columns of the block for H. + svbool_t pattern = svwhilelt_b16_u32(0, width % 8); + + // Use intermediate matrices for H and M to perform the computation, they + // will be accumulated into the original H and M at the end. + int64_t M_trn[49]; + memset(M_trn, 0, sizeof(M_trn)); + + int64_t H_tmp[49 * 49]; + memset(H_tmp, 0, sizeof(H_tmp)); + + do { + // Cross-correlation (M). + for (int row = 0; row < wiener_win; row++) { + int j = 0; + while (j < width) { + int16x8_t dgd[7]; + load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1], + &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]); + int16x8_t s = vld1q_s16(src_avg + j); + + // Compute all the elements of one row of M. + compute_M_one_row_win7(s, dgd, M_trn, row); + + j += 8; + } + } + + // Auto-covariance (H). + int j = 0; + while (j <= width - 8) { + for (int col0 = 0; col0 < wiener_win; col0++) { + int16x8_t dgd0[7]; + load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1], + &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]); + + // Perform computation of the first column with itself (28 elements). + // For the first column this will fill the upper triangle of the 7x7 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 7x7 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[7]; + load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]); + + // Compute all elements from the combination of both columns (49 + // elements). + compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp); + } + } + j += 8; + } + + if (j < width) { + // Process remaining columns using a predicate to discard excess elements. + for (int col0 = 0; col0 < wiener_win; col0++) { + // Load first column. + int16x8_t dgd0[7]; + dgd0[0] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0)); + dgd0[1] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0)); + dgd0[2] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0)); + dgd0[3] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0)); + dgd0[4] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0)); + dgd0[5] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0)); + dgd0[6] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0)); + + // Perform computation of the first column with itself (28 elements). + // For the first column this will fill the upper triangle of the 7x7 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 7x7 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[7]; + load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]); + + // Compute all elements from the combination of both columns (49 + // elements). + compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp); + } + } + } + dgd_avg += downsample_factor * dgd_avg_stride; + src_avg += src_avg_stride; + } while (--height != 0); + + // Transpose M_trn. + acc_transpose_M(M, M_trn, 7, downsample_factor); + + // Copy upper triangle of H in the lower one. + copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor); +} + +// This function computes two matrices: the cross-correlation between the src +// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). +// +// M is of size 5 * 5. It needs to be filled such that multiplying one element +// from src with each element of a row of the wiener window will fill one +// column of M. However this is not very convenient in terms of memory +// accesses, as it means we do contiguous loads of dgd but strided stores to M. +// As a result, we use an intermediate matrix M_trn which is instead filled +// such that one row of the wiener window gives one row of M_trn. Once fully +// computed, M_trn is then transposed to return M. +// +// H is of size 25 * 25. It is filled by multiplying every pair of elements of +// the wiener window together. Since it is a symmetric matrix, we only compute +// the upper triangle, and then copy it down to the lower one. Here we fill it +// by taking each different pair of columns, and multiplying all the elements of +// the first one with all the elements of the second one, with a special case +// when multiplying a column by itself. +static INLINE void compute_stats_win5_sve(int16_t *dgd_avg, int dgd_avg_stride, + int16_t *src_avg, int src_avg_stride, + int width, int height, int64_t *M, + int64_t *H, int downsample_factor) { + const int wiener_win = 5; + const int wiener_win2 = wiener_win * wiener_win; + + // Use a predicate to compute the last columns of the block for H. + svbool_t pattern = svwhilelt_b16_u32(0, width % 8); + + // Use intermediate matrices for H and M to perform the computation, they + // will be accumulated into the original H and M at the end. + int64_t M_trn[25]; + memset(M_trn, 0, sizeof(M_trn)); + + int64_t H_tmp[25 * 25]; + memset(H_tmp, 0, sizeof(H_tmp)); + + do { + // Cross-correlation (M). + for (int row = 0; row < wiener_win; row++) { + int j = 0; + while (j < width) { + int16x8_t dgd[5]; + load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1], + &dgd[2], &dgd[3], &dgd[4]); + int16x8_t s = vld1q_s16(src_avg + j); + + // Compute all the elements of one row of M. + compute_M_one_row_win5(s, dgd, M_trn, row); + + j += 8; + } + } + + // Auto-covariance (H). + int j = 0; + while (j <= width - 8) { + for (int col0 = 0; col0 < wiener_win; col0++) { + // Load first column. + int16x8_t dgd0[5]; + load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1], + &dgd0[2], &dgd0[3], &dgd0[4]); + + // Perform computation of the first column with itself (15 elements). + // For the first column this will fill the upper triangle of the 5x5 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 5x5 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[5]; + load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4]); + + // Compute all elements from the combination of both columns (25 + // elements). + compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp); + } + } + j += 8; + } + + // Process remaining columns using a predicate to discard excess elements. + if (j < width) { + for (int col0 = 0; col0 < wiener_win; col0++) { + int16x8_t dgd0[5]; + dgd0[0] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0)); + dgd0[1] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0)); + dgd0[2] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0)); + dgd0[3] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0)); + dgd0[4] = svget_neonq_s16( + svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0)); + + // Perform computation of the first column with itself (15 elements). + // For the first column this will fill the upper triangle of the 5x5 + // matrix at the top left of the H matrix. For the next columns this + // will fill the upper triangle of the other 5x5 matrices around H's + // diagonal. + compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); + + // All computation next to the matrix diagonal has already been done. + for (int col1 = col0 + 1; col1 < wiener_win; col1++) { + // Load second column and scale based on downsampling factor. + int16x8_t dgd1[5]; + load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], + &dgd1[2], &dgd1[3], &dgd1[4]); + + // Compute all elements from the combination of both columns (25 + // elements). + compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp); + } + } + } + dgd_avg += downsample_factor * dgd_avg_stride; + src_avg += src_avg_stride; + } while (--height != 0); + + // Transpose M_trn. + acc_transpose_M(M, M_trn, 5, downsample_factor); + + // Copy upper triangle of H in the lower one. + copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor); +} + +void av1_compute_stats_sve(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA); + + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = wiener_win >> 1; + const int32_t width = h_end - h_start; + const int32_t height = v_end - v_start; + const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start]; + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + memset(M, 0, sizeof(*M) * wiener_win * wiener_win); + + const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height); + const int downsample_factor = + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + + // dgd_avg and src_avg have been memset to zero before calling this + // function, so round up the stride to the next multiple of 8 so that we + // don't have to worry about a tail loop when computing M. + const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8; + const int src_avg_stride = (width & ~7) + 8; + + // Compute (dgd - avg) and store it in dgd_avg. + // The wiener window will slide along the dgd frame, centered on each pixel. + // For the top left pixel and all the pixels on the side of the frame this + // means half of the window will be outside of the frame. As such the actual + // buffer that we need to subtract the avg from will be 2 * wiener_halfwin + // wider and 2 * wiener_halfwin higher than the original dgd buffer. + const int vert_offset = v_start - wiener_halfwin; + const int horiz_offset = h_start - wiener_halfwin; + const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; + compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride, + width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1); + + // Compute (src - avg), downsample if necessary and store in src-avg. + const uint8_t *src_start = src + h_start + v_start * src_stride; + compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg, + src_avg_stride, width, height, downsample_factor); + + const int downsample_height = height / downsample_factor; + + // Since the height is not necessarily a multiple of the downsample factor, + // the last line of src will be scaled according to how many rows remain. + const int downsample_remainder = height % downsample_factor; + + if (wiener_win == WIENER_WIN) { + compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, + width, downsample_height, M, H, downsample_factor); + } else { + compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, + width, downsample_height, M, H, downsample_factor); + } + + if (downsample_remainder > 0) { + const int remainder_offset = height - downsample_remainder; + if (wiener_win == WIENER_WIN) { + compute_stats_win7_sve( + dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride, + src_avg + downsample_height * src_avg_stride, src_avg_stride, width, + 1, M, H, downsample_remainder); + } else { + compute_stats_win5_sve( + dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride, + src_avg + downsample_height * src_avg_stride, src_avg_stride, width, + 1, M, H, downsample_remainder); + } + } +} diff --git a/third_party/aom/av1/encoder/enc_enums.h b/third_party/aom/av1/encoder/enc_enums.h index 20cefa16a5..0a8b0f258a 100644 --- a/third_party/aom/av1/encoder/enc_enums.h +++ b/third_party/aom/av1/encoder/enc_enums.h @@ -12,10 +12,14 @@ #ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_ #define AOM_AV1_ENCODER_ENC_ENUMS_H_ +#include "aom_ports/mem.h" + #ifdef __cplusplus extern "C" { #endif +#define MAX_NUM_THREADS 64 + // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code. enum { diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c index a9214f77c2..07382eb6cc 100644 --- a/third_party/aom/av1/encoder/encodeframe.c +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -537,7 +537,9 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, // Set the partition if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || (sf->rt_sf.use_fast_fixed_part && x->sb_force_fixed_part == 1 && - !frame_is_intra_only(cm))) { + (!frame_is_intra_only(cm) && + (!cpi->ppi->use_svc || + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)))) { // set a fixed-size partition av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size; diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h index 4de5d426ce..a919bd906a 100644 --- a/third_party/aom/av1/encoder/encoder.h +++ b/third_party/aom/av1/encoder/encoder.h @@ -37,6 +37,7 @@ #include "av1/encoder/av1_quantize.h" #include "av1/encoder/block.h" #include "av1/encoder/context_tree.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" @@ -74,7 +75,6 @@ #endif #include "aom/internal/aom_codec_internal.h" -#include "aom_util/aom_thread.h" #ifdef __cplusplus extern "C" { diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c index 755535ba51..1d0092a5ed 100644 --- a/third_party/aom/av1/encoder/ethread.c +++ b/third_party/aom/av1/encoder/ethread.c @@ -19,6 +19,7 @@ #include "av1/encoder/allintra_vis.h" #include "av1/encoder/bitstream.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encoder.h" @@ -2520,7 +2521,7 @@ void av1_tf_do_filtering_mt(AV1_COMP *cpi) { static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx, int cur_dir) { GlobalMotionInfo *gm_info = &cpi->gm_info; - JobInfo *job_info = &cpi->mt_info.gm_sync.job_info; + GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; int total_refs = gm_info->num_ref_frames[cur_dir]; int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir]; @@ -2551,7 +2552,7 @@ static int gm_mt_worker_hook(void *arg1, void *unused) { AV1_COMP *cpi = thread_data->cpi; GlobalMotionInfo *gm_info = &cpi->gm_info; AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync; - JobInfo *job_info = &gm_sync->job_info; + GlobalMotionJobInfo *job_info = &gm_sync->job_info; int thread_id = thread_data->thread_id; GlobalMotionData *gm_thread_data = &thread_data->td->gm_data; #if CONFIG_MULTITHREAD @@ -2689,7 +2690,7 @@ static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) { // Implements multi-threading for global motion. void av1_global_motion_estimation_mt(AV1_COMP *cpi) { - JobInfo *job_info = &cpi->mt_info.gm_sync.job_info; + GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; av1_zero(*job_info); diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h index de46a0e1f2..2645f93e3c 100644 --- a/third_party/aom/av1/encoder/global_motion.h +++ b/third_party/aom/av1/encoder/global_motion.h @@ -14,9 +14,8 @@ #include "aom/aom_integer.h" #include "aom_dsp/flow_estimation/flow_estimation.h" -#include "aom_scale/yv12config.h" #include "aom_util/aom_pthread.h" -#include "aom_util/aom_thread.h" +#include "av1/encoder/enc_enums.h" #ifdef __cplusplus extern "C" { @@ -58,11 +57,11 @@ typedef struct { // next_frame_to_process[i] will hold the count of next reference frame to be // processed in the direction 'i'. int8_t next_frame_to_process[MAX_DIRECTIONS]; -} JobInfo; +} GlobalMotionJobInfo; typedef struct { // Data related to assigning jobs for global motion multi-threading. - JobInfo job_info; + GlobalMotionJobInfo job_info; #if CONFIG_MULTITHREAD // Mutex lock used while dispatching jobs. diff --git a/third_party/aom/av1/encoder/nonrd_pickmode.c b/third_party/aom/av1/encoder/nonrd_pickmode.c index 57c74f66d5..08ecb8495a 100644 --- a/third_party/aom/av1/encoder/nonrd_pickmode.c +++ b/third_party/aom/av1/encoder/nonrd_pickmode.c @@ -1886,14 +1886,17 @@ static AOM_INLINE int skip_mode_by_low_temp( static AOM_INLINE int skip_mode_by_bsize_and_ref_frame( PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize, - int extra_prune, unsigned int sse_zeromv_norm, int more_prune) { + int extra_prune, unsigned int sse_zeromv_norm, int more_prune, + int skip_nearmv) { const unsigned int thresh_skip_golden = 500; if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden && mode == NEWMV) return 1; - if (bsize == BLOCK_128X128 && mode == NEWMV) return 1; + if ((bsize == BLOCK_128X128 && mode == NEWMV) || + (skip_nearmv && mode == NEARMV)) + return 1; // Skip testing non-LAST if this flag is set. if (extra_prune) { @@ -2361,6 +2364,18 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( (*this_mode != GLOBALMV || *ref_frame != LAST_FRAME)) return true; + // Skip the mode if use reference frame mask flag is not set. + if (!search_state->use_ref_frame_mask[*ref_frame]) return true; + + // Skip mode for some modes and reference frames when + // force_zeromv_skip_for_blk flag is true. + if (x->force_zeromv_skip_for_blk && + ((!(*this_mode == NEARESTMV && + search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && + *this_mode != GLOBALMV) || + *ref_frame != LAST_FRAME)) + return true; + if (x->sb_me_block && *ref_frame == LAST_FRAME) { // We want to make sure to test the superblock MV: // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they @@ -2400,18 +2415,6 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( mi->ref_frame[0] = *ref_frame; mi->ref_frame[1] = *ref_frame2; - // Skip the mode if use reference frame mask flag is not set. - if (!search_state->use_ref_frame_mask[*ref_frame]) return true; - - // Skip mode for some modes and reference frames when - // force_zeromv_skip_for_blk flag is true. - if (x->force_zeromv_skip_for_blk && - ((!(*this_mode == NEARESTMV && - search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && - *this_mode != GLOBALMV) || - *ref_frame != LAST_FRAME)) - return true; - // Skip compound mode based on variance of previously evaluated single // reference modes. if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred && @@ -2478,7 +2481,8 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( // properties. if (skip_mode_by_bsize_and_ref_frame( *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search, - sse_zeromv_norm, rt_sf->nonrd_aggressive_skip)) + sse_zeromv_norm, rt_sf->nonrd_aggressive_skip, + rt_sf->increase_source_sad_thresh)) return true; // Skip mode based on low temporal variance and souce sad. diff --git a/third_party/aom/av1/encoder/partition_search.c b/third_party/aom/av1/encoder/partition_search.c index 61d49a23f2..30ea7d9140 100644 --- a/third_party/aom/av1/encoder/partition_search.c +++ b/third_party/aom/av1/encoder/partition_search.c @@ -2323,8 +2323,9 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, } if (cpi->sf.rt_sf.skip_cdef_sb) { // cdef_strength is initialized to 1 which means skip_cdef, and is updated - // here. Check to see is skipping cdef is allowed. - // Always allow cdef_skip for seg_skip = 1. + // here. Check to see is skipping cdef is allowed. Never skip on slide/scene + // change, near a key frame, or when color sensitivity is set. Always allow + // cdef_skip for seg_skip = 1. const int allow_cdef_skipping = seg_skip || (cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad && @@ -2338,8 +2339,16 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, MB_MODE_INFO **mi_sb = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb); - // Do not skip if intra or new mv is picked, or color sensitivity is set. - // Never skip on slide/scene change. + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + unsigned int thresh_spatial_var = + (cpi->oxcf.speed >= 11 && !is_720p_or_larger && + cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) + ? 400 + : UINT_MAX; + // For skip_cdef_sb = 1: do not skip if allow_cdef_skipping is false or + // intra or new mv is picked, with possible conidition on spatial variance. + // For skip_cdef_sb >= 2: more aggressive mode to always skip unless + // allow_cdef_skipping is false and source_variance is non-zero. if (cpi->sf.rt_sf.skip_cdef_sb >= 2) { mi_sb[0]->cdef_strength = mi_sb[0]->cdef_strength && @@ -2347,7 +2356,8 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, } else { mi_sb[0]->cdef_strength = mi_sb[0]->cdef_strength && allow_cdef_skipping && - !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV); + !(x->source_variance < thresh_spatial_var && + (mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV)); } // Store in the pickmode context. ctx->mic.cdef_strength = mi_sb[0]->cdef_strength; diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c index a504535028..ce0357163d 100644 --- a/third_party/aom/av1/encoder/picklpf.c +++ b/third_party/aom/av1/encoder/picklpf.c @@ -257,6 +257,8 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, inter_frame_multiplier = inter_frame_multiplier << 1; else if (cpi->rc.frame_source_sad > 50000) inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1); + } else if (cpi->sf.rt_sf.use_fast_fixed_part) { + inter_frame_multiplier = inter_frame_multiplier << 1; } // These values were determined by linear fitting the result of the // searched level for 8 bit depth: diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c index b0d0d0bb78..a431c4dada 100644 --- a/third_party/aom/av1/encoder/pickrst.c +++ b/third_party/aom/av1/encoder/pickrst.c @@ -1044,10 +1044,13 @@ void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, #if CONFIG_AV1_HIGHBITDEPTH void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { + (void)dgd_avg; + (void)src_avg; int i, j, k, l; int32_t Y[WIENER_WIN2]; const int wiener_win2 = wiener_win * wiener_win; @@ -1659,9 +1662,10 @@ static AOM_INLINE void search_wiener( // functions. Optimize intrinsics of HBD design similar to LBD (i.e., // pre-calculate d and s buffers and avoid most of the C operations). av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer, - rsc->src_buffer, limits->h_start, limits->h_end, - limits->v_start, limits->v_end, rsc->dgd_stride, - rsc->src_stride, M, H, cm->seq_params->bit_depth); + rsc->src_buffer, rsc->dgd_avg, rsc->src_avg, + limits->h_start, limits->h_end, limits->v_start, + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, + H, cm->seq_params->bit_depth); } else { av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, rsc->dgd_avg, rsc->src_avg, limits->h_start, @@ -2081,10 +2085,9 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { // and height aligned to multiple of 16 is considered for intrinsic purpose. rsc.dgd_avg = NULL; rsc.src_avg = NULL; -#if HAVE_AVX2 || HAVE_NEON - // The buffers allocated below are used during Wiener filter processing of low - // bitdepth path. Hence, allocate the same when Wiener filter is enabled in - // low bitdepth path. +#if HAVE_AVX2 + // The buffers allocated below are used during Wiener filter processing. + // Hence, allocate the same when Wiener filter is enabled. if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; @@ -2221,7 +2224,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { best_luma_unit_size); } -#if HAVE_AVX || HAVE_NEON +#if HAVE_AVX2 if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { aom_free(cpi->pick_lr_ctxt.dgd_avg); cpi->pick_lr_ctxt.dgd_avg = NULL; diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c index 256b6fc9eb..9a00042520 100644 --- a/third_party/aom/av1/encoder/speed_features.c +++ b/third_party/aom/av1/encoder/speed_features.c @@ -1461,7 +1461,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, // for resolutions below 720p. if (speed >= 11 && !is_720p_or_larger && cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { - sf->rt_sf.skip_cdef_sb = 2; + sf->rt_sf.skip_cdef_sb = 1; sf->rt_sf.force_only_last_ref = 1; sf->rt_sf.selective_cdf_update = 1; sf->rt_sf.use_nonrd_filter_search = 0; diff --git a/third_party/aom/av1/encoder/tune_vmaf.c b/third_party/aom/av1/encoder/tune_vmaf.c index 91db3db726..fdb7c77ebc 100644 --- a/third_party/aom/av1/encoder/tune_vmaf.c +++ b/third_party/aom/av1/encoder/tune_vmaf.c @@ -247,7 +247,9 @@ static AOM_INLINE void unsharp(const AV1_COMP *const cpi, // 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128, // all co-efficients must be even. -DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52, +// The array is of size 9 to allow passing gauss_filter + 1 to +// _mm_loadu_si128() in prepare_coeffs_6t(). +DECLARE_ALIGNED(16, static const int16_t, gauss_filter[9]) = { 0, 8, 30, 52, 30, 8, 0, 0 }; static AOM_INLINE void gaussian_blur(const int bit_depth, const YV12_BUFFER_CONFIG *source, diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c index 6658ed39a8..1f76576c9e 100644 --- a/third_party/aom/av1/encoder/x86/pickrst_avx2.c +++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c @@ -345,21 +345,27 @@ static INLINE void compute_stats_highbd_win5_opt_avx2( } void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else if (wiener_win == WIENER_WIN_CHROMA) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M, H, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c index 50db305802..3617d33fef 100644 --- a/third_party/aom/av1/encoder/x86/pickrst_sse4.c +++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c @@ -524,21 +524,27 @@ static INLINE void compute_stats_highbd_win5_opt_sse4_1( } void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, - int h_end, int v_start, int v_end, - int dgd_stride, int src_stride, int64_t *M, - int64_t *H, aom_bit_depth_t bit_depth) { + const uint8_t *src8, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else if (wiener_win == WIENER_WIN_CHROMA) { + (void)dgd_avg; + (void)src_avg; compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M, H, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/test/aom_image_test.cc b/third_party/aom/test/aom_image_test.cc index 03f4373f35..0dfb912215 100644 --- a/third_party/aom/test/aom_image_test.cc +++ b/third_party/aom/test/aom_image_test.cc @@ -9,6 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <climits> + #include "aom/aom_image.h" #include "third_party/googletest/src/googletest/include/gtest/gtest.h" @@ -70,3 +72,66 @@ TEST(AomImageTest, AomImgAllocNv12) { EXPECT_EQ(img.planes[AOM_PLANE_V], nullptr); aom_img_free(&img); } + +TEST(AomImageTest, AomImgAllocHugeWidth) { + // The stride (0x80000000 * 2) would overflow unsigned int. + aom_image_t *image = + aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + // The stride (0x80000000) would overflow int. + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + // The aligned width (UINT_MAX + 1) would overflow unsigned int. + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, UINT_MAX, 1, 1); + ASSERT_EQ(image, nullptr); + + image = aom_img_alloc_with_border(nullptr, AOM_IMG_FMT_I422, 1, INT_MAX, 1, + 0x40000000, 0); + if (image) { + uint16_t *y_plane = + reinterpret_cast<uint16_t *>(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x7ffffffe, 1, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_NV12, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_YV12, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 65536, 2, 1); + if (image) { + uint16_t *y_plane = + reinterpret_cast<uint16_t *>(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 285245883, 2, 1); + if (image) { + uint16_t *y_plane = + reinterpret_cast<uint16_t *>(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } +} diff --git a/third_party/aom/test/disflow_test.cc b/third_party/aom/test/disflow_test.cc index 4f004480e2..bee9e1261c 100644 --- a/third_party/aom/test/disflow_test.cc +++ b/third_party/aom/test/disflow_test.cc @@ -124,4 +124,9 @@ INSTANTIATE_TEST_SUITE_P(NEON, ComputeFlowTest, ::testing::Values(aom_compute_flow_at_point_neon)); #endif +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P(SVE, ComputeFlowTest, + ::testing::Values(aom_compute_flow_at_point_sve)); +#endif + } // namespace diff --git a/third_party/aom/test/ethread_test.cc b/third_party/aom/test/ethread_test.cc index ce45394eb8..415f5de269 100644 --- a/third_party/aom/test/ethread_test.cc +++ b/third_party/aom/test/ethread_test.cc @@ -18,6 +18,7 @@ #include "test/util.h" #include "test/y4m_video_source.h" #include "test/yuv_video_source.h" +#include "av1/encoder/enc_enums.h" #include "av1/encoder/firstpass.h" namespace { @@ -411,9 +412,7 @@ class AVxEncoderThreadTest const std::vector<size_t> ref_size_enc, const std::vector<std::string> ref_md5_enc, const std::vector<std::string> ref_md5_dec) { - // This value should be kept the same as MAX_NUM_THREADS - // in aom_thread.h - cfg_.g_threads = 64; + cfg_.g_threads = MAX_NUM_THREADS; ASSERT_NO_FATAL_FAILURE(RunLoop(video)); std::vector<size_t> multi_thr_max_row_mt_size_enc; std::vector<std::string> multi_thr_max_row_mt_md5_enc; diff --git a/third_party/aom/test/frame_resize_test.cc b/third_party/aom/test/frame_resize_test.cc new file mode 100644 index 0000000000..8891304192 --- /dev/null +++ b/third_party/aom/test/frame_resize_test.cc @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" +#include "test/acm_random.h" +#include "test/util.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/bitops.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +using ::testing::Combine; +using ::testing::Values; +using ::testing::ValuesIn; + +using std::make_tuple; +using std::tuple; + +const int kIters = 1000; + +typedef tuple<int, int> FrameDimension; + +// Resolutions (width x height) to be tested for resizing. +const FrameDimension kFrameDim[] = { + make_tuple(3840, 2160), make_tuple(2560, 1440), make_tuple(1920, 1080), + make_tuple(1280, 720), make_tuple(640, 480), make_tuple(640, 360), + make_tuple(256, 256), +}; + +// Check that two 8-bit output buffers are identical. +void AssertOutputBufferEq(const uint8_t *p1, const uint8_t *p2, int width, + int height) { + ASSERT_TRUE(p1 != p2) << "Buffers must be at different memory locations"; + for (int j = 0; j < height; ++j) { + if (memcmp(p1, p2, sizeof(*p1) * width) == 0) { + p1 += width; + p2 += width; + continue; + } + for (int i = 0; i < width; ++i) { + ASSERT_EQ(p1[i], p2[i]) + << width << "x" << height << " Pixel mismatch at (" << i << ", " << j + << ")"; + } + } +} + +typedef bool (*LowBDResizeFunc)(uint8_t *intbuf, uint8_t *output, + int out_stride, int height, int height2, + int stride, int start_wd); +// Test parameter list: +// <tst_fun, dims> +typedef tuple<LowBDResizeFunc, FrameDimension> ResizeTestParams; + +class AV1ResizeYTest : public ::testing::TestWithParam<ResizeTestParams> { + public: + void SetUp() { + test_fun_ = GET_PARAM(0); + frame_dim_ = GET_PARAM(1); + width_ = std::get<0>(frame_dim_); + height_ = std::get<1>(frame_dim_); + const int msb = get_msb(AOMMIN(width_, height_)); + n_levels_ = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); + + src_ = (uint8_t *)aom_malloc((width_ / 2) * height_ * sizeof(*src_)); + ref_dest_ = + (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*ref_dest_)); + test_dest_ = + (uint8_t *)aom_calloc((width_ * height_) / 4, sizeof(*test_dest_)); + } + + void RunTest() { + int width2 = width_, height2 = height_; + + for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8(); + for (int level = 1; level < n_levels_; level++) { + width2 = (width_ >> level); + height2 = (height_ >> level); + resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, width2, + 0); + test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0); + + AssertOutputBufferEq(ref_dest_, test_dest_, width2, height2); + } + } + + void SpeedTest() { + int width2 = width_, height2 = height_; + + for (int i = 0; i < (width_ / 2) * height_; i++) src_[i] = rng_.Rand8(); + for (int level = 1; level < n_levels_; level++) { + width2 = (width_ >> level); + height2 = (height_ >> level); + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < kIters; j++) { + resize_vert_dir_c(src_, ref_dest_, width2, height2 << 1, height2, + width2, 0); + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (int j = 0; j < kIters; j++) { + test_fun_(src_, test_dest_, width2, height2 << 1, height2, width2, 0); + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "level: " << level << " [" << width2 << " x " << height2 + << "] C time = " << ref_time << " , SIMD time = " << tst_time + << " scaling=" << float(1.00) * ref_time / tst_time << "x \n"; + } + } + + void TearDown() { + aom_free(src_); + aom_free(ref_dest_); + aom_free(test_dest_); + } + + private: + LowBDResizeFunc test_fun_; + FrameDimension frame_dim_; + int width_; + int height_; + int n_levels_; + uint8_t *src_; + uint8_t *ref_dest_; + uint8_t *test_dest_; + libaom_test::ACMRandom rng_; +}; + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1ResizeYTest); + +TEST_P(AV1ResizeYTest, RunTest) { RunTest(); } + +TEST_P(AV1ResizeYTest, DISABLED_SpeedTest) { SpeedTest(); } + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1ResizeYTest, + ::testing::Combine(::testing::Values(resize_vert_dir_avx2), + ::testing::ValuesIn(kFrameDim))); +#endif + +} // namespace diff --git a/third_party/aom/test/test.cmake b/third_party/aom/test/test.cmake index e2f5da570d..2631c9fb39 100644 --- a/third_party/aom/test/test.cmake +++ b/third_party/aom/test/test.cmake @@ -209,6 +209,7 @@ if(NOT BUILD_SHARED_LIBS) "${AOM_ROOT}/test/fdct4x4_test.cc" "${AOM_ROOT}/test/fft_test.cc" "${AOM_ROOT}/test/firstpass_test.cc" + "${AOM_ROOT}/test/frame_resize_test.cc" "${AOM_ROOT}/test/fwht4x4_test.cc" "${AOM_ROOT}/test/hadamard_test.cc" "${AOM_ROOT}/test/horver_correlation_test.cc" diff --git a/third_party/aom/test/wiener_test.cc b/third_party/aom/test/wiener_test.cc index b995c84d8f..c38e10e3c2 100644 --- a/third_party/aom/test/wiener_test.cc +++ b/third_party/aom/test/wiener_test.cc @@ -397,6 +397,12 @@ INSTANTIATE_TEST_SUITE_P(NEON, WienerTest, ::testing::Values(av1_compute_stats_neon)); #endif // HAVE_NEON +#if HAVE_SVE + +INSTANTIATE_TEST_SUITE_P(SVE, WienerTest, + ::testing::Values(av1_compute_stats_sve)); +#endif // HAVE_SVE + } // namespace wiener_lowbd #if CONFIG_AV1_HIGHBITDEPTH @@ -514,25 +520,27 @@ static void compute_stats_highbd_win_opt_c(int wiener_win, const uint8_t *dgd8, } void compute_stats_highbd_opt_c(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, int64_t *M, int64_t *H, - aom_bit_depth_t bit_depth) { + const uint8_t *src, int16_t *d, int16_t *s, + int h_start, int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) { compute_stats_highbd_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { - av1_compute_stats_highbd_c(wiener_win, dgd, src, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd, src, d, s, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M, H, + bit_depth); } } static const int kIterations = 100; typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd, - const uint8_t *src, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, int64_t *M, int64_t *H, + const uint8_t *src, int16_t *d, int16_t *s, + int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, + int64_t *M, int64_t *H, aom_bit_depth_t bit_depth); typedef std::tuple<const compute_stats_Func> WienerTestParam; @@ -546,11 +554,17 @@ class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> { dgd_buf = (uint16_t *)aom_memalign( 32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf)); ASSERT_NE(dgd_buf, nullptr); + const size_t buf_size = + sizeof(*buf) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; + buf = (int16_t *)aom_memalign(32, buf_size); + ASSERT_NE(buf, nullptr); + memset(buf, 0, buf_size); target_func_ = GET_PARAM(0); } void TearDown() override { aom_free(src_buf); aom_free(dgd_buf); + aom_free(buf); } void RunWienerTest(const int32_t wiener_win, int32_t run_times, aom_bit_depth_t bit_depth); @@ -562,6 +576,7 @@ class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> { libaom_test::ACMRandom rng_; uint16_t *src_buf; uint16_t *dgd_buf; + int16_t *buf; }; void WienerTestHighbd::RunWienerTest(const int32_t wiener_win, @@ -589,6 +604,9 @@ void WienerTestHighbd::RunWienerTest(const int32_t wiener_win, const int dgd_stride = h_end; const int src_stride = MAX_DATA_BLOCK; const int iters = run_times == 1 ? kIterations : 2; + int16_t *dgd_avg = buf; + int16_t *src_avg = + buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX); for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { dgd_buf[i] = rng_.Rand16() % (1 << bit_depth); @@ -601,16 +619,17 @@ void WienerTestHighbd::RunWienerTest(const int32_t wiener_win, aom_usec_timer timer; aom_usec_timer_start(&timer); for (int i = 0; i < run_times; ++i) { - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, - v_start, v_end, dgd_stride, src_stride, M_ref, - H_ref, bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M_ref, H_ref, bit_depth); } aom_usec_timer_mark(&timer); const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); aom_usec_timer_start(&timer); for (int i = 0; i < run_times; ++i) { - target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end, - dgd_stride, src_stride, M_test, H_test, bit_depth); + target_func_(wiener_win, dgd8, src8, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M_test, H_test, + bit_depth); } aom_usec_timer_mark(&timer); const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); @@ -657,6 +676,9 @@ void WienerTestHighbd::RunWienerTest_ExtremeValues(const int32_t wiener_win, const int dgd_stride = h_end; const int src_stride = MAX_DATA_BLOCK; const int iters = 1; + int16_t *dgd_avg = buf; + int16_t *src_avg = + buf + (3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX); for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { // Fill with alternating extreme values to maximize difference with // the average. @@ -668,12 +690,13 @@ void WienerTestHighbd::RunWienerTest_ExtremeValues(const int32_t wiener_win, dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin); const uint8_t *src8 = CONVERT_TO_BYTEPTR(src_buf); - av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M_ref, H_ref, - bit_depth); + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, + h_start, h_end, v_start, v_end, dgd_stride, + src_stride, M_ref, H_ref, bit_depth); - target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end, - dgd_stride, src_stride, M_test, H_test, bit_depth); + target_func_(wiener_win, dgd8, src8, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M_test, H_test, + bit_depth); int failed = 0; for (int i = 0; i < wiener_win2; ++i) { |