author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-15 03:34:50 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-15 03:34:50 +0000
commit: def92d1b8e9d373e2f6f27c366d578d97d8960c6 (patch)
tree: 2ef34b9ad8bb9a9220e05d60352558b15f513894 /third_party/aom/aom_dsp
parent: Adding debian version 125.0.3-1. (diff)
download: firefox-def92d1b8e9d373e2f6f27c366d578d97d8960c6.tar.xz, firefox-def92d1b8e9d373e2f6f27c366d578d97d8960c6.zip
Merging upstream version 126.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/aom/aom_dsp')
50 files changed, 2518 insertions, 2794 deletions
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake index 653f690741..de987cbd23 100644 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -52,15 +52,12 @@ list(APPEND AOM_DSP_COMMON_SOURCES list(APPEND AOM_DSP_COMMON_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_asm_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm") list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c" "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" "${AOM_ROOT}/aom_dsp/x86/convolve.h" "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" @@ -145,6 +142,9 @@ if(CONFIG_AV1_HIGHBITDEPTH) "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c") + + list(APPEND AOM_DSP_COMMON_INTRIN_SVE + "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_sve.c") endif() if(CONFIG_AV1_DECODER) @@ -200,7 +200,8 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/flow_estimation/x86/disflow_sse4.c") list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 - "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c") + "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c" + "${AOM_ROOT}/aom_dsp/flow_estimation/x86/disflow_avx2.c") list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c") @@ -208,7 +209,6 @@ if(CONFIG_AV1_ENCODER) list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm") list(APPEND AOM_DSP_ENCODER_ASM_SSE2_X86_64 @@ -227,6 +227,9 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c" "${AOM_ROOT}/aom_dsp/x86/jnt_sad_sse2.c") + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/subpel_variance_ssse3.asm") + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") @@ -493,6 +496,8 @@ function(setup_aom_dsp_targets) endif() if(HAVE_SVE) + add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SVE") if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_SVE") diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index 7bb156ac59..7e746e9cb9 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -498,8 +498,8 @@ add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; specialize qw/aom_convolve_copy neon sse2 avx2/; -specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3"; -specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3"; +specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3"; 
+specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3"; add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/aom_scaled_2d ssse3 neon/; @@ -509,10 +509,10 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/aom_highbd_convolve_copy sse2 avx2 neon/; add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; - specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon/; + specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon sve/; add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; - specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon/; + specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon sve/; } # @@ -1087,7 +1087,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon neon_dotprod/; - specialize qw/aom_sad_skip_16x4x4d neon neon_dotprod/; + specialize qw/aom_sad_skip_16x4x4d avx2 neon neon_dotprod/; specialize qw/aom_sad_skip_8x32x4d sse2 neon/; specialize qw/aom_sad_skip_8x16x4d sse2 neon/; specialize qw/aom_sad_skip_8x8x4d sse2 neon/; @@ -1116,7 +1116,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_sad64x16x3d avx2 neon neon_dotprod/; specialize qw/aom_sad32x8x3d avx2 neon neon_dotprod/; specialize qw/aom_sad16x64x3d avx2 neon neon_dotprod/; - specialize qw/aom_sad16x4x3d neon neon_dotprod/; + specialize qw/aom_sad16x4x3d avx2 neon neon_dotprod/; specialize qw/aom_sad8x32x3d neon/; specialize qw/aom_sad4x16x3d neon/; @@ -1264,8 +1264,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl"; specialize qw/aom_vector_var avx2 sse4_1 neon sve/; - # TODO(kyslov@) bring back SSE2 by extending it to 128 block size - #specialize qw/aom_vector_var neon sse2/; # # hamadard transform and satd for implmenting temporal dependency model @@ -1357,6 +1355,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize "aom_highbd_${bd}_mse16x8", qw/neon neon_dotprod/; specialize "aom_highbd_${bd}_mse8x16", qw/neon neon_dotprod/; specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon neon_dotprod/; + } elsif ($bd eq 10) { + specialize "aom_highbd_${bd}_mse16x16", qw/avx2 sse2 neon sve/; + specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/; } else { specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/; specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; @@ -1406,39 +1409,39 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_variance4x8 sse2 neon neon_dotprod/; specialize qw/aom_variance4x4 sse2 neon neon_dotprod/; - specialize qw/aom_sub_pixel_variance128x128 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance128x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x128 avx2 neon sse2 ssse3/; - specialize 
qw/aom_sub_pixel_variance64x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x16 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x16 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x8 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x4 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance4x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance4x4 neon sse2 ssse3/; - - specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance128x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x128 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x16 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x32 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x4 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x4 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance128x128 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance128x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance64x128 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance64x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance64x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x16 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x16 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x8 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance8x16 neon ssse3/; + specialize qw/aom_sub_pixel_variance8x8 neon ssse3/; + specialize qw/aom_sub_pixel_variance8x4 neon ssse3/; + specialize qw/aom_sub_pixel_variance4x8 neon ssse3/; + specialize qw/aom_sub_pixel_variance4x4 neon ssse3/; + + specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance128x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x128 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x16 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x32 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x16 neon ssse3/; + specialize 
qw/aom_sub_pixel_avg_variance16x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x16 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x4 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x4 neon ssse3/; if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { specialize qw/aom_variance4x16 neon neon_dotprod sse2/; @@ -1448,18 +1451,18 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/; specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/; - specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x32 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x64 neon avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x4 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x32 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x16 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x4 neon avx2 ssse3/; + specialize qw/aom_sub_pixel_variance8x32 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x8 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x64 neon avx2 ssse3/; + specialize qw/aom_sub_pixel_variance64x16 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x16 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x4 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x32 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x64 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x16 neon ssse3/; specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 neon ssse3/; specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 neon ssse3/; @@ -1789,11 +1792,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # Flow estimation library if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { - add_proto qw/double av1_compute_cross_correlation/, "const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2"; - specialize qw/av1_compute_cross_correlation sse4_1 avx2/; + add_proto qw/bool aom_compute_mean_stddev/, "const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev"; + specialize qw/aom_compute_mean_stddev sse4_1 avx2/; + + add_proto qw/double aom_compute_correlation/, "const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2"; + specialize qw/aom_compute_correlation sse4_1 avx2/; add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v"; - specialize qw/aom_compute_flow_at_point sse4_1 neon/; + specialize qw/aom_compute_flow_at_point sse4_1 avx2 neon/; } } # CONFIG_AV1_ENCODER diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c 
b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c index ac0a6efd00..c82125ba17 100644 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -267,8 +267,6 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); const uint8x8_t range_limit = vdup_n_u8(128); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); @@ -282,46 +280,39 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ + int8x16_t s0123, s1234, s2345, s3456; transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x16_t s4567, s5678, s6789, s78910; transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. 
*/ @@ -331,12 +322,13 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); + int16x4_t d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); + int16x4_t d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); + int16x4_t d3 = + convolve8_4_sdot_partial(s3456, s78910, correction, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -354,37 +346,30 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; do { - height = h; - s = src; - d = dst; + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. 
*/ + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, @@ -393,23 +378,18 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); @@ -426,14 +406,14 @@ void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filter); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filter); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filter); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filter); + uint8x8_t d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, + s4567_hi, correction, filter); + uint8x8_t d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, + s5678_hi, correction, filter); + uint8x8_t d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, + s6789_hi, correction, filter); + uint8x8_t d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, + s78910_hi, correction, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c index c314c0a192..df6e4d2ab5 100644 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -15,7 +15,6 @@ #include <string.h> #include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" @@ -246,7 +245,6 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; uint8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); @@ -260,31 +258,25 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - 
uint8x8_t d01, d23; + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ + uint8x16_t s0123, s1234, s2345, s3456; transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + uint8x16_t s4567, s5678, s6789, s78910; transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. */ @@ -294,12 +286,12 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_usdot_partial(s0123, s4567, filter); - d1 = convolve8_4_usdot_partial(s1234, s5678, filter); - d2 = convolve8_4_usdot_partial(s2345, s6789, filter); - d3 = convolve8_4_usdot_partial(s3456, s78910, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_usdot_partial(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_usdot_partial(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_usdot_partial(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_usdot_partial(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -317,29 +309,21 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; do { - height = h; - s = src; - d = dst; + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. 
*/ + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, @@ -348,16 +332,13 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); @@ -374,14 +355,14 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filter); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filter); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filter); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filter); + uint8x8_t d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, + s4567_hi, filter); + uint8x8_t d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, + s5678_hi, filter); + uint8x8_t d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, + s6789_hi, filter); + uint8x8_t d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, + s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); diff --git a/third_party/aom/aom_dsp/arm/aom_filter.h b/third_party/aom/aom_dsp/arm/aom_filter.h new file mode 100644 index 0000000000..9972d064fc --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_filter.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_ARM_AOM_FILTER_H_ +#define AOM_AOM_DSP_ARM_AOM_FILTER_H_ + +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE int get_filter_taps_convolve8(const int16_t *filter) { + if (filter[0] | filter[7]) { + return 8; + } + if (filter[1] | filter[6]) { + return 6; + } + if (filter[2] | filter[5]) { + return 4; + } + return 2; +} + +#endif // AOM_AOM_DSP_ARM_AOM_FILTER_H_ diff --git a/third_party/aom/aom_dsp/arm/aom_neon_sve2_bridge.h b/third_party/aom/aom_dsp/arm/aom_neon_sve2_bridge.h new file mode 100644 index 0000000000..6e7d2d6365 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_neon_sve2_bridge.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_ +#define AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_ + +#include <arm_neon_sve_bridge.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +// We can access instructions exclusive to the SVE2 instruction set from a +// predominantly Neon context by making use of the Neon-SVE bridge intrinsics +// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE +// vector (if it's longer than 128 bits) being "don't care". + +// While sub-optimal on machines that have SVE vector length > 128-bit - as the +// remainder of the vector is unused - this approach is still beneficial when +// compared to a Neon-only solution. + +static INLINE int16x8_t aom_tbl2_s16(int16x8_t s0, int16x8_t s1, + uint16x8_t tbl) { + svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0), + svset_neonq_s16(svundef_s16(), s1)); + return svget_neonq_s16( + svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl))); +} + +#endif // AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_ diff --git a/third_party/aom/aom_dsp/arm/dot_sve.h b/third_party/aom/aom_dsp/arm/aom_neon_sve_bridge.h index cf49f23606..3da80e22ba 100644 --- a/third_party/aom/aom_dsp/arm/dot_sve.h +++ b/third_party/aom/aom_dsp/arm/aom_neon_sve_bridge.h @@ -8,16 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef AOM_AOM_DSP_ARM_DOT_SVE_H_ -#define AOM_AOM_DSP_ARM_DOT_SVE_H_ +#ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_ +#define AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_ #include <arm_neon_sve_bridge.h> #include "config/aom_dsp_rtcd.h" #include "config/aom_config.h" -// Dot product instructions operating on 16-bit input elements are exclusive to -// the SVE instruction set. However, we can access these instructions from a +// We can access instructions exclusive to the SVE instruction set from a // predominantly Neon context by making use of the Neon-SVE bridge intrinsics // to reinterpret Neon vectors as SVE vectors - with the high part of the SVE // vector (if it's longer than 128 bits) being "don't care". 
@@ -39,4 +38,19 @@ static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { svset_neonq_s16(svundef_s16(), y))); } -#endif // AOM_AOM_DSP_ARM_DOT_SVE_H_ +#define aom_svdot_lane_s16(sum, s0, f, lane) \ + svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), sum), \ + svset_neonq_s16(svundef_s16(), s0), \ + svset_neonq_s16(svundef_s16(), f), lane)) + +static INLINE uint16x8_t aom_tbl_u16(uint16x8_t s, uint16x8_t tbl) { + return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), s), + svset_neonq_u16(svundef_u16(), tbl))); +} + +static INLINE int16x8_t aom_tbl_s16(int16x8_t s, uint16x8_t tbl) { + return svget_neonq_s16(svtbl_s16(svset_neonq_s16(svundef_s16(), s), + svset_neonq_u16(svundef_u16(), tbl))); +} + +#endif // AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_ diff --git a/third_party/aom/aom_dsp/arm/avg_sve.c b/third_party/aom/aom_dsp/arm/avg_sve.c index bbf5a9447c..57a546501a 100644 --- a/third_party/aom/aom_dsp/arm/avg_sve.c +++ b/third_party/aom/aom_dsp/arm/avg_sve.c @@ -14,7 +14,7 @@ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" -#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" diff --git a/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c b/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c index 18bdc5dbfe..f538346d8b 100644 --- a/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c +++ b/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c @@ -15,7 +15,7 @@ #include "config/aom_dsp_rtcd.h" #include "config/aom_config.h" -#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" static INLINE void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride, diff --git a/third_party/aom/aom_dsp/arm/highbd_convolve8_sve.c b/third_party/aom/aom_dsp/arm/highbd_convolve8_sve.c new file mode 100644 index 0000000000..e57c41a0b0 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_convolve8_sve.c @@ -0,0 +1,681 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> +#include <assert.h> +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/aom_filter.h" +#include "aom_dsp/arm/mem_neon.h" + +static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter, + uint16x4_t max) { + int64x2_t sum[4]; + + sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter); + + int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]); + int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_h(int16x8_t s[8], int16x8_t filter, + uint16x8_t max) { + int64x2_t sum[8]; + + sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter); + sum[4] = aom_sdotq_s16(vdupq_n_s64(0), s[4], filter); + sum[5] = aom_sdotq_s16(vdupq_n_s64(0), s[5], filter); + sum[6] = aom_sdotq_s16(vdupq_n_s64(0), s[6], filter); + sum[7] = aom_sdotq_s16(vdupq_n_s64(0), s[7], filter); + + int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]); + int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]); + int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]); + int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_horiz_8tap_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height, + int bd) { + const int16x8_t filter = vld1q_s16(filter_x); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve8_4_h(s0, filter, max); + uint16x4_t d1 = highbd_convolve8_4_h(s1, filter, max); + uint16x4_t d2 = highbd_convolve8_4_h(s2, filter, max); + uint16x4_t d3 = highbd_convolve8_4_h(s3, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + } else { + do { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * 
src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8_h(s0, filter, max); + uint16x8_t d1 = highbd_convolve8_8_h(s1, filter, max); + uint16x8_t d2 = highbd_convolve8_8_h(s2, filter, max); + uint16x8_t d3 = highbd_convolve8_8_h(s3, filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[16]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, +}; + +DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; +// clang-format on + +static INLINE uint16x4_t highbd_convolve4_4_h(int16x8_t s, int16x8_t filter, + uint16x8x2_t permute_tbl, + uint16x4_t max) { + int16x8_t permuted_samples0 = aom_tbl_s16(s, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s, permute_tbl.val[1]); + + int64x2_t sum0 = + aom_svdot_lane_s16(vdupq_n_s64(0), permuted_samples0, filter, 0); + int64x2_t sum1 = + aom_svdot_lane_s16(vdupq_n_s64(0), permuted_samples1, filter, 0); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1)); + uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8_h(int16x8_t s[4], int16x8_t filter, + uint16x8_t idx, uint16x8_t max) { + int64x2_t sum04 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(vdupq_n_s64(0), s[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(vdupq_n_s64(0), s[3], filter, 0); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + + res = aom_tbl_u16(res, idx); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_horiz_4tap_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height, + int bd) { + const int16x8_t filter = vcombine_s16(vld1_s16(filter_x + 2), vdup_n_s16(0)); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = highbd_convolve4_4_h(s0, filter, permute_tbl, max); + uint16x4_t d1 = highbd_convolve4_4_h(s1, filter, permute_tbl, max); + uint16x4_t d2 = highbd_convolve4_4_h(s2, filter, permute_tbl, max); + uint16x4_t d3 = highbd_convolve4_4_h(s3, filter, permute_tbl, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + 
load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_convolve4_8_h(s0, filter, idx, max); + uint16x8_t d1 = highbd_convolve4_8_h(s1, filter, idx, max); + uint16x8_t d2 = highbd_convolve4_8_h(s2, filter, idx, max); + uint16x8_t d3 = highbd_convolve4_8_h(s3, filter, idx, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int width, int height, int bd) { + assert(x_step_q4 == 16); + assert(width >= 4 && height >= 4); + (void)filter_y; + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + src -= SUBPEL_TAPS / 2 - 1; + + if (get_filter_taps_convolve8(filter_x) <= 4) { + highbd_convolve8_horiz_4tap_sve(src + 2, src_stride, dst, dst_stride, + filter_x, width, height, bd); + } else { + highbd_convolve8_horiz_8tap_sve(src, src_stride, dst, dst_stride, filter_x, + width, height, bd); + } +} + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. + 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25, + // Shift left and insert two new columns in transposed 4x4 block. + 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27, + // Shift left and insert three new columns in transposed 4x4 block. + 6, 7, 16, 17, 18, 19, 20, 21, 14, 15, 24, 25, 26, 27, 28, 29 +}; + +static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, + int16x4_t s2, int16x4_t s3, + int16x8_t res[2]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03 + // s1: 10, 11, 12, 13 + // s2: 20, 21, 22, 23 + // s3: 30, 31, 32, 33 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + + int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); + int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); + int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); + int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); + + int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q)); + int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q)); + + int32x4x2_t s0123 = vzipq_s32(s01, s23); + + res[0] = vreinterpretq_s16_s32(s0123.val[0]); + res[1] = vreinterpretq_s16_s32(s0123.val[1]); +} + +static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, + int16x8_t s2, int16x8_t s3, + int16x8_t res[4]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03, 04, 05, 06, 07 + // s1: 10, 11, 12, 13, 14, 15, 16, 17 + // s2: 20, 21, 22, 23, 24, 25, 26, 27 + // s3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // res_lo[0]: 00 10 20 30 01 11 21 31 + // res_lo[1]: 02 12 22 32 03 13 23 33 + // res_hi[0]: 04 14 24 34 05 15 25 35 + // res_hi[1]: 06 16 26 36 07 17 27 37 + + int16x8x2_t tr01_16 = vzipq_s16(s0, s1); + int16x8x2_t tr23_16 = vzipq_s16(s2, s3); + + int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]), + vreinterpretq_s32_s16(tr23_16.val[0])); + int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]), + vreinterpretq_s32_s16(tr23_16.val[1])); + + res[0] = vreinterpretq_s16_s32(tr01_32.val[0]); + res[1] = vreinterpretq_s16_s32(tr01_32.val[1]); + res[2] = 
vreinterpretq_s16_s32(tr23_32.val[0]); + res[3] = vreinterpretq_s16_s32(tr23_32.val[1]); +} + +static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], + uint8x16_t tbl, int16x8_t res[4]) { + int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]), + vreinterpretq_s8_s16(t1[0]) }; + int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]), + vreinterpretq_s8_s16(t1[1]) }; + int8x16x2_t samples2 = { vreinterpretq_s8_s16(t0[2]), + vreinterpretq_s8_s16(t1[2]) }; + int8x16x2_t samples3 = { vreinterpretq_s8_s16(t0[3]), + vreinterpretq_s8_s16(t1[3]) }; + + res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl)); + res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl)); + res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples2, tbl)); + res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples3, tbl)); +} + +static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], + uint8x16_t tbl, int16x8_t res[2]) { + int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]), + vreinterpretq_s8_s16(t1[0]) }; + int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]), + vreinterpretq_s8_s16(t1[1]) }; + + res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl)); + res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl)); +} + +static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2], + int16x8_t samples_hi[2], + int16x8_t filter, + uint16x4_t max) { + int64x2_t sum[2]; + + sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); + sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1); + + sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); + sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1])); + + uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4], + int16x8_t samples_hi[4], + int16x8_t filter, + uint16x8_t max) { + int64x2_t sum[4]; + + sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); + sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1); + + sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); + sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1); + + sum[2] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0); + sum[2] = aom_svdot_lane_s16(sum[2], samples_hi[2], filter, 1); + + sum[3] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0); + sum[3] = aom_svdot_lane_s16(sum[3], samples_hi[3], filter, 1); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1])); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum[2]), vmovn_s64(sum[3])); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_vert_8tap_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height, + int bd) { + const int16x8_t y_filter = vld1q_s16(filter_y); + + uint8x16_t merge_block_tbl[3]; + merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl); + merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16); + merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)src; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation 
combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s78910[2]; + + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_4x4(s7, s8, s9, s10, s78910); + + // Merge new data into block from previous iteration. + aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[0], s4567); + aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[1], s5678); + aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[2], s6789); + + uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, y_filter, max); + uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, y_filter, max); + uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, y_filter, max); + uint16x4_t d3 = highbd_convolve8_4_v(s3456, s78910, y_filter, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s78910[0]; + s3456[1] = s78910[1]; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[4], s5678[4], s6789[4], s78910[4]; + + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s78910); + + // Merge new data into block from previous iteration. + aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[0], s4567); + aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[1], s5678); + aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[2], s6789); + + uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, y_filter, max); + uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, y_filter, max); + uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8_v(s3456, s78910, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
+ s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + + s3456[0] = s78910[0]; + s3456[1] = s78910[1]; + s3456[2] = s78910[2]; + s3456[3] = s78910[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE uint16x4_t highbd_convolve4_4_v(int16x8_t s[2], int16x8_t filter, + uint16x4_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8_v(int16x8_t s[4], int16x8_t filter, + uint16x8_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0); + int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), s[2], filter, 0); + int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), s[3], filter, 0); + + int32x4_t s0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t s4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(s0123, FILTER_BITS), + vqrshrun_n_s32(s4567, FILTER_BITS)); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_vert_4tap_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height, + int bd) { + const int16x8_t y_filter = + vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0)); + + uint8x16_t merge_block_tbl[3]; + merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl); + merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16); + merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)src; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + uint16x4_t d0 = highbd_convolve4_4_v(s0123, y_filter, max); + uint16x4_t d1 = highbd_convolve4_4_v(s1234, y_filter, max); + uint16x4_t d2 = highbd_convolve4_4_v(s2345, y_filter, max); + uint16x4_t d3 = highbd_convolve4_4_v(s3456, y_filter, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. 
+ s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample + // permute required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + uint16x8_t d0 = highbd_convolve4_8_v(s0123, y_filter, max); + uint16x8_t d1 = highbd_convolve4_8_v(s1234, y_filter, max); + uint16x8_t d2 = highbd_convolve4_8_v(s2345, y_filter, max); + uint16x8_t d3 = highbd_convolve4_8_v(s3456, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int width, int height, int bd) { + assert(y_step_q4 == 16); + assert(w >= 4 && h >= 4); + (void)filter_x; + (void)y_step_q4; + (void)x_step_q4; + + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride; + + if (get_filter_taps_convolve8(filter_y) <= 4) { + highbd_convolve8_vert_4tap_sve(src + 2 * src_stride, src_stride, dst, + dst_stride, filter_y, width, height, bd); + } else { + highbd_convolve8_vert_8tap_sve(src, src_stride, dst, dst_stride, filter_y, + width, height, bd); + } +} diff --git a/third_party/aom/aom_dsp/arm/highbd_sse_sve.c b/third_party/aom/aom_dsp/arm/highbd_sse_sve.c index b267da5cfb..9ea13ab67a 100644 --- a/third_party/aom/aom_dsp/arm/highbd_sse_sve.c +++ b/third_party/aom/aom_dsp/arm/highbd_sse_sve.c @@ -10,7 +10,7 @@ #include <arm_neon.h> -#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "config/aom_dsp_rtcd.h" diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_sve.c b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c index a2c30a1688..ad1f55e367 100644 --- a/third_party/aom/aom_dsp/arm/highbd_variance_sve.c +++ b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c @@ -16,7 +16,7 @@ #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_filter.h" -#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/variance.h" diff --git a/third_party/aom/aom_dsp/arm/mem_neon.h b/third_party/aom/aom_dsp/arm/mem_neon.h index 52c7a34e3e..32a462a186 100644 --- a/third_party/aom/aom_dsp/arm/mem_neon.h +++ b/third_party/aom/aom_dsp/arm/mem_neon.h @@ -56,17 +56,10 @@ static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) { #elif defined(__GNUC__) && !defined(__clang__) // GCC 64-bit. 
#if __GNUC__ < 8 - static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) { uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } }; return res; } - -static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) { - uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8), - vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } }; - return res; -} #endif // __GNUC__ < 8 #if __GNUC__ < 9 @@ -76,6 +69,15 @@ static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) { return res; } #endif // __GNUC__ < 9 + +// vld1q_u16_x4 is defined from GCC 8.5.0 and onwards. +#if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805 +static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) { + uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8), + vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } }; + return res; +} +#endif // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805 #endif // defined(__GNUC__) && !defined(__clang__) static INLINE void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, @@ -457,6 +459,16 @@ static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p, *s3 = vld1_s16(s); } +static INLINE void load_s16_4x3(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); +} + static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const uint8x8_t s4, @@ -525,6 +537,16 @@ static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride, vst1q_u16(s, s7); } +static INLINE void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2) { + vst1_u16(s, s0); + s += dst_stride; + vst1_u16(s, s1); + s += dst_stride; + vst1_u16(s, s2); +} + static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride, const uint16x4_t s0, const uint16x4_t s1, const uint16x4_t s2, const uint16x4_t s3) { @@ -544,6 +566,16 @@ static INLINE void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride, vst1q_u16(s, s1); } +static INLINE void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); + s += dst_stride; + vst1q_u16(s, s2); +} + static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride, const uint16x8_t s0, const uint16x8_t s1, const uint16x8_t s2, const uint16x8_t s3) { @@ -857,6 +889,16 @@ static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p, *s3 = vld1q_s16(s); } +static INLINE void load_s16_8x3(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); +} + // Load 2 sets of 4 bytes when alignment is not guaranteed. 
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { uint32_t a; diff --git a/third_party/aom/aom_dsp/arm/sum_squares_sve.c b/third_party/aom/aom_dsp/arm/sum_squares_sve.c index 724e43859e..c7e6dfcb02 100644 --- a/third_party/aom/aom_dsp/arm/sum_squares_sve.c +++ b/third_party/aom/aom_dsp/arm/sum_squares_sve.c @@ -11,7 +11,7 @@ #include <arm_neon.h> -#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "config/aom_dsp_rtcd.h" diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_detect.c b/third_party/aom/aom_dsp/flow_estimation/corner_detect.c index 284d1bd7b8..44d423dcdf 100644 --- a/third_party/aom/aom_dsp/flow_estimation/corner_detect.c +++ b/third_party/aom/aom_dsp/flow_estimation/corner_detect.c @@ -20,6 +20,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_mem/aom_mem.h" +#include "aom_util/aom_pthread.h" #include "av1/common/common.h" #define FAST_BARRIER 18 @@ -39,11 +40,24 @@ CornerList *av1_alloc_corner_list(void) { return corners; } -static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { - const uint8_t *buf = pyr->layers[0].buffer; - int width = pyr->layers[0].width; - int height = pyr->layers[0].height; - int stride = pyr->layers[0].stride; +static bool compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners) { + ImagePyramid *pyr = frame->y_pyramid; + const int layers = + aom_compute_pyramid(frame, bit_depth, downsample_level + 1, pyr); + + if (layers < 0) { + return false; + } + + // Clamp downsampling ratio base on max number of layers allowed + // for this frame size + downsample_level = layers - 1; + + const uint8_t *buf = pyr->layers[downsample_level].buffer; + int width = pyr->layers[downsample_level].width; + int height = pyr->layers[downsample_level].height; + int stride = pyr->layers[downsample_level].stride; int *scores = NULL; int num_corners; @@ -53,9 +67,11 @@ static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { if (num_corners <= MAX_CORNERS) { // Use all detected corners - if (num_corners != 0) { - memcpy(corners->corners, frame_corners_xy, - sizeof(*frame_corners_xy) * num_corners); + for (int i = 0; i < num_corners; i++) { + corners->corners[2 * i + 0] = + frame_corners_xy[i].x * (1 << downsample_level); + corners->corners[2 * i + 1] = + frame_corners_xy[i].y * (1 << downsample_level); } corners->num_corners = num_corners; } else { @@ -85,8 +101,10 @@ static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { for (int i = 0; i < num_corners; i++) { if (scores[i] > threshold) { assert(copied_corners < MAX_CORNERS); - corners->corners[2 * copied_corners + 0] = frame_corners_xy[i].x; - corners->corners[2 * copied_corners + 1] = frame_corners_xy[i].y; + corners->corners[2 * copied_corners + 0] = + frame_corners_xy[i].x * (1 << downsample_level); + corners->corners[2 * copied_corners + 1] = + frame_corners_xy[i].y * (1 << downsample_level); copied_corners += 1; } } @@ -99,7 +117,8 @@ static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { return true; } -bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { +bool av1_compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners) { assert(corners); #if CONFIG_MULTITHREAD @@ -107,7 +126,8 @@ bool av1_compute_corner_list(const ImagePyramid 
*pyr, CornerList *corners) { #endif // CONFIG_MULTITHREAD if (!corners->valid) { - corners->valid = compute_corner_list(pyr, corners); + corners->valid = + compute_corner_list(frame, bit_depth, downsample_level, corners); } bool valid = corners->valid; diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_detect.h b/third_party/aom/aom_dsp/flow_estimation/corner_detect.h index d05846ce5d..54d94309ed 100644 --- a/third_party/aom/aom_dsp/flow_estimation/corner_detect.h +++ b/third_party/aom/aom_dsp/flow_estimation/corner_detect.h @@ -18,7 +18,7 @@ #include <memory.h> #include "aom_dsp/pyramid.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" #ifdef __cplusplus extern "C" { @@ -57,7 +57,8 @@ size_t av1_get_corner_list_size(void); CornerList *av1_alloc_corner_list(void); -bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners); +bool av1_compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners); #ifndef NDEBUG // Check if a corner list has already been computed. diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.c b/third_party/aom/aom_dsp/flow_estimation/corner_match.c index dc7589a8c6..c78edb8910 100644 --- a/third_party/aom/aom_dsp/flow_estimation/corner_match.c +++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.c @@ -17,62 +17,84 @@ #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_dsp/flow_estimation/corner_match.h" +#include "aom_dsp/flow_estimation/disflow.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_dsp/flow_estimation/ransac.h" #include "aom_dsp/pyramid.h" #include "aom_scale/yv12config.h" -#define SEARCH_SZ 9 -#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2) - #define THRESHOLD_NCC 0.75 -/* Compute var(frame) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of frame, - centered at (x, y). +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). + Store results into *mean and *one_over_stddev + + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * <true mean> and + *one_over_stddev = 1 / (MATCH_SZ * <true stddev>) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. 
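
   As a concrete illustration (assuming the MATCH_SZ = 16 setting introduced
   elsewhere in this patch): for a 16x16 patch the function returns
   16 * mean and 1 / (16 * stddev), so a caller can evaluate a match as

     double m1, s1, m2, s2;
     if (aom_compute_mean_stddev(src, src_stride, x1, y1, &m1, &s1) &&
         aom_compute_mean_stddev(ref, ref_stride, x2, y2, &m2, &s2)) {
       double corr = aom_compute_correlation(src, src_stride, x1, y1, m1, s1,
                                             ref, ref_stride, x2, y2, m2, s2);
       // corr is in [-1, +1]; compare against THRESHOLD_NCC.
     }

   which keeps the sqrt and the reciprocal in the once-per-patch setup and
   leaves no division in the per-pair correlation.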
*/ -static double compute_variance(const unsigned char *frame, int stride, int x, - int y) { +bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, + int y, double *mean, double *one_over_stddev) { int sum = 0; int sumsq = 0; - int var; - int i, j; - for (i = 0; i < MATCH_SZ; ++i) - for (j = 0; j < MATCH_SZ; ++j) { + for (int i = 0; i < MATCH_SZ; ++i) { + for (int j = 0; j < MATCH_SZ; ++j) { sum += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; sumsq += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] * frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; } - var = sumsq * MATCH_SZ_SQ - sum * sum; - return (double)var; + } + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; } -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the - correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows - of each image, centered at (x1, y1) and (x2, y2) respectively. +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. */ -double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, - int x1, int y1, - const unsigned char *frame2, int stride2, - int x2, int y2) { +double aom_compute_correlation_c(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { int v1, v2; - int sum1 = 0; - int sum2 = 0; - int sumsq2 = 0; int cross = 0; - int var2, cov; - int i, j; - for (i = 0; i < MATCH_SZ; ++i) - for (j = 0; j < MATCH_SZ; ++j) { + for (int i = 0; i < MATCH_SZ; ++i) { + for (int j = 0; j < MATCH_SZ; ++j) { v1 = frame1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)]; v2 = frame2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; - sum1 += v1; - sum2 += v2; - sumsq2 += v2 * v2; cross += v1 * v2; } - var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; - cov = cross * MATCH_SZ_SQ - sum1 * sum2; - return cov / sqrt((double)var2); + } + + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. 
we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + double covariance = cross - mean1 * mean2; + double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; } static int is_eligible_point(int pointx, int pointy, int width, int height) { @@ -87,65 +109,14 @@ static int is_eligible_distance(int point1x, int point1y, int point2x, (point1y - point2y) * (point1y - point2y)) <= thresh * thresh; } -static void improve_correspondence(const unsigned char *src, - const unsigned char *ref, int width, - int height, int src_stride, int ref_stride, - Correspondence *correspondences, - int num_correspondences) { - int i; - for (i = 0; i < num_correspondences; ++i) { - int x, y, best_x = 0, best_y = 0; - double best_match_ncc = 0.0; - // For this algorithm, all points have integer coordinates. - // It's a little more efficient to convert them to ints once, - // before the inner loops - int x0 = (int)correspondences[i].x; - int y0 = (int)correspondences[i].y; - int rx0 = (int)correspondences[i].rx; - int ry0 = (int)correspondences[i].ry; - for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) { - for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { - double match_ncc; - if (!is_eligible_point(rx0 + x, ry0 + y, width, height)) continue; - if (!is_eligible_distance(x0, y0, rx0 + x, ry0 + y, width, height)) - continue; - match_ncc = av1_compute_cross_correlation(src, src_stride, x0, y0, ref, - ref_stride, rx0 + x, ry0 + y); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_y = y; - best_x = x; - } - } - } - correspondences[i].rx += best_x; - correspondences[i].ry += best_y; - } - for (i = 0; i < num_correspondences; ++i) { - int x, y, best_x = 0, best_y = 0; - double best_match_ncc = 0.0; - int x0 = (int)correspondences[i].x; - int y0 = (int)correspondences[i].y; - int rx0 = (int)correspondences[i].rx; - int ry0 = (int)correspondences[i].ry; - for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) - for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { - double match_ncc; - if (!is_eligible_point(x0 + x, y0 + y, width, height)) continue; - if (!is_eligible_distance(x0 + x, y0 + y, rx0, ry0, width, height)) - continue; - match_ncc = av1_compute_cross_correlation( - ref, ref_stride, rx0, ry0, src, src_stride, x0 + x, y0 + y); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_y = y; - best_x = x; - } - } - correspondences[i].x += best_x; - correspondences[i].y += best_y; - } -} +typedef struct { + int x; + int y; + double mean; + double one_over_stddev; + int best_match_idx; + double best_match_corr; +} PointInfo; static int determine_correspondence(const unsigned char *src, const int *src_corners, int num_src_corners, @@ -154,56 +125,136 @@ static int determine_correspondence(const unsigned char *src, int width, int height, int src_stride, int ref_stride, Correspondence *correspondences) { - // TODO(sarahparker) Improve this to include 2-way match - int i, j; + PointInfo *src_point_info = NULL; + PointInfo *ref_point_info = NULL; int num_correspondences = 0; - for (i = 0; i < num_src_corners; ++i) { - double best_match_ncc = 0.0; - double template_norm; - int best_match_j = -1; - if (!is_eligible_point(src_corners[2 * i], src_corners[2 * i + 1], width, - height)) + + src_point_info = + (PointInfo *)aom_calloc(num_src_corners, sizeof(*src_point_info)); + if (!src_point_info) { + goto finished; + } + + ref_point_info = + (PointInfo *)aom_calloc(num_ref_corners, 
sizeof(*ref_point_info)); + if (!ref_point_info) { + goto finished; + } + + // First pass (linear): + // Filter corner lists and compute per-patch means and standard deviations, + // for the src and ref frames independently + int src_point_count = 0; + for (int i = 0; i < num_src_corners; i++) { + int src_x = src_corners[2 * i]; + int src_y = src_corners[2 * i + 1]; + if (!is_eligible_point(src_x, src_y, width, height)) continue; + + PointInfo *point = &src_point_info[src_point_count]; + point->x = src_x; + point->y = src_y; + point->best_match_corr = THRESHOLD_NCC; + if (!aom_compute_mean_stddev(src, src_stride, src_x, src_y, &point->mean, + &point->one_over_stddev)) continue; - for (j = 0; j < num_ref_corners; ++j) { - double match_ncc; - if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width, - height)) - continue; - if (!is_eligible_distance(src_corners[2 * i], src_corners[2 * i + 1], - ref_corners[2 * j], ref_corners[2 * j + 1], - width, height)) + src_point_count++; + } + if (src_point_count == 0) { + goto finished; + } + + int ref_point_count = 0; + for (int j = 0; j < num_ref_corners; j++) { + int ref_x = ref_corners[2 * j]; + int ref_y = ref_corners[2 * j + 1]; + if (!is_eligible_point(ref_x, ref_y, width, height)) continue; + + PointInfo *point = &ref_point_info[ref_point_count]; + point->x = ref_x; + point->y = ref_y; + point->best_match_corr = THRESHOLD_NCC; + if (!aom_compute_mean_stddev(ref, ref_stride, ref_x, ref_y, &point->mean, + &point->one_over_stddev)) + continue; + ref_point_count++; + } + if (ref_point_count == 0) { + goto finished; + } + + // Second pass (quadratic): + // For each pair of points, compute correlation, and use this to determine + // the best match of each corner, in both directions + for (int i = 0; i < src_point_count; ++i) { + PointInfo *src_point = &src_point_info[i]; + for (int j = 0; j < ref_point_count; ++j) { + PointInfo *ref_point = &ref_point_info[j]; + if (!is_eligible_distance(src_point->x, src_point->y, ref_point->x, + ref_point->y, width, height)) continue; - match_ncc = av1_compute_cross_correlation( - src, src_stride, src_corners[2 * i], src_corners[2 * i + 1], ref, - ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_match_j = j; + + double corr = aom_compute_correlation( + src, src_stride, src_point->x, src_point->y, src_point->mean, + src_point->one_over_stddev, ref, ref_stride, ref_point->x, + ref_point->y, ref_point->mean, ref_point->one_over_stddev); + + if (corr > src_point->best_match_corr) { + src_point->best_match_idx = j; + src_point->best_match_corr = corr; + } + if (corr > ref_point->best_match_corr) { + ref_point->best_match_idx = i; + ref_point->best_match_corr = corr; } } - // Note: We want to test if the best correlation is >= THRESHOLD_NCC, - // but need to account for the normalization in - // av1_compute_cross_correlation. 
-    template_norm = compute_variance(src, src_stride, src_corners[2 * i],
-                                     src_corners[2 * i + 1]);
-    if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) {
-      correspondences[num_correspondences].x = src_corners[2 * i];
-      correspondences[num_correspondences].y = src_corners[2 * i + 1];
-      correspondences[num_correspondences].rx = ref_corners[2 * best_match_j];
-      correspondences[num_correspondences].ry =
-          ref_corners[2 * best_match_j + 1];
+  }
+
+  // Third pass (linear):
+  // Scan through source corners, generating a correspondence for each corner
+  // iff ref_best_match[src_best_match[i]] == i
+  // Then refine the generated correspondences using optical flow
+  for (int i = 0; i < src_point_count; i++) {
+    PointInfo *point = &src_point_info[i];
+
+    // Skip corners which were not matched, or which didn't find
+    // a good enough match
+    if (point->best_match_corr < THRESHOLD_NCC) continue;
+
+    PointInfo *match_point = &ref_point_info[point->best_match_idx];
+    if (match_point->best_match_idx == i) {
+      // Refine match using optical flow and store
+      const int sx = point->x;
+      const int sy = point->y;
+      const int rx = match_point->x;
+      const int ry = match_point->y;
+      double u = (double)(rx - sx);
+      double v = (double)(ry - sy);
+
+      const int patch_tl_x = sx - DISFLOW_PATCH_CENTER;
+      const int patch_tl_y = sy - DISFLOW_PATCH_CENTER;
+
+      aom_compute_flow_at_point(src, ref, patch_tl_x, patch_tl_y, width, height,
+                                src_stride, &u, &v);
+
+      Correspondence *correspondence = &correspondences[num_correspondences];
+      correspondence->x = (double)sx;
+      correspondence->y = (double)sy;
+      correspondence->rx = (double)sx + u;
+      correspondence->ry = (double)sy + v;
       num_correspondences++;
     }
   }
-  improve_correspondence(src, ref, width, height, src_stride, ref_stride,
-                         correspondences, num_correspondences);
+
+finished:
+  aom_free(src_point_info);
+  aom_free(ref_point_info);
   return num_correspondences;
 }
 bool av1_compute_global_motion_feature_match(
     TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref,
-    int bit_depth, MotionModel *motion_models, int num_motion_models,
-    bool *mem_alloc_failed) {
+    int bit_depth, int downsample_level, MotionModel *motion_models,
+    int num_motion_models, bool *mem_alloc_failed) {
   int num_correspondences;
   Correspondence *correspondences;
   ImagePyramid *src_pyramid = src->y_pyramid;
@@ -212,19 +263,19 @@ bool av1_compute_global_motion_feature_match(
   CornerList *ref_corners = ref->corners;
   // Precompute information we will need about each frame
-  if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) {
+  if (aom_compute_pyramid(src, bit_depth, 1, src_pyramid) < 0) {
     *mem_alloc_failed = true;
     return false;
   }
-  if (!av1_compute_corner_list(src_pyramid, src_corners)) {
+  if (!av1_compute_corner_list(src, bit_depth, downsample_level, src_corners)) {
     *mem_alloc_failed = true;
     return false;
   }
-  if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) {
+  if (aom_compute_pyramid(ref, bit_depth, 1, ref_pyramid) < 0) {
     *mem_alloc_failed = true;
     return false;
   }
-  if (!av1_compute_corner_list(ref_pyramid, ref_corners)) {
+  if (!av1_compute_corner_list(ref, bit_depth, downsample_level, ref_corners)) {
     *mem_alloc_failed = true;
     return false;
   }
diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.h b/third_party/aom/aom_dsp/flow_estimation/corner_match.h
index 4435d2c767..77ebee2ea3 100644
--- a/third_party/aom/aom_dsp/flow_estimation/corner_match.h
+++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.h
@@ -25,14 +25,20 @@
 extern "C" {
 #endif
-#define
MATCH_SZ 13 +#define MATCH_SZ 16 #define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2) #define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ) +// Minimum threshold for the variance of a patch, in order for it to be +// considered useful for matching. +// This is evaluated against the scaled variance MATCH_SZ_SQ * sigma^2, +// so a setting of 1 * MATCH_SZ_SQ corresponds to an unscaled variance of 1 +#define MIN_FEATURE_VARIANCE (1 * MATCH_SZ_SQ) + bool av1_compute_global_motion_feature_match( TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, - int bit_depth, MotionModel *motion_models, int num_motion_models, - bool *mem_alloc_failed); + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus } diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.c b/third_party/aom/aom_dsp/flow_estimation/disflow.c index 82b531c729..f511a6eb49 100644 --- a/third_party/aom/aom_dsp/flow_estimation/disflow.c +++ b/third_party/aom/aom_dsp/flow_estimation/disflow.c @@ -603,9 +603,9 @@ static void upscale_flow_component(double *flow, int cur_width, int cur_height, // make sure flow_u and flow_v start at 0 static bool compute_flow_field(const ImagePyramid *src_pyr, - const ImagePyramid *ref_pyr, FlowField *flow) { + const ImagePyramid *ref_pyr, int n_levels, + FlowField *flow) { bool mem_status = true; - assert(src_pyr->n_levels == ref_pyr->n_levels); double *flow_u = flow->u; double *flow_v = flow->v; @@ -613,7 +613,7 @@ static bool compute_flow_field(const ImagePyramid *src_pyr, double *tmpbuf0; double *tmpbuf; - if (src_pyr->n_levels < 2) { + if (n_levels < 2) { // tmpbuf not needed tmpbuf0 = NULL; tmpbuf = NULL; @@ -639,7 +639,7 @@ static bool compute_flow_field(const ImagePyramid *src_pyr, // correspondences by interpolating this flow field, and then refine the // correspondences themselves. This is both faster and gives better output // compared to refining the flow field at level 0 and then interpolating. - for (int level = src_pyr->n_levels - 1; level >= 1; --level) { + for (int level = n_levels - 1; level >= 1; --level) { const PyramidLayer *cur_layer = &src_pyr->layers[level]; const int cur_width = cur_layer->width; const int cur_height = cur_layer->height; @@ -762,29 +762,31 @@ static void free_flow_field(FlowField *flow) { // Following the convention in flow_estimation.h, the flow vectors are computed // at fixed points in `src` and point to the corresponding locations in `ref`, // regardless of the temporal ordering of the frames. 
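The entry point below is normally reached through aom_compute_global_motion(); as a rough, hypothetical caller sketch (the helper name is invented, flow_estimation.h and corner_detect.h are assumed to be included, and the caller must already have pointed model->inliers at storage for 2 * MAX_CORNERS ints):

static bool estimate_rotzoom_sketch(YV12_BUFFER_CONFIG *src_frame,
                                    YV12_BUFFER_CONFIG *ref_frame,
                                    int bit_depth, MotionModel *model) {
  // On success, model->params holds the fitted parameters and
  // model->num_inliers the number of supporting correspondences.
  bool mem_alloc_failed = false;
  return aom_compute_global_motion(ROTZOOM, src_frame, ref_frame, bit_depth,
                                   GLOBAL_MOTION_METHOD_DISFLOW,
                                   /*downsample_level=*/0, model,
                                   /*num_motion_models=*/1, &mem_alloc_failed);
}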
-bool av1_compute_global_motion_disflow(TransformationType type, - YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *ref, int bit_depth, - MotionModel *motion_models, - int num_motion_models, - bool *mem_alloc_failed) { +bool av1_compute_global_motion_disflow( + TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed) { // Precompute information we will need about each frame ImagePyramid *src_pyramid = src->y_pyramid; CornerList *src_corners = src->corners; ImagePyramid *ref_pyramid = ref->y_pyramid; - if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) { - *mem_alloc_failed = true; - return false; - } - if (!av1_compute_corner_list(src_pyramid, src_corners)) { + + const int src_layers = + aom_compute_pyramid(src, bit_depth, DISFLOW_PYRAMID_LEVELS, src_pyramid); + const int ref_layers = + aom_compute_pyramid(ref, bit_depth, DISFLOW_PYRAMID_LEVELS, ref_pyramid); + + if (src_layers < 0 || ref_layers < 0) { *mem_alloc_failed = true; return false; } - if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) { + if (!av1_compute_corner_list(src, bit_depth, downsample_level, src_corners)) { *mem_alloc_failed = true; return false; } + assert(src_layers == ref_layers); + const int src_width = src_pyramid->layers[0].width; const int src_height = src_pyramid->layers[0].height; assert(ref_pyramid->layers[0].width == src_width); @@ -796,7 +798,7 @@ bool av1_compute_global_motion_disflow(TransformationType type, return false; } - if (!compute_flow_field(src_pyramid, ref_pyramid, flow)) { + if (!compute_flow_field(src_pyramid, ref_pyramid, src_layers, flow)) { *mem_alloc_failed = true; free_flow_field(flow); return false; diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.h b/third_party/aom/aom_dsp/flow_estimation/disflow.h index ef877b638c..ac3680004d 100644 --- a/third_party/aom/aom_dsp/flow_estimation/disflow.h +++ b/third_party/aom/aom_dsp/flow_estimation/disflow.h @@ -15,7 +15,6 @@ #include <stdbool.h> #include "aom_dsp/flow_estimation/flow_estimation.h" -#include "aom_dsp/rect.h" #include "aom_scale/yv12config.h" #ifdef __cplusplus @@ -92,12 +91,10 @@ typedef struct { int stride; } FlowField; -bool av1_compute_global_motion_disflow(TransformationType type, - YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *ref, int bit_depth, - MotionModel *motion_models, - int num_motion_models, - bool *mem_alloc_failed); +bool av1_compute_global_motion_disflow( + TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus } diff --git a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c index 0f47f86f55..96624eb863 100644 --- a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c +++ b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c @@ -18,14 +18,6 @@ #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" -// For each global motion method, how many pyramid levels should we allocate? 
-// Note that this is a maximum, and fewer levels will be allocated if the frame -// is not large enough to need all of the specified levels -const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS] = { - 1, // GLOBAL_MOTION_METHOD_FEATURE_MATCH - 16, // GLOBAL_MOTION_METHOD_DISFLOW -}; - // clang-format off const double kIdentityParams[MAX_PARAMDIM] = { 0.0, 0.0, 1.0, 0.0, 0.0, 1.0 @@ -43,17 +35,17 @@ const double kIdentityParams[MAX_PARAMDIM] = { bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, GlobalMotionMethod gm_method, - MotionModel *motion_models, + int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed) { switch (gm_method) { case GLOBAL_MOTION_METHOD_FEATURE_MATCH: return av1_compute_global_motion_feature_match( - type, src, ref, bit_depth, motion_models, num_motion_models, - mem_alloc_failed); + type, src, ref, bit_depth, downsample_level, motion_models, + num_motion_models, mem_alloc_failed); case GLOBAL_MOTION_METHOD_DISFLOW: - return av1_compute_global_motion_disflow(type, src, ref, bit_depth, - motion_models, num_motion_models, - mem_alloc_failed); + return av1_compute_global_motion_disflow( + type, src, ref, bit_depth, downsample_level, motion_models, + num_motion_models, mem_alloc_failed); default: assert(0 && "Unknown global motion estimation type"); } return false; diff --git a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h index 2dfae24980..a38b03fc4e 100644 --- a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h +++ b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h @@ -61,11 +61,6 @@ typedef struct { double rx, ry; } Correspondence; -// For each global motion method, how many pyramid levels should we allocate? -// Note that this is a maximum, and fewer levels will be allocated if the frame -// is not large enough to need all of the specified levels -extern const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS]; - // Which global motion method should we use in practice? // Disflow is both faster and gives better results than feature matching in // practically all cases, so we use disflow by default @@ -85,7 +80,7 @@ extern const double kIdentityParams[MAX_PARAMDIM]; bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, GlobalMotionMethod gm_method, - MotionModel *motion_models, + int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus diff --git a/third_party/aom/aom_dsp/flow_estimation/ransac.c b/third_party/aom/aom_dsp/flow_estimation/ransac.c index b88a07b023..7c7bebdda4 100644 --- a/third_party/aom/aom_dsp/flow_estimation/ransac.c +++ b/third_party/aom/aom_dsp/flow_estimation/ransac.c @@ -29,8 +29,13 @@ #define INLIER_THRESHOLD 1.25 #define INLIER_THRESHOLD_SQUARED (INLIER_THRESHOLD * INLIER_THRESHOLD) + +// Number of initial models to generate #define NUM_TRIALS 20 +// Number of times to refine the best model found +#define NUM_REFINES 5 + // Flag to enable functions for finding TRANSLATION type models. // // These modes are not considered currently due to a spec bug (see comments @@ -39,63 +44,110 @@ // but disabled, for completeness. 
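// A worked note on the thresholds defined above and used by the score_*
// functions below: with INLIER_THRESHOLD = 1.25, a correspondence counts as
// an inlier when its squared reprojection error dx * dx + dy * dy is below
// 1.25 * 1.25 = 1.5625, i.e. the error is under 1.25 pixels. Comparing
// squared values avoids taking a sqrt for every correspondence of every
// candidate model.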
#define ALLOW_TRANSLATION_MODELS 0 +typedef struct { + int num_inliers; + double sse; // Sum of squared errors of inliers + int *inlier_indices; +} RANSAC_MOTION; + //////////////////////////////////////////////////////////////////////////////// // ransac -typedef bool (*IsDegenerateFunc)(double *p); -typedef bool (*FindTransformationFunc)(int points, const double *points1, - const double *points2, double *params); -typedef void (*ProjectPointsFunc)(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj); +typedef bool (*FindTransformationFunc)(const Correspondence *points, + const int *indices, int num_indices, + double *params); +typedef void (*ScoreModelFunc)(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model); // vtable-like structure which stores all of the information needed by RANSAC // for a particular model type typedef struct { - IsDegenerateFunc is_degenerate; FindTransformationFunc find_transformation; - ProjectPointsFunc project_points; + ScoreModelFunc score_model; + + // The minimum number of points which can be passed to find_transformation + // to generate a model. + // + // This should be set as small as possible. This is due to an observation + // from section 4 of "Optimal Ransac" by A. Hast, J. Nysjö and + // A. Marchetti (https://dspace5.zcu.cz/bitstream/11025/6869/1/Hast.pdf): + // using the minimum possible number of points in the initial model maximizes + // the chance that all of the selected points are inliers. + // + // That paper proposes a method which can deal with models which are + // contaminated by outliers, which helps in cases where the inlier fraction + // is low. However, for our purposes, global motion only gives significant + // gains when the inlier fraction is high. + // + // So we do not use the method from this paper, but we do find that + // minimizing the number of points used for initial model fitting helps + // make the best use of the limited number of models we consider. 
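  // As a worked count for the models below: each correspondence contributes
  // two equations (one for x, one for y), so a translation model (2 unknowns)
  // needs 1 point, a rotzoom model (4 unknowns) needs 2 points, and an affine
  // model (6 unknowns) needs 3 points; these are the minpts values listed in
  // ransac_model_info further down.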
int minpts; } RansacModelInfo; #if ALLOW_TRANSLATION_MODELS -static void project_points_translation(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj) { - int i; - for (i = 0; i < n; ++i) { - const double x = *(points++), y = *(points++); - *(proj++) = x + mat[0]; - *(proj++) = y + mat[1]; - points += stride_points - 2; - proj += stride_proj - 2; +static void score_translation(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model) { + model->num_inliers = 0; + model->sse = 0.0; + + for (int i = 0; i < num_points; ++i) { + const double x1 = points[i].x; + const double y1 = points[i].y; + const double x2 = points[i].rx; + const double y2 = points[i].ry; + + const double proj_x = x1 + mat[0]; + const double proj_y = y1 + mat[1]; + + const double dx = proj_x - x2; + const double dy = proj_y - y2; + const double sse = dx * dx + dy * dy; + + if (sse < INLIER_THRESHOLD_SQUARED) { + model->inlier_indices[model->num_inliers++] = i; + model->sse += sse; + } } } #endif // ALLOW_TRANSLATION_MODELS -static void project_points_affine(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj) { - int i; - for (i = 0; i < n; ++i) { - const double x = *(points++), y = *(points++); - *(proj++) = mat[2] * x + mat[3] * y + mat[0]; - *(proj++) = mat[4] * x + mat[5] * y + mat[1]; - points += stride_points - 2; - proj += stride_proj - 2; +static void score_affine(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model) { + model->num_inliers = 0; + model->sse = 0.0; + + for (int i = 0; i < num_points; ++i) { + const double x1 = points[i].x; + const double y1 = points[i].y; + const double x2 = points[i].rx; + const double y2 = points[i].ry; + + const double proj_x = mat[2] * x1 + mat[3] * y1 + mat[0]; + const double proj_y = mat[4] * x1 + mat[5] * y1 + mat[1]; + + const double dx = proj_x - x2; + const double dy = proj_y - y2; + const double sse = dx * dx + dy * dy; + + if (sse < INLIER_THRESHOLD_SQUARED) { + model->inlier_indices[model->num_inliers++] = i; + model->sse += sse; + } } } #if ALLOW_TRANSLATION_MODELS -static bool find_translation(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_translation(const Correspondence *points, const int *indices, + int num_indices, double *params) { double sumx = 0; double sumy = 0; - for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = *(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy = points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; sumx += dx - sx; sumy += dy - sy; @@ -111,8 +163,8 @@ static bool find_translation(int np, const double *pts1, const double *pts2, } #endif // ALLOW_TRANSLATION_MODELS -static bool find_rotzoom(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_rotzoom(const Correspondence *points, const int *indices, + int num_indices, double *params) { const int n = 4; // Size of least-squares problem double mat[4 * 4]; // Accumulator for A'A double y[4]; // Accumulator for A'b @@ -120,11 +172,12 @@ static bool find_rotzoom(int np, const double *pts1, const double *pts2, double b; // Single element of b least_squares_init(mat, y, n); - for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = 
*(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy = points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; a[0] = 1; a[1] = 0; @@ -153,8 +206,8 @@ static bool find_rotzoom(int np, const double *pts1, const double *pts2, return true; } -static bool find_affine(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_affine(const Correspondence *points, const int *indices, + int num_indices, double *params) { // Note: The least squares problem for affine models is 6-dimensional, // but it splits into two independent 3-dimensional subproblems. // Solving these two subproblems separately and recombining at the end @@ -174,11 +227,12 @@ static bool find_affine(int np, const double *pts1, const double *pts2, least_squares_init(mat[0], y[0], n); least_squares_init(mat[1], y[1], n); - for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = *(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy = points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; a[0][0] = 1; a[0][1] = sx; @@ -211,12 +265,6 @@ static bool find_affine(int np, const double *pts1, const double *pts2, return true; } -typedef struct { - int num_inliers; - double sse; // Sum of squared errors of inliers - int *inlier_indices; -} RANSAC_MOTION; - // Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise. static int compare_motions(const void *arg_a, const void *arg_b) { const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a; @@ -234,15 +282,6 @@ static bool is_better_motion(const RANSAC_MOTION *motion_a, return compare_motions(motion_a, motion_b) < 0; } -static void copy_points_at_indices(double *dest, const double *src, - const int *indices, int num_points) { - for (int i = 0; i < num_points; ++i) { - const int index = indices[i]; - dest[i * 2] = src[index * 2]; - dest[i * 2 + 1] = src[index * 2 + 1]; - } -} - // Returns true on success, false on error static bool ransac_internal(const Correspondence *matched_points, int npoints, MotionModel *motion_models, int num_desired_motions, @@ -257,10 +296,6 @@ static bool ransac_internal(const Correspondence *matched_points, int npoints, int indices[MAX_MINPTS] = { 0 }; - double *points1, *points2; - double *corners1, *corners2; - double *projected_corners; - // Store information for the num_desired_motions best transformations found // and the worst motion among them, as well as the motion currently under // consideration. @@ -271,18 +306,19 @@ static bool ransac_internal(const Correspondence *matched_points, int npoints, // currently under consideration. 
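  // In outline, the code below: pre-initializes the outputs to the identity
  // model as a fallback, then for each of NUM_TRIALS trials picks minpts
  // random correspondences, fits a candidate model with find_transformation(),
  // scores it against all npoints correspondences with score_model(), rejects
  // it if it has fewer than min_inliers inliers, and otherwise keeps it if it
  // beats the worst of the num_desired_motions models retained so far. The
  // kept models are then sorted and each is refit on its own inliers (up to
  // NUM_REFINES times) before being written out.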
double params_this_motion[MAX_PARAMDIM]; + // Initialize output models, as a fallback in case we can't find a model + for (i = 0; i < num_desired_motions; i++) { + memcpy(motion_models[i].params, kIdentityParams, + MAX_PARAMDIM * sizeof(*(motion_models[i].params))); + motion_models[i].num_inliers = 0; + } + if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) { return false; } int min_inliers = AOMMAX((int)(MIN_INLIER_PROB * npoints), minpts); - points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2); - points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2); - corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2); - corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2); - projected_corners = - (double *)aom_malloc(sizeof(*projected_corners) * npoints * 2); motions = (RANSAC_MOTION *)aom_calloc(num_desired_motions, sizeof(RANSAC_MOTION)); @@ -295,8 +331,7 @@ static bool ransac_internal(const Correspondence *matched_points, int npoints, int *inlier_buffer = (int *)aom_malloc(sizeof(*inlier_buffer) * npoints * (num_desired_motions + 1)); - if (!(points1 && points2 && corners1 && corners2 && projected_corners && - motions && inlier_buffer)) { + if (!(motions && inlier_buffer)) { ret_val = false; *mem_alloc_failed = true; goto finish_ransac; @@ -311,50 +346,22 @@ static bool ransac_internal(const Correspondence *matched_points, int npoints, memset(¤t_motion, 0, sizeof(current_motion)); current_motion.inlier_indices = inlier_buffer + num_desired_motions * npoints; - for (i = 0; i < npoints; ++i) { - corners1[2 * i + 0] = matched_points[i].x; - corners1[2 * i + 1] = matched_points[i].y; - corners2[2 * i + 0] = matched_points[i].rx; - corners2[2 * i + 1] = matched_points[i].ry; - } - for (int trial_count = 0; trial_count < NUM_TRIALS; trial_count++) { lcg_pick(npoints, minpts, indices, &seed); - copy_points_at_indices(points1, corners1, indices, minpts); - copy_points_at_indices(points2, corners2, indices, minpts); - - if (model_info->is_degenerate(points1)) { - continue; - } - - if (!model_info->find_transformation(minpts, points1, points2, + if (!model_info->find_transformation(matched_points, indices, minpts, params_this_motion)) { continue; } - model_info->project_points(params_this_motion, corners1, projected_corners, - npoints, 2, 2); - - current_motion.num_inliers = 0; - double sse = 0.0; - for (i = 0; i < npoints; ++i) { - double dx = projected_corners[i * 2] - corners2[i * 2]; - double dy = projected_corners[i * 2 + 1] - corners2[i * 2 + 1]; - double squared_error = dx * dx + dy * dy; - - if (squared_error < INLIER_THRESHOLD_SQUARED) { - current_motion.inlier_indices[current_motion.num_inliers++] = i; - sse += squared_error; - } - } + model_info->score_model(params_this_motion, matched_points, npoints, + ¤t_motion); if (current_motion.num_inliers < min_inliers) { // Reject models with too few inliers continue; } - current_motion.sse = sse; if (is_better_motion(¤t_motion, worst_kept_motion)) { // This motion is better than the worst currently kept motion. Remember // the inlier points and sse. The parameters for each kept motion @@ -386,86 +393,98 @@ static bool ransac_internal(const Correspondence *matched_points, int npoints, // Sort the motions, best first. qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions); - // Recompute the motions using only the inliers. + // Refine each of the best N models using iterative estimation. 
+ // + // The idea here is loosely based on the iterative method from + // "Locally Optimized RANSAC" by O. Chum, J. Matas and Josef Kittler: + // https://cmp.felk.cvut.cz/ftp/articles/matas/chum-dagm03.pdf + // + // However, we implement a simpler version than their proposal, and simply + // refit the model repeatedly until the number of inliers stops increasing, + // with a cap on the number of iterations to defend against edge cases which + // only improve very slowly. for (i = 0; i < num_desired_motions; ++i) { - int num_inliers = motions[i].num_inliers; - if (num_inliers > 0) { - assert(num_inliers >= minpts); - - copy_points_at_indices(points1, corners1, motions[i].inlier_indices, - num_inliers); - copy_points_at_indices(points2, corners2, motions[i].inlier_indices, - num_inliers); - - if (!model_info->find_transformation(num_inliers, points1, points2, - motion_models[i].params)) { - // In the unlikely event that this model fitting fails, - // we don't have a good fallback. So just clear the output - // model and move on - memcpy(motion_models[i].params, kIdentityParams, - MAX_PARAMDIM * sizeof(*(motion_models[i].params))); - motion_models[i].num_inliers = 0; - continue; + if (motions[i].num_inliers <= 0) { + // Output model has already been initialized to the identity model, + // so just skip setup + continue; + } + + bool bad_model = false; + for (int refine_count = 0; refine_count < NUM_REFINES; refine_count++) { + int num_inliers = motions[i].num_inliers; + assert(num_inliers >= min_inliers); + + if (!model_info->find_transformation(matched_points, + motions[i].inlier_indices, + num_inliers, params_this_motion)) { + // In the unlikely event that this model fitting fails, we don't have a + // good fallback. So leave this model set to the identity model + bad_model = true; + break; } - // Populate inliers array - for (int j = 0; j < num_inliers; j++) { - int index = motions[i].inlier_indices[j]; - const Correspondence *corr = &matched_points[index]; - motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x); - motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y); + // Score the newly generated model + model_info->score_model(params_this_motion, matched_points, npoints, + ¤t_motion); + + // At this point, there are three possibilities: + // 1) If we found more inliers, keep refining. + // 2) If we found the same number of inliers but a lower SSE, we want to + // keep the new model, but further refinement is unlikely to gain much. + // So commit to this new model + // 3) It is possible, but very unlikely, that the new model will have + // fewer inliers. If it does happen, we probably just lost a few + // borderline inliers. So treat the same as case (2). + if (current_motion.num_inliers > motions[i].num_inliers) { + motions[i].num_inliers = current_motion.num_inliers; + motions[i].sse = current_motion.sse; + int *tmp = motions[i].inlier_indices; + motions[i].inlier_indices = current_motion.inlier_indices; + current_motion.inlier_indices = tmp; + } else { + // Refined model is no better, so stop + // This shouldn't be significantly worse than the previous model, + // so it's fine to use the parameters in params_this_motion. + // This saves us from having to cache the previous iteration's params. 
+ break; } - motion_models[i].num_inliers = num_inliers; - } else { - memcpy(motion_models[i].params, kIdentityParams, - MAX_PARAMDIM * sizeof(*(motion_models[i].params))); - motion_models[i].num_inliers = 0; } + + if (bad_model) continue; + + // Fill in output struct + memcpy(motion_models[i].params, params_this_motion, + MAX_PARAMDIM * sizeof(*motion_models[i].params)); + for (int j = 0; j < motions[i].num_inliers; j++) { + int index = motions[i].inlier_indices[j]; + const Correspondence *corr = &matched_points[index]; + motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x); + motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y); + } + motion_models[i].num_inliers = motions[i].num_inliers; } finish_ransac: aom_free(inlier_buffer); aom_free(motions); - aom_free(projected_corners); - aom_free(corners2); - aom_free(corners1); - aom_free(points2); - aom_free(points1); return ret_val; } -static bool is_collinear3(double *p1, double *p2, double *p3) { - static const double collinear_eps = 1e-3; - const double v = - (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]); - return fabs(v) < collinear_eps; -} - -#if ALLOW_TRANSLATION_MODELS -static bool is_degenerate_translation(double *p) { - return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2; -} -#endif // ALLOW_TRANSLATION_MODELS - -static bool is_degenerate_affine(double *p) { - return is_collinear3(p, p + 2, p + 4); -} - static const RansacModelInfo ransac_model_info[TRANS_TYPES] = { // IDENTITY - { NULL, NULL, NULL, 0 }, + { NULL, NULL, 0 }, // TRANSLATION #if ALLOW_TRANSLATION_MODELS - { is_degenerate_translation, find_translation, project_points_translation, - 3 }, + { find_translation, score_translation, 1 }, #else - { NULL, NULL, NULL, 0 }, + { NULL, NULL, 0 }, #endif // ROTZOOM - { is_degenerate_affine, find_rotzoom, project_points_affine, 3 }, + { find_rotzoom, score_affine, 2 }, // AFFINE - { is_degenerate_affine, find_affine, project_points_affine, 3 }, + { find_affine, score_affine, 3 }, }; // Returns true on success, false on error diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c index 87c76fa13b..ff69ae75f5 100644 --- a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c +++ b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c @@ -17,64 +17,112 @@ #include "aom_ports/mem.h" #include "aom_dsp/flow_estimation/corner_match.h" -DECLARE_ALIGNED(16, static const uint8_t, - byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0, 0, 0 }; -#if MATCH_SZ != 13 -#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +DECLARE_ALIGNED(32, static const uint16_t, ones_array[16]) = { 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1 }; + +#if MATCH_SZ != 16 +#error "Need to apply pixel mask in corner_match_avx2.c if MATCH_SZ != 16" #endif -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the -correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows -of each image, centered at (x1, y1) and (x2, y2) respectively. +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). 
+ Store results into *mean and *one_over_stddev + + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * <true mean> and + *one_over_stddev = 1 / (MATCH_SZ * <true stddev>) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. */ -double av1_compute_cross_correlation_avx2(const unsigned char *frame1, - int stride1, int x1, int y1, - const unsigned char *frame2, - int stride2, int x2, int y2) { - int i, stride1_i = 0, stride2_i = 0; - __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1; - const __m128i mask = _mm_load_si128((__m128i *)byte_mask); - const __m256i zero = _mm256_setzero_si256(); - __m128i v1, v2; - - sum_vec = zero; - sumsq2_vec = zero; - cross_vec = zero; +bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, + int y, double *mean, + double *one_over_stddev) { + __m256i sum_vec = _mm256_setzero_si256(); + __m256i sumsq_vec = _mm256_setzero_si256(); + + frame += (y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2); + + for (int i = 0; i < MATCH_SZ; ++i) { + const __m256i v = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame)); + + sum_vec = _mm256_add_epi16(sum_vec, v); + sumsq_vec = _mm256_add_epi32(sumsq_vec, _mm256_madd_epi16(v, v)); + + frame += stride; + } + + // Reduce sum_vec and sumsq_vec into single values + // Start by reducing each vector to 8x32-bit values, hadd() to perform 8 + // additions, sum vertically to do 4 more, then the last 2 in scalar code. + const __m256i ones = _mm256_load_si256((__m256i *)ones_array); + const __m256i partial_sum = _mm256_madd_epi16(sum_vec, ones); + const __m256i tmp_8x32 = _mm256_hadd_epi32(partial_sum, sumsq_vec); + const __m128i tmp_4x32 = _mm_add_epi32(_mm256_extracti128_si256(tmp_8x32, 0), + _mm256_extracti128_si256(tmp_8x32, 1)); + const int sum = + _mm_extract_epi32(tmp_4x32, 0) + _mm_extract_epi32(tmp_4x32, 1); + const int sumsq = + _mm_extract_epi32(tmp_4x32, 2) + _mm_extract_epi32(tmp_4x32, 3); + + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; +} + +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. 
+*/ +double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { + __m256i cross_vec = _mm256_setzero_si256(); frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); - for (i = 0; i < MATCH_SZ; ++i) { - v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[stride1_i]), mask); - v1_1 = _mm256_cvtepu8_epi16(v1); - v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[stride2_i]), mask); - v2_1 = _mm256_cvtepu8_epi16(v2); + for (int i = 0; i < MATCH_SZ; ++i) { + const __m256i v1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame1)); + const __m256i v2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame2)); - v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1); - sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1)); + cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1, v2)); - sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero)); - cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1)); - stride1_i += stride1; - stride2_i += stride2; + frame1 += stride1; + frame2 += stride2; } - __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8); - sum_vec = _mm256_add_epi32(sum_vec, sum_vec1); - int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec)); - int sum2_acc = _mm256_extract_epi32(sum_vec, 4); - - __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec); - __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec); - temp1 = _mm256_add_epi32(unp_low, unp_hig); - - __m128i low_sumsq = _mm256_castsi256_si128(temp1); - low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1)); - low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32)); - int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq); - int cross_acc = _mm_extract_epi32(low_sumsq, 2); - - int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc; - int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc; - return cov / sqrt((double)var2); + + // Sum cross_vec into a single value + const __m128i tmp = _mm_add_epi32(_mm256_extracti128_si256(cross_vec, 0), + _mm256_extracti128_si256(cross_vec, 1)); + const int cross = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1) + + _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); + + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. 
we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + const double covariance = cross - mean1 * mean2; + const double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; } diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c index b3cb5bc5fd..bff7db6d2f 100644 --- a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c +++ b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c @@ -21,84 +21,125 @@ #include "aom_ports/mem.h" #include "aom_dsp/flow_estimation/corner_match.h" -DECLARE_ALIGNED(16, static const uint8_t, - byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0, 0, 0 }; -#if MATCH_SZ != 13 -#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +DECLARE_ALIGNED(16, static const uint16_t, ones_array[8]) = { 1, 1, 1, 1, + 1, 1, 1, 1 }; + +#if MATCH_SZ != 16 +#error "Need to apply pixel mask in corner_match_sse4.c if MATCH_SZ != 16" #endif -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the - correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows - of each image, centered at (x1, y1) and (x2, y2) respectively. +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). + Store results into *mean and *one_over_stddev + + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * <true mean> and + *one_over_stddev = 1 / (MATCH_SZ * <true stddev>) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. +*/ +bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, + int x, int y, double *mean, + double *one_over_stddev) { + // 8 16-bit partial sums of pixels + // Each lane sums at most 2*MATCH_SZ pixels, which can have values up to 255, + // and is therefore at most 2*MATCH_SZ*255, which is > 2^8 but < 2^16. + // Thus this value is safe to store in 16 bits. + __m128i sum_vec = _mm_setzero_si128(); + + // 8 32-bit partial sums of squares + __m128i sumsq_vec_l = _mm_setzero_si128(); + __m128i sumsq_vec_r = _mm_setzero_si128(); + + frame += (y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2); + + for (int i = 0; i < MATCH_SZ; ++i) { + const __m128i v = _mm_loadu_si128((__m128i *)frame); + const __m128i v_l = _mm_cvtepu8_epi16(v); + const __m128i v_r = _mm_cvtepu8_epi16(_mm_srli_si128(v, 8)); + + sum_vec = _mm_add_epi16(sum_vec, _mm_add_epi16(v_l, v_r)); + sumsq_vec_l = _mm_add_epi32(sumsq_vec_l, _mm_madd_epi16(v_l, v_l)); + sumsq_vec_r = _mm_add_epi32(sumsq_vec_r, _mm_madd_epi16(v_r, v_r)); + + frame += stride; + } + + // Reduce sum_vec and sumsq_vec into single values + // Start by reducing each vector to 4x32-bit values, hadd() to perform four + // additions, then perform the last two additions in scalar code. 
+ const __m128i ones = _mm_load_si128((__m128i *)ones_array); + const __m128i partial_sum = _mm_madd_epi16(sum_vec, ones); + const __m128i partial_sumsq = _mm_add_epi32(sumsq_vec_l, sumsq_vec_r); + const __m128i tmp = _mm_hadd_epi32(partial_sum, partial_sumsq); + const int sum = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1); + const int sumsq = _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); + + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; +} + +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. */ -double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, - int stride1, int x1, int y1, - const unsigned char *frame2, - int stride2, int x2, int y2) { - int i; - // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0, - // 2) - __m128i sum1_vec = _mm_setzero_si128(); - __m128i sum2_vec = _mm_setzero_si128(); - // 4 32-bit partial sums of squares - __m128i sumsq2_vec = _mm_setzero_si128(); - __m128i cross_vec = _mm_setzero_si128(); - - const __m128i mask = _mm_load_si128((__m128i *)byte_mask); - const __m128i zero = _mm_setzero_si128(); +double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { + // 8 32-bit partial sums of products + __m128i cross_vec_l = _mm_setzero_si128(); + __m128i cross_vec_r = _mm_setzero_si128(); frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); - for (i = 0; i < MATCH_SZ; ++i) { - const __m128i v1 = - _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[i * stride1]), mask); - const __m128i v2 = - _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[i * stride2]), mask); - - // Using the 'sad' intrinsic here is a bit faster than adding - // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit - // conversion step later, for a net speedup of ~10% - sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero)); - sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero)); + for (int i = 0; i < MATCH_SZ; ++i) { + const __m128i v1 = _mm_loadu_si128((__m128i *)frame1); + const __m128i v2 = _mm_loadu_si128((__m128i *)frame2); const __m128i v1_l = _mm_cvtepu8_epi16(v1); const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8)); const __m128i v2_l = _mm_cvtepu8_epi16(v2); const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8)); - sumsq2_vec = _mm_add_epi32( - sumsq2_vec, - _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r))); - cross_vec = _mm_add_epi32( - cross_vec, - _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r))); + cross_vec_l = _mm_add_epi32(cross_vec_l, _mm_madd_epi16(v1_l, v2_l)); + cross_vec_r = _mm_add_epi32(cross_vec_r, _mm_madd_epi16(v1_r, v2_r)); + + frame1 += stride1; + frame2 += stride2; } - // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec, - // cross_vec) - // as holding 4 32-bit elements each, which we want to sum horizontally. - // We do this by transposing and then summing vertically. 
- __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec); - __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec); - __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec); - __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec); - - __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2); - __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2); - __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3); - __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3); - - __m128i res = - _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7)); - - int sum1 = _mm_extract_epi32(res, 0); - int sum2 = _mm_extract_epi32(res, 1); - int sumsq2 = _mm_extract_epi32(res, 2); - int cross = _mm_extract_epi32(res, 3); - - int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; - int cov = cross * MATCH_SZ_SQ - sum1 * sum2; - return cov / sqrt((double)var2); + // Sum cross_vec into a single value + const __m128i tmp = _mm_add_epi32(cross_vec_l, cross_vec_r); + const int cross = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1) + + _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); + + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + const double covariance = cross - mean1 * mean2; + const double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; } diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c new file mode 100644 index 0000000000..ad5a1bd7c6 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <math.h> +#include <immintrin.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/flow_estimation/disflow.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +#include "config/aom_dsp_rtcd.h" + +#if DISFLOW_PATCH_SIZE != 8 +#error "Need to change disflow_avx2.c if DISFLOW_PATCH_SIZE != 8" +#endif + +// Compute horizontal and vertical kernels and return them packed into a +// register. 
The coefficient ordering is: +// h0, h1, v0, v1, h2, h3, v2, v3 +// This is chosen because it takes less work than fully separating the kernels, +// but it is separated enough that we can pick out each coefficient pair in the +// main compute_flow_at_point function +static INLINE __m128i compute_cubic_kernels(double u, double v) { + const __m128d x = _mm_set_pd(v, u); + + const __m128d x2 = _mm_mul_pd(x, x); + const __m128d x3 = _mm_mul_pd(x2, x); + + // Macro to multiply a value v by a constant coefficient c +#define MULC(c, v) _mm_mul_pd(_mm_set1_pd(c), v) + + // Compute floating-point kernel + // Note: To ensure results are bit-identical to the C code, we need to perform + // exactly the same sequence of operations here as in the C code. + __m128d k0 = _mm_sub_pd(_mm_add_pd(MULC(-0.5, x), x2), MULC(0.5, x3)); + __m128d k1 = + _mm_add_pd(_mm_sub_pd(_mm_set1_pd(1.0), MULC(2.5, x2)), MULC(1.5, x3)); + __m128d k2 = + _mm_sub_pd(_mm_add_pd(MULC(0.5, x), MULC(2.0, x2)), MULC(1.5, x3)); + __m128d k3 = _mm_add_pd(MULC(-0.5, x2), MULC(0.5, x3)); +#undef MULC + + // Integerize + __m128d prec = _mm_set1_pd((double)(1 << DISFLOW_INTERP_BITS)); + + k0 = _mm_round_pd(_mm_mul_pd(k0, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k1 = _mm_round_pd(_mm_mul_pd(k1, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k2 = _mm_round_pd(_mm_mul_pd(k2, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k3 = _mm_round_pd(_mm_mul_pd(k3, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + const __m128i c0 = _mm_cvtpd_epi32(k0); + const __m128i c1 = _mm_cvtpd_epi32(k1); + const __m128i c2 = _mm_cvtpd_epi32(k2); + const __m128i c3 = _mm_cvtpd_epi32(k3); + + // Rearrange results and convert down to 16 bits, giving the target output + // ordering + const __m128i c01 = _mm_unpacklo_epi32(c0, c1); + const __m128i c23 = _mm_unpacklo_epi32(c2, c3); + return _mm_packs_epi32(c01, c23); +} + +// Compare two regions of width x height pixels, one rooted at position +// (x, y) in src and the other at (x + u, y + v) in ref. +// This function returns the sum of squared pixel differences between +// the two regions. +// +// TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation +// instad of bicubic interpolation +static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, + int width, int height, int stride, int x, + int y, double u, double v, + const int16_t *dx, const int16_t *dy, + int *b) { + const __m256i zero = _mm256_setzero_si256(); + + // Accumulate 8 32-bit partial sums for each element of b + // These will be flattened at the end. + __m256i b0_acc = _mm256_setzero_si256(); + __m256i b1_acc = _mm256_setzero_si256(); + + // Split offset into integer and fractional parts, and compute cubic + // interpolation kernels + const int u_int = (int)floor(u); + const int v_int = (int)floor(v); + const double u_frac = u - floor(u); + const double v_frac = v - floor(v); + + const __m128i kernels = compute_cubic_kernels(u_frac, v_frac); + + // Storage for intermediate values between the two convolution directions + // In the AVX2 implementation, this needs a dummy row at the end, because + // we generate 2 rows at a time but the total number of rows is odd. + // So we generate one more row than we actually need. 
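  // Illustration only (not part of this patch): the row bookkeeping behind the
  // "+ 4" in the declaration below, assuming DISFLOW_PATCH_SIZE == 8 (enforced
  // by the #error at the top of this file) and the 4-tap cubic kernel. The
  // local names here are hypothetical.
  //   rows needed by the vertical pass : -1 .. +9   (DISFLOW_PATCH_SIZE + 3)
  //   rows written two at a time       : -1 .. +10  (DISFLOW_PATCH_SIZE + 4)
  {
    const int taps = 4;
    const int rows_needed = DISFLOW_PATCH_SIZE + taps - 1;  // 11
    const int rows_written = 2 * ((rows_needed + 1) / 2);   // 12, one spare row
    assert(rows_written == DISFLOW_PATCH_SIZE + 4);
    (void)rows_written;
  }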
+ DECLARE_ALIGNED(32, int16_t, + tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 4)]); + int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row + + // Clamp coordinates so that all pixels we fetch will remain within the + // allocated border region, but allow them to go far enough out that + // the border pixels' values do not change. + // Since we are calculating an 8x8 block, the bottom-right pixel + // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic + // interpolation has 4 taps, meaning that the output of pixel + // (x_w, y_w) depends on the pixels in the range + // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). + // + // Thus the most extreme coordinates which will be fetched are + // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). + const int x0 = clamp(x + u_int, -9, width); + const int y0 = clamp(y + v_int, -9, height); + + // Horizontal convolution + + // Prepare the kernel vectors + // We split the kernel into two vectors with kernel indices: + // 0, 1, 0, 1, 0, 1, 0, 1, and + // 2, 3, 2, 3, 2, 3, 2, 3 + __m256i h_kernel_01 = _mm256_broadcastd_epi32(kernels); + __m256i h_kernel_23 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 8)); + + __m256i round_const_h = _mm256_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1)); + + for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; i += 2) { + const int y_w = y0 + i; + const uint8_t *ref_row = &ref[y_w * stride + (x0 - 1)]; + int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; + + // Load this row of pixels. + // For an 8x8 patch, we need to load the 8 image pixels + 3 extras, + // for a total of 11 pixels. Here we load 16 pixels, but only use + // the first 11. + __m256i row = + yy_loadu2_128((__m128i *)(ref_row + stride), (__m128i *)ref_row); + + // Expand pixels to int16s + // We must use unpacks here, as we have one row in each 128-bit lane + // and want to handle each of those independently. + // This is in contrast to _mm256_cvtepu8_epi16(), which takes a single + // 128-bit input and widens it to 256 bits. + __m256i px_0to7_i16 = _mm256_unpacklo_epi8(row, zero); + __m256i px_4to10_i16 = + _mm256_unpacklo_epi8(_mm256_srli_si256(row, 4), zero); + + // Compute first four outputs + // input pixels 0, 1, 1, 2, 2, 3, 3, 4 + // * kernel 0, 1, 0, 1, 0, 1, 0, 1 + __m256i px0 = + _mm256_unpacklo_epi16(px_0to7_i16, _mm256_srli_si256(px_0to7_i16, 2)); + // input pixels 2, 3, 3, 4, 4, 5, 5, 6 + // * kernel 2, 3, 2, 3, 2, 3, 2, 3 + __m256i px1 = _mm256_unpacklo_epi16(_mm256_srli_si256(px_0to7_i16, 4), + _mm256_srli_si256(px_0to7_i16, 6)); + // Convolve with kernel and sum 2x2 boxes to form first 4 outputs + __m256i sum0 = _mm256_add_epi32(_mm256_madd_epi16(px0, h_kernel_01), + _mm256_madd_epi16(px1, h_kernel_23)); + + __m256i out0 = _mm256_srai_epi32(_mm256_add_epi32(sum0, round_const_h), + DISFLOW_INTERP_BITS - 6); + + // Compute second four outputs + __m256i px2 = + _mm256_unpacklo_epi16(px_4to10_i16, _mm256_srli_si256(px_4to10_i16, 2)); + __m256i px3 = _mm256_unpacklo_epi16(_mm256_srli_si256(px_4to10_i16, 4), + _mm256_srli_si256(px_4to10_i16, 6)); + __m256i sum1 = _mm256_add_epi32(_mm256_madd_epi16(px2, h_kernel_01), + _mm256_madd_epi16(px3, h_kernel_23)); + + // Round by just enough bits that the result is + // guaranteed to fit into an i16. Then the next stage can use 16 x 16 -> 32 + // bit multiplies, which should be a fair bit faster than 32 x 32 -> 32 + // as it does now + // This means shifting down so we have 6 extra bits, for a maximum value + // of +18360, which can occur if u_frac == 0.5 and the input pixels are + // {0, 255, 255, 0}. 
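  // Illustration only (not part of this patch): where the +18360 bound in the
  // comment above comes from. At u_frac == 0.5 the cubic kernel is
  // { -0.0625, 0.5625, 0.5625, -0.0625 }, whose positive taps sum to 1.125,
  // so the worst-case input {0, 255, 255, 0} gives 255 * 1.125 = 286.875.
  {
    const double worst = 255 * (0.5625 + 0.5625);  // 286.875
    assert((int)(worst * (1 << 6)) == 18360);      // 6 extra bits: fits int16_t
    assert((int)(worst * (1 << 7)) > 32767);       // 7 extra bits would overflow
  }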
+ __m256i out1 = _mm256_srai_epi32(_mm256_add_epi32(sum1, round_const_h), + DISFLOW_INTERP_BITS - 6); + + _mm256_storeu_si256((__m256i *)tmp_row, _mm256_packs_epi32(out0, out1)); + } + + // Vertical convolution + const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; + __m256i round_const_v = _mm256_set1_epi32(1 << (round_bits - 1)); + + __m256i v_kernel_01 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 4)); + __m256i v_kernel_23 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 12)); + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; + + // Load 5 rows of 8 x 16-bit values, and pack into 4 registers + // holding rows {0, 1}, {1, 2}, {2, 3}, {3, 4} + __m128i row0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); + __m128i row1 = _mm_loadu_si128((__m128i *)tmp_row); + __m128i row2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); + __m128i row3 = + _mm_loadu_si128((__m128i *)(tmp_row + 2 * DISFLOW_PATCH_SIZE)); + __m128i row4 = + _mm_loadu_si128((__m128i *)(tmp_row + 3 * DISFLOW_PATCH_SIZE)); + + __m256i px0 = _mm256_set_m128i(row1, row0); + __m256i px1 = _mm256_set_m128i(row2, row1); + __m256i px2 = _mm256_set_m128i(row3, row2); + __m256i px3 = _mm256_set_m128i(row4, row3); + + // We want to calculate px0 * v_kernel[0] + px1 * v_kernel[1] + ... , + // but each multiply expands its output to 32 bits. So we need to be + // a little clever about how we do this + __m256i sum0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_unpacklo_epi16(px0, px1), v_kernel_01), + _mm256_madd_epi16(_mm256_unpacklo_epi16(px2, px3), v_kernel_23)); + __m256i sum1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_unpackhi_epi16(px0, px1), v_kernel_01), + _mm256_madd_epi16(_mm256_unpackhi_epi16(px2, px3), v_kernel_23)); + + __m256i sum0_rounded = + _mm256_srai_epi32(_mm256_add_epi32(sum0, round_const_v), round_bits); + __m256i sum1_rounded = + _mm256_srai_epi32(_mm256_add_epi32(sum1, round_const_v), round_bits); + + __m256i warped = _mm256_packs_epi32(sum0_rounded, sum1_rounded); + __m128i src_pixels_u8 = xx_loadu_2x64(&src[(y + i + 1) * stride + x], + &src[(y + i) * stride + x]); + __m256i src_pixels = + _mm256_slli_epi16(_mm256_cvtepu8_epi16(src_pixels_u8), 3); + + // Calculate delta from the target patch + __m256i dt = _mm256_sub_epi16(warped, src_pixels); + + // Load 2x8 elements each of dx and dt, to pair with the 2x8 elements of dt + // that we have just computed. Then compute 2x8 partial sums of dx * dt + // and dy * dt, implicitly sum to give 2x4 partial sums of each, and + // accumulate. + __m256i dx_row = _mm256_loadu_si256((__m256i *)&dx[i * DISFLOW_PATCH_SIZE]); + __m256i dy_row = _mm256_loadu_si256((__m256i *)&dy[i * DISFLOW_PATCH_SIZE]); + b0_acc = _mm256_add_epi32(b0_acc, _mm256_madd_epi16(dx_row, dt)); + b1_acc = _mm256_add_epi32(b1_acc, _mm256_madd_epi16(dy_row, dt)); + } + + // Flatten the two sets of partial sums to find the final value of b + // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc). + // We need to do 14 additions in total; a `hadd` instruction can take care + // of eight of them, then a vertical sum can do four more, leaving two + // scalar additions. 
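  // Illustration only (hypothetical helper, not part of this patch): the
  // scalar quantity the reduction below produces. Here dt stands for the
  // per-pixel delta computed above (warped - 8 * src), which the vector code
  // never materializes as an array.
  static void flow_vector_scalar_sketch(const int16_t *dx, const int16_t *dy,
                                        const int16_t *dt, int *b) {
    b[0] = 0;
    b[1] = 0;
    for (int k = 0; k < DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE; ++k) {
      b[0] += dx[k] * dt[k];  // matches the flattened sum of b0_acc
      b[1] += dy[k] * dt[k];  // matches the flattened sum of b1_acc
    }
  }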
+ __m256i partial_sum_256 = _mm256_hadd_epi32(b0_acc, b1_acc); + __m128i partial_sum = + _mm_add_epi32(_mm256_extracti128_si256(partial_sum_256, 0), + _mm256_extracti128_si256(partial_sum_256, 1)); + b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1); + b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3); +} + +// Compute the x and y gradients of the source patch in a single pass, +// and store into dx and dy respectively. +static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, + int16_t *dy) { + const __m256i zero = _mm256_setzero_si256(); + + // Loop setup: Load the first two rows (of 10 input rows) and apply + // the horizontal parts of the two filters + __m256i row_m1_0 = + yy_loadu2_128((__m128i *)(src - 1), (__m128i *)(src - src_stride - 1)); + __m256i row_m1_0_a = _mm256_unpacklo_epi8(row_m1_0, zero); + __m256i row_m1_0_b = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 1), zero); + __m256i row_m1_0_c = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 2), zero); + + __m256i row_m1_0_hsmooth = + _mm256_add_epi16(_mm256_add_epi16(row_m1_0_a, row_m1_0_c), + _mm256_slli_epi16(row_m1_0_b, 1)); + __m256i row_m1_0_hdiff = _mm256_sub_epi16(row_m1_0_a, row_m1_0_c); + + // Main loop: For each pair of output rows (i, i+1): + // * Load rows (i+1, i+2) and apply both horizontal filters + // * Apply vertical filters and store results + // * Shift rows for next iteration + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + // Load rows (i+1, i+2) and apply both horizontal filters + const __m256i row_p1_p2 = + yy_loadu2_128((__m128i *)(src + (i + 2) * src_stride - 1), + (__m128i *)(src + (i + 1) * src_stride - 1)); + const __m256i row_p1_p2_a = _mm256_unpacklo_epi8(row_p1_p2, zero); + const __m256i row_p1_p2_b = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 1), zero); + const __m256i row_p1_p2_c = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 2), zero); + + const __m256i row_p1_p2_hsmooth = + _mm256_add_epi16(_mm256_add_epi16(row_p1_p2_a, row_p1_p2_c), + _mm256_slli_epi16(row_p1_p2_b, 1)); + const __m256i row_p1_p2_hdiff = _mm256_sub_epi16(row_p1_p2_a, row_p1_p2_c); + + // Apply vertical filters and store results + // dx = vertical smooth(horizontal diff(input)) + // dy = vertical diff(horizontal smooth(input)) + const __m256i row_0_p1_hdiff = + _mm256_permute2x128_si256(row_m1_0_hdiff, row_p1_p2_hdiff, 0x21); + const __m256i dx_row = + _mm256_add_epi16(_mm256_add_epi16(row_m1_0_hdiff, row_p1_p2_hdiff), + _mm256_slli_epi16(row_0_p1_hdiff, 1)); + const __m256i dy_row = + _mm256_sub_epi16(row_m1_0_hsmooth, row_p1_p2_hsmooth); + + _mm256_storeu_si256((__m256i *)(dx + i * DISFLOW_PATCH_SIZE), dx_row); + _mm256_storeu_si256((__m256i *)(dy + i * DISFLOW_PATCH_SIZE), dy_row); + + // Shift rows for next iteration + // This allows a lot of work to be reused, reducing the number of + // horizontal filtering operations from 2*3*8 = 48 to 2*10 = 20 + row_m1_0_hsmooth = row_p1_p2_hsmooth; + row_m1_0_hdiff = row_p1_p2_hdiff; + } +} + +static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + double *M) { + __m256i acc[4] = { 0 }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + __m256i dx_row = _mm256_loadu_si256((__m256i *)&dx[i * dx_stride]); + __m256i dy_row = _mm256_loadu_si256((__m256i *)&dy[i * dy_stride]); + + acc[0] = _mm256_add_epi32(acc[0], _mm256_madd_epi16(dx_row, dx_row)); + acc[1] = _mm256_add_epi32(acc[1], _mm256_madd_epi16(dx_row, dy_row)); + 
// Don't compute acc[2], as it should be equal to acc[1] + acc[3] = _mm256_add_epi32(acc[3], _mm256_madd_epi16(dy_row, dy_row)); + } + + // Condense sums + __m256i partial_sum_0 = _mm256_hadd_epi32(acc[0], acc[1]); + __m256i partial_sum_1 = _mm256_hadd_epi32(acc[1], acc[3]); + __m256i result_256 = _mm256_hadd_epi32(partial_sum_0, partial_sum_1); + __m128i result = _mm_add_epi32(_mm256_extracti128_si256(result_256, 0), + _mm256_extracti128_si256(result_256, 1)); + + // Apply regularization + // We follow the standard regularization method of adding `k * I` before + // inverting. This ensures that the matrix will be invertible. + // + // Setting the regularization strength k to 1 seems to work well here, as + // typical values coming from the other equations are very large (1e5 to + // 1e6, with an upper limit of around 6e7, at the time of writing). + // It also preserves the property that all matrix values are whole numbers, + // which is convenient for integerized SIMD implementation. + result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1)); + + // Convert results to doubles and store + _mm256_storeu_pd(M, _mm256_cvtepi32_pd(result)); +} + +// Try to invert the matrix M +// Note: Due to the nature of how a least-squares matrix is constructed, all of +// the eigenvalues will be >= 0, and therefore det M >= 0 as well. +// The regularization term `+ k * I` further ensures that det M >= k^2. +// As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. +// So we don't have to worry about non-invertible matrices here. +static INLINE void invert_2x2(const double *M, double *M_inv) { + double det = (M[0] * M[3]) - (M[1] * M[2]); + assert(det >= 1); + const double det_inv = 1 / det; + + M_inv[0] = M[3] * det_inv; + M_inv[1] = -M[1] * det_inv; + M_inv[2] = -M[2] * det_inv; + M_inv[3] = M[0] * det_inv; +} + +void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, + int x, int y, int width, int height, + int stride, double *u, double *v) { + DECLARE_ALIGNED(32, double, M[4]); + DECLARE_ALIGNED(32, double, M_inv[4]); + DECLARE_ALIGNED(32, int16_t, dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + DECLARE_ALIGNED(32, int16_t, dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + int b[2]; + + // Compute gradients within this patch + const uint8_t *src_patch = &src[y * stride + x]; + sobel_filter(src_patch, stride, dx, dy); + + compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); + invert_2x2(M, M_inv); + + for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { + compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy, + b); + + // Solve flow equations to find a better estimate for the flow vector + // at this point + const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; + const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; + *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); + *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); + + if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { + // Stop iteration when we're close to convergence + break; + } + } +} diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c index 2c5effd638..e0a4bd040c 100644 --- a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c +++ b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c @@ -1,13 +1,12 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. 
All rights reserved * - * This source code is subject to the terms of the BSD 3-Clause Clear License - * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear - * License was not distributed with this source code in the LICENSE file, you - * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the - * Alliance for Open Media Patent License 1.0 was not distributed with this - * source code in the PATENTS file, you can obtain it at - * aomedia.org/license/patent-license/. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <assert.h> @@ -20,46 +19,59 @@ #include "config/aom_dsp_rtcd.h" -// Internal cross-check against C code -// If you set this to 1 and compile in debug mode, then the outputs of the two -// convolution stages will be checked against the plain C version of the code, -// and an assertion will be fired if the results differ. -#define CHECK_RESULTS 0 - -// Note: Max sum(+ve coefficients) = 1.125 * scale -static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { - // Check that the fractional position is in range. - // - // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. - // Mathematically, this implies that 0 <= x < 1. However, in practice it is - // possible to have x == 1 due to floating point rounding. This is fine, - // and we still interpolate correctly if we allow x = 1. - assert(0 <= x && x <= 1); - - double x2 = x * x; - double x3 = x2 * x; - kernel[0] = -0.5 * x + x2 - 0.5 * x3; - kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; - kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; - kernel[3] = -0.5 * x2 + 0.5 * x3; -} - -static INLINE void get_cubic_kernel_int(double x, int16_t kernel[4]) { - double kernel_dbl[4]; - get_cubic_kernel_dbl(x, kernel_dbl); - - kernel[0] = (int16_t)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); - kernel[1] = (int16_t)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); - kernel[2] = (int16_t)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); - kernel[3] = (int16_t)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); -} - -#if CHECK_RESULTS -static INLINE int get_cubic_value_int(const int *p, const int16_t kernel[4]) { - return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + - kernel[3] * p[3]; +#if DISFLOW_PATCH_SIZE != 8 +#error "Need to change disflow_sse4.c if DISFLOW_PATCH_SIZE != 8" +#endif + +// Compute horizontal and vertical kernels and return them packed into a +// register. 
The coefficient ordering is: +// h0, h1, v0, v1, h2, h3, v2, v3 +// This is chosen because it takes less work than fully separating the kernels, +// but it is separated enough that we can pick out each coefficient pair in the +// main compute_flow_at_point function +static INLINE __m128i compute_cubic_kernels(double u, double v) { + const __m128d x = _mm_set_pd(v, u); + + const __m128d x2 = _mm_mul_pd(x, x); + const __m128d x3 = _mm_mul_pd(x2, x); + + // Macro to multiply a value v by a constant coefficient c +#define MULC(c, v) _mm_mul_pd(_mm_set1_pd(c), v) + + // Compute floating-point kernel + // Note: To ensure results are bit-identical to the C code, we need to perform + // exactly the same sequence of operations here as in the C code. + __m128d k0 = _mm_sub_pd(_mm_add_pd(MULC(-0.5, x), x2), MULC(0.5, x3)); + __m128d k1 = + _mm_add_pd(_mm_sub_pd(_mm_set1_pd(1.0), MULC(2.5, x2)), MULC(1.5, x3)); + __m128d k2 = + _mm_sub_pd(_mm_add_pd(MULC(0.5, x), MULC(2.0, x2)), MULC(1.5, x3)); + __m128d k3 = _mm_add_pd(MULC(-0.5, x2), MULC(0.5, x3)); +#undef MULC + + // Integerize + __m128d prec = _mm_set1_pd((double)(1 << DISFLOW_INTERP_BITS)); + + k0 = _mm_round_pd(_mm_mul_pd(k0, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k1 = _mm_round_pd(_mm_mul_pd(k1, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k2 = _mm_round_pd(_mm_mul_pd(k2, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k3 = _mm_round_pd(_mm_mul_pd(k3, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + const __m128i c0 = _mm_cvtpd_epi32(k0); + const __m128i c1 = _mm_cvtpd_epi32(k1); + const __m128i c2 = _mm_cvtpd_epi32(k2); + const __m128i c3 = _mm_cvtpd_epi32(k3); + + // Rearrange results and convert down to 16 bits, giving the target output + // ordering + const __m128i c01 = _mm_unpacklo_epi32(c0, c1); + const __m128i c23 = _mm_unpacklo_epi32(c2, c3); + return _mm_packs_epi32(c01, c23); } -#endif // CHECK_RESULTS // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. @@ -80,10 +92,6 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, // These will be flattened at the end. 
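  // Illustration only (hypothetical helper, not part of this patch): how the
  // packed layout produced by compute_cubic_kernels() above is consumed. Each
  // 32-bit lane of `kernels` holds one adjacent pair of int16 coefficients
  // (lane 0: h0,h1  lane 1: v0,v1  lane 2: h2,h3  lane 3: v2,v3), so the
  // horizontal pass broadcasts lanes 0 and 2 and the vertical pass broadcasts
  // lanes 1 and 3, exactly as done further down in this function.
  static inline void unpack_kernel_pairs_sketch(__m128i kernels, __m128i *h01,
                                                __m128i *h23, __m128i *v01,
                                                __m128i *v23) {
    *h01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 0));  // (h0, h1) x4
    *h23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 2));  // (h2, h3) x4
    *v01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 1));  // (v0, v1) x4
    *v23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 3));  // (v2, v3) x4
  }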
__m128i b0_acc = _mm_setzero_si128(); __m128i b1_acc = _mm_setzero_si128(); -#if CHECK_RESULTS - // Also keep a running sum using the C algorithm, for cross-checking - int c_result[2] = { 0 }; -#endif // CHECK_RESULTS // Split offset into integer and fractional parts, and compute cubic // interpolation kernels @@ -92,13 +100,11 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, const double u_frac = u - floor(u); const double v_frac = v - floor(v); - int16_t h_kernel[4]; - int16_t v_kernel[4]; - get_cubic_kernel_int(u_frac, h_kernel); - get_cubic_kernel_int(v_frac, v_kernel); + const __m128i kernels = compute_cubic_kernels(u_frac, v_frac); // Storage for intermediate values between the two convolution directions - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; + DECLARE_ALIGNED(16, int16_t, + tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]); int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row // Clamp coordinates so that all pixels we fetch will remain within the @@ -121,8 +127,8 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, // We split the kernel into two vectors with kernel indices: // 0, 1, 0, 1, 0, 1, 0, 1, and // 2, 3, 2, 3, 2, 3, 2, 3 - __m128i h_kernel_01 = xx_set2_epi16(h_kernel[0], h_kernel[1]); - __m128i h_kernel_23 = xx_set2_epi16(h_kernel[2], h_kernel[3]); + __m128i h_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 0)); + __m128i h_kernel_23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 2)); __m128i round_const_h = _mm_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1)); @@ -141,10 +147,6 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, __m128i px_0to7_i16 = _mm_cvtepu8_epi16(row); __m128i px_4to10_i16 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 4)); - // Relevant multiply instruction - // This multiplies pointwise, then sums in pairs. - //_mm_madd_epi16(); - // Compute first four outputs // input pixels 0, 1, 1, 2, 2, 3, 3, 4 // * kernel 0, 1, 0, 1, 0, 1, 0, 1 @@ -180,43 +182,14 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, DISFLOW_INTERP_BITS - 6); _mm_storeu_si128((__m128i *)tmp_row, _mm_packs_epi32(out0, out1)); - -#if CHECK_RESULTS && !defined(NDEBUG) - // Cross-check - for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { - const int x_w = x0 + j; - int arr[4]; - - arr[0] = (int)ref[y_w * stride + (x_w - 1)]; - arr[1] = (int)ref[y_w * stride + (x_w + 0)]; - arr[2] = (int)ref[y_w * stride + (x_w + 1)]; - arr[3] = (int)ref[y_w * stride + (x_w + 2)]; - - // Apply kernel and round, keeping 6 extra bits of precision. - // - // 6 is the maximum allowable number of extra bits which will avoid - // the intermediate values overflowing an int16_t. The most extreme - // intermediate value occurs when: - // * The input pixels are [0, 255, 255, 0] - // * u_frac = 0.5 - // In this case, the un-scaled output is 255 * 1.125 = 286.875. - // As an integer with 6 fractional bits, that is 18360, which fits - // in an int16_t. But with 7 fractional bits it would be 36720, - // which is too large. 
- const int c_value = ROUND_POWER_OF_TWO(get_cubic_value_int(arr, h_kernel), - DISFLOW_INTERP_BITS - 6); - (void)c_value; // Suppress warnings - assert(tmp_row[j] == c_value); - } -#endif // CHECK_RESULTS } // Vertical convolution const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; __m128i round_const_v = _mm_set1_epi32(1 << (round_bits - 1)); - __m128i v_kernel_01 = xx_set2_epi16(v_kernel[0], v_kernel[1]); - __m128i v_kernel_23 = xx_set2_epi16(v_kernel[2], v_kernel[3]); + __m128i v_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 1)); + __m128i v_kernel_23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 3)); for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; @@ -259,30 +232,6 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * DISFLOW_PATCH_SIZE]); b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt)); b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt)); - -#if CHECK_RESULTS - int16_t dt_arr[8]; - memcpy(dt_arr, &dt, 8 * sizeof(*dt_arr)); - for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { - int16_t *p = &tmp[i * DISFLOW_PATCH_SIZE + j]; - int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE], - p[2 * DISFLOW_PATCH_SIZE] }; - const int result = get_cubic_value_int(arr, v_kernel); - - // Apply kernel and round. - // This time, we have to round off the 6 extra bits which were kept - // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits - // of precision to match the scale of the dx and dy arrays. - const int c_warped = ROUND_POWER_OF_TWO(result, round_bits); - const int c_src_px = src[(x + j) + (y + i) * stride] << 3; - const int c_dt = c_warped - c_src_px; - - assert(dt_arr[j] == c_dt); - - c_result[0] += dx[i * DISFLOW_PATCH_SIZE + j] * c_dt; - c_result[1] += dy[i * DISFLOW_PATCH_SIZE + j] * c_dt; - } -#endif // CHECK_RESULTS } // Flatten the two sets of partial sums to find the final value of b @@ -292,156 +241,66 @@ static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc); b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1); b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3); - -#if CHECK_RESULTS - assert(b[0] == c_result[0]); - assert(b[1] == c_result[1]); -#endif // CHECK_RESULTS } -static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; -#if CHECK_RESULTS - const int taps = 3; -#endif // CHECK_RESULTS - - // Horizontal filter - // As the kernel is simply {1, 0, -1}, we implement this as simply - // out[x] = image[x-1] - image[x+1] - // rather than doing a "proper" convolution operation - for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { - const uint8_t *src_row = src + y * src_stride; - int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - - // Load pixels and expand to 16 bits - __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1)); - __m128i px0 = _mm_cvtepu8_epi16(row); - __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); - - __m128i out = _mm_sub_epi16(px0, px2); - - // Store to intermediate array - _mm_storeu_si128((__m128i *)tmp_row, out); - -#if CHECK_RESULTS - // Cross-check - static const int16_t h_kernel[3] = { 1, 0, -1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; 
k < taps; ++k) { - sum += h_kernel[k] * src_row[x + k - 1]; - } - (void)sum; - assert(tmp_row[x] == sum); - } -#endif // CHECK_RESULTS - } - - // Vertical filter - // Here the kernel is {1, 2, 1}, which can be implemented - // with simple sums rather than multiplies and adds. - // In order to minimize dependency chains, we evaluate in the order - // (image[y - 1] + image[y + 1]) + (image[y] << 1) - // This way, the first addition and the shift can happen in parallel - for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { - const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - int16_t *dst_row = dst + y * dst_stride; - - __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); - __m128i px1 = _mm_loadu_si128((__m128i *)tmp_row); - __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); - - __m128i out = - _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1)); - - _mm_storeu_si128((__m128i *)dst_row, out); - -#if CHECK_RESULTS - static const int16_t v_kernel[3] = { 1, 2, 1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x]; - } - (void)sum; - assert(dst_row[x] == sum); - } -#endif // CHECK_RESULTS - } -} - -static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; -#if CHECK_RESULTS - const int taps = 3; -#endif // CHECK_RESULTS - - // Horizontal filter - // Here the kernel is {1, 2, 1}, which can be implemented - // with simple sums rather than multiplies and adds. - // In order to minimize dependency chains, we evaluate in the order - // (image[y - 1] + image[y + 1]) + (image[y] << 1) - // This way, the first addition and the shift can happen in parallel - for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { - const uint8_t *src_row = src + y * src_stride; - int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - - // Load pixels and expand to 16 bits - __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1)); - __m128i px0 = _mm_cvtepu8_epi16(row); - __m128i px1 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1)); - __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); - - __m128i out = - _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1)); - - // Store to intermediate array - _mm_storeu_si128((__m128i *)tmp_row, out); - -#if CHECK_RESULTS - // Cross-check - static const int16_t h_kernel[3] = { 1, 2, 1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += h_kernel[k] * src_row[x + k - 1]; - } - (void)sum; - assert(tmp_row[x] == sum); - } -#endif // CHECK_RESULTS - } - - // Vertical filter - // As the kernel is simply {1, 0, -1}, we implement this as simply - // out[x] = image[x-1] - image[x+1] - // rather than doing a "proper" convolution operation - for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { - const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - int16_t *dst_row = dst + y * dst_stride; - - __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); - __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); - - __m128i out = _mm_sub_epi16(px0, px2); - - _mm_storeu_si128((__m128i *)dst_row, out); - -#if CHECK_RESULTS - static const int16_t v_kernel[3] = { 1, 0, -1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += v_kernel[k] * tmp[(y 
+ k - 1) * DISFLOW_PATCH_SIZE + x]; - } - (void)sum; - assert(dst_row[x] == sum); - } -#endif // CHECK_RESULTS +// Compute the x and y gradients of the source patch in a single pass, +// and store into dx and dy respectively. +static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, + int16_t *dy) { + // Loop setup: Load the first two rows (of 10 input rows) and apply + // the horizontal parts of the two filters + __m128i row_m1 = _mm_loadu_si128((__m128i *)(src - src_stride - 1)); + __m128i row_m1_a = _mm_cvtepu8_epi16(row_m1); + __m128i row_m1_b = _mm_cvtepu8_epi16(_mm_srli_si128(row_m1, 1)); + __m128i row_m1_c = _mm_cvtepu8_epi16(_mm_srli_si128(row_m1, 2)); + + __m128i row_m1_hsmooth = _mm_add_epi16(_mm_add_epi16(row_m1_a, row_m1_c), + _mm_slli_epi16(row_m1_b, 1)); + __m128i row_m1_hdiff = _mm_sub_epi16(row_m1_a, row_m1_c); + + __m128i row = _mm_loadu_si128((__m128i *)(src - 1)); + __m128i row_a = _mm_cvtepu8_epi16(row); + __m128i row_b = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1)); + __m128i row_c = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); + + __m128i row_hsmooth = + _mm_add_epi16(_mm_add_epi16(row_a, row_c), _mm_slli_epi16(row_b, 1)); + __m128i row_hdiff = _mm_sub_epi16(row_a, row_c); + + // Main loop: For each of the 8 output rows: + // * Load row i+1 and apply both horizontal filters + // * Apply vertical filters and store results + // * Shift rows for next iteration + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + // Load row i+1 and apply both horizontal filters + const __m128i row_p1 = + _mm_loadu_si128((__m128i *)(src + (i + 1) * src_stride - 1)); + const __m128i row_p1_a = _mm_cvtepu8_epi16(row_p1); + const __m128i row_p1_b = _mm_cvtepu8_epi16(_mm_srli_si128(row_p1, 1)); + const __m128i row_p1_c = _mm_cvtepu8_epi16(_mm_srli_si128(row_p1, 2)); + + const __m128i row_p1_hsmooth = _mm_add_epi16( + _mm_add_epi16(row_p1_a, row_p1_c), _mm_slli_epi16(row_p1_b, 1)); + const __m128i row_p1_hdiff = _mm_sub_epi16(row_p1_a, row_p1_c); + + // Apply vertical filters and store results + // dx = vertical smooth(horizontal diff(input)) + // dy = vertical diff(horizontal smooth(input)) + const __m128i dx_row = + _mm_add_epi16(_mm_add_epi16(row_m1_hdiff, row_p1_hdiff), + _mm_slli_epi16(row_hdiff, 1)); + const __m128i dy_row = _mm_sub_epi16(row_m1_hsmooth, row_p1_hsmooth); + + _mm_storeu_si128((__m128i *)(dx + i * DISFLOW_PATCH_SIZE), dx_row); + _mm_storeu_si128((__m128i *)(dy + i * DISFLOW_PATCH_SIZE), dy_row); + + // Shift rows for next iteration + // This allows a lot of work to be reused, reducing the number of + // horizontal filtering operations from 2*3*8 = 48 to 2*10 = 20 + row_m1_hsmooth = row_hsmooth; + row_m1_hdiff = row_hdiff; + row_hsmooth = row_p1_hsmooth; + row_hdiff = row_p1_hdiff; } } @@ -476,30 +335,6 @@ static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, // which is convenient for integerized SIMD implementation. 
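  // Illustration only (hypothetical scalar sketch, not part of this patch):
  // the 2x2 system this function assembles before converting to double,
  //   M = [ sum(dx*dx)  sum(dx*dy) ]   [ 1 0 ]
  //       [ sum(dx*dy)  sum(dy*dy) ] + [ 0 1 ]
  // with the identity term being the regularization described above.
  static void flow_matrix_scalar_sketch(const int16_t *dx, int dx_stride,
                                        const int16_t *dy, int dy_stride,
                                        double *M) {
    int m[4] = { 0, 0, 0, 0 };
    for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) {
      for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
        m[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j];
        m[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j];
        m[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j];
      }
    }
    m[2] = m[1];  // the matrix is symmetric
    m[0] += 1;    // regularization: adding 1 * I keeps det(M) >= 1
    m[3] += 1;
    for (int k = 0; k < 4; ++k) M[k] = (double)m[k];
  }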
result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1)); -#if CHECK_RESULTS - int tmp[4] = { 0 }; - - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) { - tmp[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j]; - tmp[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j]; - // Don't compute tmp[2], as it should be equal to tmp[1] - tmp[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j]; - } - } - - // Apply regularization - tmp[0] += 1; - tmp[3] += 1; - - tmp[2] = tmp[1]; - - assert(tmp[0] == _mm_extract_epi32(result, 0)); - assert(tmp[1] == _mm_extract_epi32(result, 1)); - assert(tmp[2] == _mm_extract_epi32(result, 2)); - assert(tmp[3] == _mm_extract_epi32(result, 3)); -#endif // CHECK_RESULTS - // Convert results to doubles and store _mm_storeu_pd(M, _mm_cvtepi32_pd(result)); _mm_storeu_pd(M + 2, _mm_cvtepi32_pd(_mm_srli_si128(result, 8))); @@ -525,16 +360,15 @@ static INLINE void invert_2x2(const double *M, double *M_inv) { void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v) { - double M[4]; - double M_inv[4]; + DECLARE_ALIGNED(16, double, M[4]); + DECLARE_ALIGNED(16, double, M_inv[4]); + DECLARE_ALIGNED(16, int16_t, dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + DECLARE_ALIGNED(16, int16_t, dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); int b[2]; - int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; - int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; // Compute gradients within this patch const uint8_t *src_patch = &src[y * stride + x]; - sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE); - sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE); + sobel_filter(src_patch, stride, dx, dy); compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); invert_2x2(M, M_inv); diff --git a/third_party/aom/aom_dsp/mathutils.h b/third_party/aom/aom_dsp/mathutils.h index cbb6cf491f..26635fc4d1 100644 --- a/third_party/aom/aom_dsp/mathutils.h +++ b/third_party/aom/aom_dsp/mathutils.h @@ -17,7 +17,6 @@ #include <string.h> #include "aom_dsp/aom_dsp_common.h" -#include "aom_mem/aom_mem.h" static const double TINY_NEAR_ZERO = 1.0E-16; diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c index 065ec9a106..947dfd3c7a 100644 --- a/third_party/aom/aom_dsp/noise_model.c +++ b/third_party/aom/aom_dsp/noise_model.c @@ -19,6 +19,8 @@ #include "aom_dsp/noise_model.h" #include "aom_dsp/noise_util.h" #include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" #define kLowPolyNumParams 3 @@ -1555,7 +1557,7 @@ void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) { } static int denoise_and_model_realloc_if_necessary( - struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) { + struct aom_denoise_and_model_t *ctx, const YV12_BUFFER_CONFIG *sd) { if (ctx->width == sd->y_width && ctx->height == sd->y_height && ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride) return 1; @@ -1624,7 +1626,7 @@ static int denoise_and_model_realloc_if_necessary( // TODO(aomedia:3151): Handle a monochrome image (sd->u_buffer and sd->v_buffer // are null pointers) correctly. 
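// Illustration only (hypothetical usage, not part of this patch): with the
// frame parameter now const-qualified, a caller might wrap the call like
// this. ctx is assumed to come from aom_denoise_and_model_alloc(); passing a
// non-zero apply_denoise asks for the denoised pixels to be written back into
// the frame that will be encoded, per the parameter description in
// noise_model.h.
static int run_noise_model_sketch(struct aom_denoise_and_model_t *ctx,
                                  const YV12_BUFFER_CONFIG *frame,
                                  aom_film_grain_t *film_grain) {
  return aom_denoise_and_model_run(ctx, frame, film_grain,
                                   /*apply_denoise=*/1);
}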
int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, - YV12_BUFFER_CONFIG *sd, + const YV12_BUFFER_CONFIG *sd, aom_film_grain_t *film_grain, int apply_denoise) { const int block_size = ctx->block_size; const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h index 8228aeacfc..5b2d7efe29 100644 --- a/third_party/aom/aom_dsp/noise_model.h +++ b/third_party/aom/aom_dsp/noise_model.h @@ -297,14 +297,14 @@ struct aom_denoise_and_model_t; * aom_denoise_and_model_alloc that holds some * buffers for denoising and the current noise * estimate. - * \param[in,out] buf The raw input buffer to be denoised. + * \param[in,out] sd The raw input buffer to be denoised. * \param[out] grain Output film grain parameters * \param[in] apply_denoise Whether or not to apply the denoising to the * frame that will be encoded */ int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, - YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain, - int apply_denoise); + const YV12_BUFFER_CONFIG *sd, + aom_film_grain_t *grain, int apply_denoise); /*!\brief Allocates a context that can be used for denoising and noise modeling. * diff --git a/third_party/aom/aom_dsp/pyramid.c b/third_party/aom/aom_dsp/pyramid.c index 324a18baea..5de001dbd5 100644 --- a/third_party/aom/aom_dsp/pyramid.c +++ b/third_party/aom/aom_dsp/pyramid.c @@ -12,7 +12,7 @@ #include "aom_dsp/pyramid.h" #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" // TODO(rachelbarker): Move needed code from av1/ to aom_dsp/ #include "av1/common/resize.h" @@ -26,18 +26,16 @@ // levels. This is counted in the size checked against the max allocation // limit // * Then calls aom_alloc_pyramid() to actually create the pyramid -// * Pyramid is initially marked as invalid (no data) -// * Whenever pyramid is needed, we check the valid flag. If set, use existing -// data. If not set, compute full pyramid -// * Whenever frame buffer is reused, clear the valid flag +// * Pyramid is initially marked as containing no valid data +// * Each pyramid layer is computed on-demand, the first time it is requested +// * Whenever frame buffer is reused, reset the counter of filled levels. +// This invalidates all of the existing pyramid levels. 
// * Whenever frame buffer is resized, reallocate pyramid -size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels, - bool image_is_16bit) { - // Limit number of levels on small frames +size_t aom_get_pyramid_alloc_size(int width, int height, bool image_is_16bit) { + // Allocate the maximum possible number of layers for this width and height const int msb = get_msb(AOMMIN(width, height)); - const int max_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); - n_levels = AOMMIN(n_levels, max_levels); + const int n_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); size_t alloc_size = 0; alloc_size += sizeof(ImagePyramid); @@ -100,12 +98,10 @@ size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels, return alloc_size; } -ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels, - bool image_is_16bit) { - // Limit number of levels on small frames +ImagePyramid *aom_alloc_pyramid(int width, int height, bool image_is_16bit) { + // Allocate the maximum possible number of layers for this width and height const int msb = get_msb(AOMMIN(width, height)); - const int max_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); - n_levels = AOMMIN(n_levels, max_levels); + const int n_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); ImagePyramid *pyr = aom_calloc(1, sizeof(*pyr)); if (!pyr) { @@ -118,8 +114,8 @@ ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels, return NULL; } - pyr->valid = false; - pyr->n_levels = n_levels; + pyr->max_levels = n_levels; + pyr->filled_levels = 0; // Compute sizes and offsets for each pyramid level // These are gathered up first, so that we can allocate all pyramid levels @@ -248,46 +244,67 @@ static INLINE void fill_border(uint8_t *img_buf, const int width, } } -// Compute coarse to fine pyramids for a frame +// Compute downsampling pyramid for a frame +// +// This function will ensure that the first `n_levels` levels of the pyramid +// are filled, unless the frame is too small to have this many levels. +// In that case, we will fill all available levels and then stop. +// +// Returns the actual number of levels filled, capped at n_levels, +// or -1 on error. 
+// // This must only be called while holding frame_pyr->mutex -static INLINE bool fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, - ImagePyramid *frame_pyr) { - int n_levels = frame_pyr->n_levels; +static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int n_levels, ImagePyramid *frame_pyr) { + int already_filled_levels = frame_pyr->filled_levels; + + // This condition should already be enforced by aom_compute_pyramid + assert(n_levels <= frame_pyr->max_levels); + + if (already_filled_levels >= n_levels) { + return n_levels; + } + const int frame_width = frame->y_crop_width; const int frame_height = frame->y_crop_height; const int frame_stride = frame->y_stride; assert((frame_width >> n_levels) >= 0); assert((frame_height >> n_levels) >= 0); - PyramidLayer *first_layer = &frame_pyr->layers[0]; - if (frame->flags & YV12_FLAG_HIGHBITDEPTH) { - // For frames stored in a 16-bit buffer, we need to downconvert to 8 bits - assert(first_layer->width == frame_width); - assert(first_layer->height == frame_height); - - uint16_t *frame_buffer = CONVERT_TO_SHORTPTR(frame->y_buffer); - uint8_t *pyr_buffer = first_layer->buffer; - int pyr_stride = first_layer->stride; - for (int y = 0; y < frame_height; y++) { - uint16_t *frame_row = frame_buffer + y * frame_stride; - uint8_t *pyr_row = pyr_buffer + y * pyr_stride; - for (int x = 0; x < frame_width; x++) { - pyr_row[x] = frame_row[x] >> (bit_depth - 8); + if (already_filled_levels == 0) { + // Fill in largest level from the original image + PyramidLayer *first_layer = &frame_pyr->layers[0]; + if (frame->flags & YV12_FLAG_HIGHBITDEPTH) { + // For frames stored in a 16-bit buffer, we need to downconvert to 8 bits + assert(first_layer->width == frame_width); + assert(first_layer->height == frame_height); + + uint16_t *frame_buffer = CONVERT_TO_SHORTPTR(frame->y_buffer); + uint8_t *pyr_buffer = first_layer->buffer; + int pyr_stride = first_layer->stride; + for (int y = 0; y < frame_height; y++) { + uint16_t *frame_row = frame_buffer + y * frame_stride; + uint8_t *pyr_row = pyr_buffer + y * pyr_stride; + for (int x = 0; x < frame_width; x++) { + pyr_row[x] = frame_row[x] >> (bit_depth - 8); + } } + + fill_border(pyr_buffer, frame_width, frame_height, pyr_stride); + } else { + // For frames stored in an 8-bit buffer, we don't need to copy anything - + // we can just reference the original image buffer + first_layer->buffer = frame->y_buffer; + first_layer->width = frame_width; + first_layer->height = frame_height; + first_layer->stride = frame_stride; } - fill_border(pyr_buffer, frame_width, frame_height, pyr_stride); - } else { - // For frames stored in an 8-bit buffer, we need to configure the first - // pyramid layer to point at the original image buffer - first_layer->buffer = frame->y_buffer; - first_layer->width = frame_width; - first_layer->height = frame_height; - first_layer->stride = frame_stride; + already_filled_levels = 1; } // Fill in the remaining levels through progressive downsampling - for (int level = 1; level < n_levels; ++level) { + for (int level = already_filled_levels; level < n_levels; ++level) { PyramidLayer *prev_layer = &frame_pyr->layers[level - 1]; uint8_t *prev_buffer = prev_layer->buffer; int prev_stride = prev_layer->stride; @@ -314,11 +331,16 @@ static INLINE bool fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, // TODO(rachelbarker): Use optimized downsample-by-2 function if (!av1_resize_plane(prev_buffer, this_height << 1, this_width << 1, prev_stride, this_buffer, 
this_height, this_width, - this_stride)) - return false; + this_stride)) { + // If we can't allocate memory, we'll have to terminate early + frame_pyr->filled_levels = n_levels; + return -1; + } fill_border(this_buffer, this_width, this_height, this_stride); } - return true; + + frame_pyr->filled_levels = n_levels; + return n_levels; } // Fill out a downsampling pyramid for a given frame. @@ -327,63 +349,72 @@ static INLINE bool fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, // regardless of the input bit depth. Additional levels are then downscaled // by powers of 2. // -// For small input frames, the number of levels actually constructed -// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE -// pixels along each side. +// This function will ensure that the first `n_levels` levels of the pyramid +// are filled, unless the frame is too small to have this many levels. +// In that case, we will fill all available levels and then stop. +// No matter how small the frame is, at least one level is guaranteed +// to be filled. // -// However, if the input frame has a side of length < MIN_PYRAMID_SIZE, -// we will still construct the top level. -bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, - ImagePyramid *pyr) { +// Returns the actual number of levels filled, capped at n_levels, +// or -1 on error. +int aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int n_levels, ImagePyramid *pyr) { assert(pyr); // Per the comments in the ImagePyramid struct, we must take this mutex - // before reading or writing the "valid" flag, and hold it while computing - // the pyramid, to ensure proper behaviour if multiple threads call this - // function simultaneously + // before reading or writing the filled_levels field, and hold it while + // computing any additional pyramid levels, to ensure proper behaviour + // when multithreading is used #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - if (!pyr->valid) { - pyr->valid = fill_pyramid(frame, bit_depth, pyr); + n_levels = AOMMIN(n_levels, pyr->max_levels); + int result = n_levels; + if (pyr->filled_levels < n_levels) { + // Compute any missing levels that we need + result = fill_pyramid(frame, bit_depth, n_levels, pyr); } - bool valid = pyr->valid; - - // At this point, the pyramid is guaranteed to be valid, and can be safely - // read from without holding the mutex any more + // At this point, as long as result >= 0, the requested number of pyramid + // levels are guaranteed to be valid, and can be safely read from without + // holding the mutex any further + assert(IMPLIES(result >= 0, pyr->filled_levels >= n_levels)); #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - return valid; + return result; } #ifndef NDEBUG -// Check if a pyramid has already been computed. 
+// Check if a pyramid has already been computed to at least n levels // This is mostly a debug helper - as it is necessary to hold pyr->mutex -// while reading the valid flag, we cannot just write: -// assert(pyr->valid); +// while reading the number of already-computed levels, we cannot just write: +// assert(pyr->filled_levels >= n_levels); // This function allows the check to be correctly written as: -// assert(aom_is_pyramid_valid(pyr)); -bool aom_is_pyramid_valid(ImagePyramid *pyr) { +// assert(aom_is_pyramid_valid(pyr, n_levels)); +// +// Note: This deliberately does not restrict n_levels based on the maximum +// number of permitted levels for the frame size. This allows the check to +// catch cases where the caller forgets to handle the case where +// max_levels is less than the requested number of levels +bool aom_is_pyramid_valid(ImagePyramid *pyr, int n_levels) { assert(pyr); // Per the comments in the ImagePyramid struct, we must take this mutex - // before reading or writing the "valid" flag, and hold it while computing - // the pyramid, to ensure proper behaviour if multiple threads call this - // function simultaneously + // before reading or writing the filled_levels field, to ensure proper + // behaviour when multithreading is used #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - bool valid = pyr->valid; + bool result = (pyr->filled_levels >= n_levels); #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - return valid; + return result; } #endif @@ -394,7 +425,7 @@ void aom_invalidate_pyramid(ImagePyramid *pyr) { #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - pyr->valid = false; + pyr->filled_levels = 0; #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD diff --git a/third_party/aom/aom_dsp/pyramid.h b/third_party/aom/aom_dsp/pyramid.h index 9442a1ff08..745bb7e525 100644 --- a/third_party/aom/aom_dsp/pyramid.h +++ b/third_party/aom/aom_dsp/pyramid.h @@ -19,7 +19,7 @@ #include "config/aom_config.h" #include "aom_scale/yv12config.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" #ifdef __cplusplus extern "C" { @@ -57,23 +57,31 @@ typedef struct image_pyramid { // same time // // Semantics: - // * This mutex must be held whenever reading or writing the `valid` flag + // * This mutex must be held whenever reading or writing the + // `filled_levels` field // // * This mutex must also be held while computing the image pyramid, // to ensure that only one thread may do so at a time. // - // * However, once you have read the valid flag and seen a true value, - // it is safe to drop the mutex and read from the remaining fields. - // This is because, once the image pyramid is computed, its contents + // * However, once you have read the filled_levels field and observed + // a value N, it is safe to drop the mutex and read from the remaining + // fields, including the first N pyramid levels (but no higher). + // Note that filled_levels must be read once and cached in a local variable + // in order for this to be safe - it cannot be re-read without retaking + // the mutex. + // + // This works because, once the image pyramid is computed, its contents // will not be changed until the parent frame buffer is recycled, // which will not happen until there are no more outstanding references // to the frame buffer. 
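  // Illustration only (hypothetical reader, not part of this patch): the safe
  // access pattern described in the comment above, assuming CONFIG_MULTITHREAD
  // so that the mutex field below exists.
  static int query_filled_levels_sketch(ImagePyramid *pyr) {
    pthread_mutex_lock(&pyr->mutex);
    const int n = pyr->filled_levels;  // read once and cache locally
    pthread_mutex_unlock(&pyr->mutex);
    // Levels [0, n) may now be read without holding the mutex; never re-read
    // filled_levels without retaking the lock.
    return n;
  }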
pthread_mutex_t mutex; #endif - // Flag indicating whether the pyramid contains valid data - bool valid; - // Number of allocated/filled levels in this pyramid - int n_levels; + // Maximum number of levels for the given frame size + // We always allocate enough memory for this many levels, as the memory + // cost of higher levels of the pyramid is minimal. + int max_levels; + // Number of levels which currently hold valid data + int filled_levels; // Pointer to allocated buffer uint8_t *buffer_alloc; // Data for each level @@ -82,11 +90,9 @@ typedef struct image_pyramid { PyramidLayer *layers; } ImagePyramid; -size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels, - bool image_is_16bit); +size_t aom_get_pyramid_alloc_size(int width, int height, bool image_is_16bit); -ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels, - bool image_is_16bit); +ImagePyramid *aom_alloc_pyramid(int width, int height, bool image_is_16bit); // Fill out a downsampling pyramid for a given frame. // @@ -94,23 +100,28 @@ ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels, // regardless of the input bit depth. Additional levels are then downscaled // by powers of 2. // -// For small input frames, the number of levels actually constructed -// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE -// pixels along each side. +// This function will ensure that the first `n_levels` levels of the pyramid +// are filled, unless the frame is too small to have this many levels. +// In that case, we will fill all available levels and then stop. // -// However, if the input frame has a side of length < MIN_PYRAMID_SIZE, -// we will still construct the top level. -bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, - ImagePyramid *pyr); +// Returns the actual number of levels filled, capped at n_levels, +// or -1 on error. +int aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int n_levels, ImagePyramid *pyr); #ifndef NDEBUG -// Check if a pyramid has already been computed. +// Check if a pyramid has already been computed to at least n levels // This is mostly a debug helper - as it is necessary to hold pyr->mutex -// while reading the valid flag, we cannot just write: -// assert(pyr->valid); +// while reading the number of already-computed levels, we cannot just write: +// assert(pyr->filled_levels >= n_levels); // This function allows the check to be correctly written as: -// assert(aom_is_pyramid_valid(pyr)); -bool aom_is_pyramid_valid(ImagePyramid *pyr); +// assert(aom_is_pyramid_valid(pyr, n_levels)); +// +// Note: This deliberately does not restrict n_levels based on the maximum +// number of permitted levels for the frame size. This allows the check to +// catch cases where the caller forgets to handle the case where +// max_levels is less than the requested number of levels +bool aom_is_pyramid_valid(ImagePyramid *pyr, int n_levels); #endif // Mark a pyramid as no longer containing valid data. diff --git a/third_party/aom/aom_dsp/rect.h b/third_party/aom/aom_dsp/rect.h deleted file mode 100644 index 11bdaca979..0000000000 --- a/third_party/aom/aom_dsp/rect.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_RECT_H_ -#define AOM_AOM_DSP_RECT_H_ - -#include "config/aom_config.h" - -#include <stdbool.h> - -// Struct representing a rectangle of pixels. -// The axes are inclusive-exclusive, ie. the point (top, left) is included -// in the rectangle but (bottom, right) is not. -typedef struct { - int left, right, top, bottom; -} PixelRect; - -static INLINE int rect_width(const PixelRect *r) { return r->right - r->left; } - -static INLINE int rect_height(const PixelRect *r) { return r->bottom - r->top; } - -static INLINE bool is_inside_rect(const int x, const int y, - const PixelRect *r) { - return (r->left <= x && x < r->right) && (r->top <= y && y < r->bottom); -} - -#endif // AOM_AOM_DSP_RECT_H_ diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c index f02c3077ae..6cdd58492a 100644 --- a/third_party/aom/aom_dsp/variance.c +++ b/third_party/aom/aom_dsp/variance.c @@ -10,7 +10,6 @@ */ #include <assert.h> #include <stdlib.h> -#include <string.h> #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" @@ -70,12 +69,10 @@ uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. -void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_first_pass_c( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { @@ -100,12 +97,10 @@ void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b, // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). It defines the offset required to move from one input // to the next. Output is 8-bit. 
-void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_second_pass_c( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { @@ -129,19 +124,19 @@ void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b, return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } -#define SUBPIX_VAR(W, H) \ - uint32_t aom_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ +#define SUBPIX_VAR(W, H) \ + uint32_t aom_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ } #define SUBPIX_AVG_VAR(W, H) \ @@ -153,10 +148,10 @@ void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b, uint8_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ + var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ \ aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ \ @@ -170,10 +165,10 @@ void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b, uint8_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ + var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ \ aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \ \ @@ -730,24 +725,24 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, } } -#define MASK_SUBPIX_VAR(W, H) \ - unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - 
uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ - invert_mask); \ - return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ +#define MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ + invert_mask); \ + return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ } MASK_SUBPIX_VAR(4, 4) @@ -924,19 +919,19 @@ static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ } -#define OBMC_SUBPIX_VAR(W, H) \ - unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ - const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ +#define OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ } OBMC_VAR(4, 4) diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c index b08ec2546b..6c7fdd6eb1 100644 --- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c +++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c @@ -15,40 +15,6 @@ #include "aom_dsp/x86/convolve.h" #if HAVE_SSE2 -filter8_1dfunction aom_filter_block1d16_v8_sse2; -filter8_1dfunction aom_filter_block1d16_h8_sse2; -filter8_1dfunction aom_filter_block1d8_v8_sse2; -filter8_1dfunction aom_filter_block1d8_h8_sse2; -filter8_1dfunction aom_filter_block1d4_v8_sse2; -filter8_1dfunction aom_filter_block1d4_h8_sse2; -filter8_1dfunction aom_filter_block1d16_v4_sse2; -filter8_1dfunction aom_filter_block1d16_h4_sse2; - -filter8_1dfunction aom_filter_block1d8_h4_sse2; -filter8_1dfunction aom_filter_block1d8_v4_sse2; -filter8_1dfunction aom_filter_block1d4_h4_sse2; -filter8_1dfunction 
aom_filter_block1d4_v4_sse2; - -filter8_1dfunction aom_filter_block1d16_v2_sse2; -filter8_1dfunction aom_filter_block1d16_h2_sse2; -filter8_1dfunction aom_filter_block1d8_v2_sse2; -filter8_1dfunction aom_filter_block1d8_h2_sse2; -filter8_1dfunction aom_filter_block1d4_v2_sse2; -filter8_1dfunction aom_filter_block1d4_h2_sse2; - -// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) - #if CONFIG_AV1_HIGHBITDEPTH highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c deleted file mode 100644 index 5c36b68727..0000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c +++ /dev/null @@ -1,569 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <emmintrin.h> // SSE2 - -#include "config/aom_dsp_rtcd.h" -#include "aom_dsp/x86/convolve.h" -#include "aom_ports/mem.h" - -void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1, - srcRegFilt32b2_2; - __m128i srcReg32b1, srcReg32b2; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); - __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5); - __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); - __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_2, secondFilters); - d2 = _mm_madd_epi16(ss_2_2, thirdFilters); - srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); - - __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); - - // reading stride of the next 16 bytes - // (part of it was being read by earlier read) - srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); - - ss_2 = _mm_srli_si128(srcReg32b2, 2); - ss_4 = _mm_srli_si128(srcReg32b2, 4); - ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_1, secondFilters); - d2 = _mm_madd_epi16(ss_2_1, thirdFilters); - srcRegFilt32b2_1 = _mm_add_epi32(d1, d2); - - ss_1 = _mm_srli_si128(srcReg32b2, 3); - ss_3 = _mm_srli_si128(srcReg32b2, 5); - ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); - ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_2, secondFilters); - d2 = _mm_madd_epi16(ss_2_2, thirdFilters); - srcRegFilt32b2_2 = _mm_add_epi32(d1, d2); - - res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); - res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); - srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve 
result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); - - src_ptr += src_pixels_per_line; - - _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; - __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; - __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; - __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; - __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; - __m128i resReg23_45, resReg34_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - __m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); - srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); - __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); - __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); - __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128()); - __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128()); - - srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); - srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); - __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128()); - __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - - srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); - srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); - - srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - - srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); - resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); - resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); - __m128i 
resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); - resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); - __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); - resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); - resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters); - resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters); - resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128()); - __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters); - resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128()); - __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters); - resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); - resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); - - // shift by 6 bit each 16 bit - resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); - resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); - resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); - resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); - resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); - resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); - resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); - resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); - resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); - - src_ptr += src_stride; - - _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); - _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23_lo_1 = resReg45_lo_1; - resReg23_lo_2 = resReg45_lo_2; - resReg23_hi_1 = resReg45_hi_1; - resReg23_hi_2 = resReg45_hi_2; - resReg34_lo_1 = resReg56_lo_1; - resReg34_lo_2 = resReg56_lo_2; - resReg34_hi_1 = resReg56_hi_1; - resReg34_hi_2 = resReg56_hi_2; - srcReg4 = srcReg6; - } -} - -void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1, 
srcRegFilt32b1_2; - __m128i srcReg32b1; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - __m128i d1 = _mm_madd_epi16(ss_2, secondFilters); - __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); - ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_3, secondFilters); - d2 = _mm_madd_epi16(ss_5, thirdFilters); - srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); - - __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); - - src_ptr += src_pixels_per_line; - - _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23_lo, srcReg34_lo; - __m128i srcReg45_lo, srcReg56_lo; - __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; - __m128i resReg23_45_lo, resReg34_56_lo; - __m128i resReg23_45, resReg34_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - __m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); - __m128i resReg23_lo_1 = 
_mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); - __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); - - srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); - __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); - - srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); - resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); - resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); - __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); - resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); - __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); - resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); - resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); - - // shift by 6 bit each 16 bit - resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); - resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); - resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); - resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128()); - resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128()); - - src_ptr += src_stride; - - _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); - _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23_lo_1 = resReg45_lo_1; - resReg23_lo_2 = resReg45_lo_2; - resReg34_lo_1 = resReg56_lo_1; - resReg34_lo_2 = resReg56_lo_2; - srcReg4 = srcReg6; - } -} - -void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1; - __m128i srcReg32b1; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = 
_mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); - - ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); - - __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3); - __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5); - - __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); - __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); - - src_ptr += src_pixels_per_line; - - *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23, srcReg34, srcReg45, srcReg56; - __m128i resReg23_34, resReg45_56; - __m128i resReg23_34_45_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - __m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); - __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128()); - - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); - __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - tmp_0 = 
_mm_madd_epi16(resReg23, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34, secondFilters); - resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128()); - __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128()); - - tmp_0 = _mm_madd_epi16(resReg45, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56, thirdFilters); - resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56); - - // shift by 6 bit each 16 bit - resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32); - resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_34_45_56 = - _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128()); - - src_ptr += src_stride; - - *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56); - *((int *)(output_ptr + out_pitch)) = - _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23 = resReg45; - resReg34 = resReg56; - srcReg4 = srcReg6; - } -} diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm deleted file mode 100644 index 640c5b2416..0000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm +++ /dev/null @@ -1,615 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - -%include "aom_ports/x86_abi_support.asm" - -;Note: tap3 and tap4 have to be applied and added after other taps to avoid -;overflow. 
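The overflow note above is why the APPLY_FILTER_* macros further down in this (now deleted) file add the two large centre taps last: with saturating 16-bit adds, the smaller outer taps are summed first so the running total stays in range. A standalone C sketch of the same accumulation-order idea, offered for illustration only (the helper names are hypothetical, not from this file):

#include <stdint.h>

static int16_t sat_add_s16(int32_t a, int32_t b) {  // models paddsw
  int32_t s = a + b;
  if (s > INT16_MAX) s = INT16_MAX;
  if (s < INT16_MIN) s = INT16_MIN;
  return (int16_t)s;
}

// 8-tap filter of one pixel, adding taps in the order k0,k1,k6,k7,k2,k5 and
// finally the centre taps k3,k4, mirroring the macro's paddsw sequence.
static int16_t filter8_tap_order(const uint8_t *src, const int16_t *k) {
  static const int order[8] = { 0, 1, 6, 7, 2, 5, 3, 4 };
  int16_t acc = 0;
  for (int i = 0; i < 8; ++i) {
    const int t = order[i];
    acc = sat_add_s16(acc, src[t] * k[t]);
  }
  return acc;  // the asm then rounds (+64) and shifts (psraw 7) before packing
}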
- -%macro GET_FILTERS_4 0 - mov rdx, arg(5) ;filter ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - psrldq xmm7, 8 - pshuflw xmm4, xmm7, 0b ;k4 - pshuflw xmm5, xmm7, 01010101b ;k5 - pshuflw xmm6, xmm7, 10101010b ;k6 - pshuflw xmm7, xmm7, 11111111b ;k7 - - punpcklqdq xmm0, xmm1 - punpcklqdq xmm2, xmm3 - punpcklqdq xmm5, xmm4 - punpcklqdq xmm6, xmm7 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm2 - movdqa k5k4, xmm5 - movdqa k6k7, xmm6 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro APPLY_FILTER_4 1 - punpckldq xmm0, xmm1 ;two row in one register - punpckldq xmm6, xmm7 - punpckldq xmm2, xmm3 - punpckldq xmm5, xmm4 - - punpcklbw xmm0, zero ;unpack to word - punpcklbw xmm6, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - - pmullw xmm0, k0k1 ;multiply the filter factors - pmullw xmm6, k6k7 - pmullw xmm2, k2k3 - pmullw xmm5, k5k4 - - paddsw xmm0, xmm6 ;sum - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - psrldq xmm2, 8 - paddsw xmm0, xmm5 - psrldq xmm5, 8 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 -%endm - -%macro GET_FILTERS 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - pshufhw xmm4, xmm7, 0b ;k4 - pshufhw xmm5, xmm7, 01010101b ;k5 - pshufhw xmm6, xmm7, 10101010b ;k6 - pshufhw xmm7, xmm7, 11111111b ;k7 - - punpcklwd xmm0, xmm0 - punpcklwd xmm1, xmm1 - punpcklwd xmm2, xmm2 - punpcklwd xmm3, xmm3 - punpckhwd xmm4, xmm4 - punpckhwd xmm5, xmm5 - punpckhwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movdqa k0, xmm0 ;store filter factors on stack - movdqa k1, xmm1 - movdqa k2, xmm2 - movdqa k3, xmm3 - movdqa k4, xmm4 - movdqa k5, xmm5 - movdqa k6, xmm6 - movdqa k7, xmm7 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 ;rounding - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro LOAD_VERT_8 1 - movq xmm0, [rsi + %1] ;0 - movq xmm1, [rsi + rax + %1] ;1 - movq xmm6, [rsi + rdx * 2 + %1] ;6 - lea rsi, [rsi + rax] - movq xmm7, [rsi + rdx * 2 + %1] ;7 - movq xmm2, [rsi + rax + %1] ;2 - movq xmm3, [rsi + rax * 2 + %1] ;3 - movq xmm4, [rsi + rdx + %1] ;4 - movq xmm5, [rsi + rax * 4 + %1] ;5 -%endm - -%macro APPLY_FILTER_8 2 - punpcklbw xmm0, zero - punpcklbw xmm1, zero - punpcklbw xmm6, zero - punpcklbw xmm7, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - punpcklbw xmm3, zero - punpcklbw xmm4, zero - - pmullw xmm0, k0 - pmullw xmm1, k1 - pmullw xmm6, k6 - pmullw xmm7, k7 - pmullw xmm2, k2 - pmullw xmm5, k5 - pmullw xmm3, k3 - pmullw xmm4, k4 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm6 - paddsw xmm0, xmm7 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - paddsw xmm0, xmm3 - paddsw xmm0, xmm4 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi + %2] - pavgb xmm0, xmm1 -%endif - movq [rdi + %2], xmm0 -%endm - -SECTION .text - -;void aom_filter_block1d4_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int 
output_height, -; short *filter -;) -globalsym(aom_filter_block1d4_v8_sse2) -sym(aom_filter_block1d4_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d8_v8_sse2) -sym(aom_filter_block1d8_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d16_v8_sse2) -sym(aom_filter_block1d16_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 0, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d4_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; 
unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d4_h8_sse2) -sym(aom_filter_block1d4_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d8_h8_sse2) -sym(aom_filter_block1d8_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d16_h8_sse2) -sym(aom_filter_block1d16_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - 
movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm deleted file mode 100644 index 90dd55a4be..0000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm +++ /dev/null @@ -1,295 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "aom_ports/x86_abi_support.asm" - -%macro GET_PARAM_4 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm3, [rdx] ;load filters - pshuflw xmm4, xmm3, 11111111b ;k3 - psrldq xmm3, 8 - pshuflw xmm3, xmm3, 0b ;k4 - punpcklqdq xmm4, xmm3 ;k3k4 - - movq xmm3, rcx ;rounding - pshufd xmm3, xmm3, 0 - - pxor xmm2, xmm2 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_4 1 - - punpckldq xmm0, xmm1 ;two row in one register - punpcklbw xmm0, xmm2 ;unpack to word - pmullw xmm0, xmm4 ;multiply the filter factors - - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - - paddsw xmm0, xmm3 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - - movd [rdi], xmm0 - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro GET_PARAM 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - - pshuflw xmm6, xmm7, 11111111b ;k3 - pshufhw xmm7, xmm7, 0b ;k4 - punpcklwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movq xmm4, rcx ;rounding - pshufd xmm4, xmm4, 0 - - pxor xmm5, xmm5 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_8 1 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movq [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro APPLY_FILTER_16 1 - punpcklbw xmm0, xmm5 - punpcklbw 
xmm1, xmm5 - punpckhbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - pmullw xmm2, xmm6 - pmullw xmm3, xmm7 - - paddsw xmm0, xmm1 - paddsw xmm2, xmm3 - - paddsw xmm0, xmm4 ;rounding - paddsw xmm2, xmm4 - psraw xmm0, 7 ;shift - psraw xmm2, 7 - packuswb xmm0, xmm2 ;pack back to byte -%if %1 - movdqu xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movdqu [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -SECTION .text - -globalsym(aom_filter_block1d4_v2_sse2) -sym(aom_filter_block1d4_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d8_v2_sse2) -sym(aom_filter_block1d8_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d16_v2_sse2) -sym(aom_filter_block1d16_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d4_h2_sse2) -sym(aom_filter_block1d4_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d8_h2_sse2) -sym(aom_filter_block1d8_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d16_h2_sse2) -sym(aom_filter_block1d16_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c index 9ab9143eee..0b552b704b 100644 --- a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c +++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c @@ -133,7 +133,7 @@ unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { return (avg + 32) >> 6; } -void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) { +static void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) { __m128i sum0, sum1, s0, s1, s2, s3, u0; u0 = _mm_setzero_si128(); s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0); diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h index 7ee8ba330e..e1db3b950c 100644 --- 
a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -30,6 +30,7 @@ #define SUB_EPI16 _mm_sub_epi16 #endif +#if defined(FDCT4x4_2D_HELPER) static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, __m128i *in1) { // Constants @@ -185,7 +186,9 @@ static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, } } } +#endif // defined(FDCT4x4_2D_HELPER) +#if defined(FDCT4x4_2D) void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { // This 2D transform implements 4 vertical 1D transforms followed // by 4 horizontal 1D transforms. The multiplies and adds are as given @@ -205,13 +208,16 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { storeu_output(&in0, output + 0 * 4); storeu_output(&in1, output + 2 * 4); } +#endif // defined(FDCT4x4_2D) +#if defined(FDCT4x4_2D_LP) void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) { __m128i in0, in1; FDCT4x4_2D_HELPER(input, stride, &in0, &in1); _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); } +#endif // defined(FDCT4x4_2D_LP) #if CONFIG_INTERNAL_STATS void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c index b4ff91d856..21e9e8b282 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c @@ -618,9 +618,9 @@ static uint32_t aom_highbd_var_filter_block2d_bil_avx2( return (var > 0) ? var : 0; } -void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum) { +static void highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { __m256i v_sum_d = _mm256_setzero_si256(); __m256i v_sse_d = _mm256_setzero_si256(); for (int i = 0; i < 8; i += 2) { @@ -653,9 +653,9 @@ void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, *sse = _mm_extract_epi32(v_d, 1); } -void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum) { +static void highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { __m256i v_sum_d = _mm256_setzero_si256(); __m256i v_sse_d = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi16(1); @@ -703,19 +703,19 @@ static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); } -#define VAR_FN(w, h, block_size, shift) \ - uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_10_variance_avx2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_avx2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + highbd_calc##block_size##x##block_size##var_avx2, \ + block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ } VAR_FN(128, 128, 16, 14) @@ -741,6 +741,17 @@ VAR_FN(8, 32, 8, 8) #undef VAR_FN +unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + highbd_calc16x16var_avx2, 16); + return *sse; +} + #define SSE2_HEIGHT(H) \ uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ @@ -749,7 +760,7 @@ VAR_FN(8, 32, 8, 8) SSE2_HEIGHT(8) SSE2_HEIGHT(16) -#undef SSE2_Height +#undef SSE2_HEIGHT #define HIGHBD_SUBPIX_VAR(W, H) \ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \ @@ -782,8 +793,8 @@ HIGHBD_SUBPIX_VAR(8, 8) #undef HIGHBD_SUBPIX_VAR -uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16; __m256i src0_8x16, src1_8x16, src_16x16; @@ -840,8 +851,8 @@ uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, return sum; } -uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m256i src0_8x16, src1_8x16, src_16x16; __m256i dst0_8x16, dst1_8x16, dst_16x16; @@ -897,8 +908,8 @@ uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride, assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index e897aab645..2fc2e1c0dd 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -637,8 +637,8 @@ void aom_highbd_dist_wtd_comp_avg_pred_sse2( } } -uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i reg0_4x16, reg1_4x16; __m128i src_8x16; @@ -682,8 +682,8 @@ uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, return sum; } -uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int 
dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i src_8x16; __m128i dst_8x16; @@ -728,8 +728,8 @@ uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride, assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c index fd48260c6f..869f880bda 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c +++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c @@ -940,10 +940,10 @@ static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) { return _mm_unpacklo_epi16((x), _mm_setzero_si128()); } -void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, - const uint8_t *LIBAOM_RESTRICT top_row, - const uint8_t *LIBAOM_RESTRICT left_column, int width, - int height) { +static void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column, + int width, int height) { const uint8_t *const sm_weights_h = smooth_weights + height - 4; const uint8_t *const sm_weights_w = smooth_weights + width - 4; const __m128i zero = _mm_setzero_si128(); diff --git a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c index 799ce9ef44..d96a9dd23d 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c @@ -103,11 +103,12 @@ static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride, pred = _mm_packus_epi16(pred_l, pred_r); \ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); -void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_array[4], int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height, - int inv_mask, unsigned sad_array[4]) { +static void masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[4], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height, int inv_mask, + unsigned sad_array[4]) { const uint8_t *ref0 = ref_array[0]; const uint8_t *ref1 = ref_array[1]; const uint8_t *ref2 = ref_array[2]; @@ -164,11 +165,12 @@ void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); -void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_array[4], int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height, - int inv_mask, unsigned sad_array[4]) { +static void masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[4], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height, int inv_mask, + unsigned sad_array[4]) { const uint8_t *ref0 = ref_array[0]; const uint8_t *ref1 = ref_array[1]; 
const uint8_t *ref2 = ref_array[2]; @@ -224,22 +226,22 @@ void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, msk_stride, m, n, inv_mask, sad_array); \ } -#define MASKSAD8XN_SSSE3(n) \ - void aom_masked_sad8x##n##x4d_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int inv_mask, unsigned sad_array[4]) { \ - aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ - 8, msk, msk_stride, n, inv_mask, sad_array); \ +#define MASKSAD8XN_SSSE3(n) \ + void aom_masked_sad8x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[4]) { \ + masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 8, \ + msk, msk_stride, n, inv_mask, sad_array); \ } -#define MASKSAD4XN_SSSE3(n) \ - void aom_masked_sad4x##n##x4d_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int inv_mask, unsigned sad_array[4]) { \ - aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ - 4, msk, msk_stride, n, inv_mask, sad_array); \ +#define MASKSAD4XN_SSSE3(n) \ + void aom_masked_sad4x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[4]) { \ + masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 4, \ + msk, msk_stride, n, inv_mask, sad_array); \ } MASKSADMXN_SSSE3(128, 128) diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm index d1d8373456..f424ce01dd 100644 --- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm +++ b/third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm @@ -15,21 +15,6 @@ SECTION_RODATA pw_8: times 8 dw 8 -bilin_filter_m_sse2: times 8 dw 16 - times 8 dw 0 - times 8 dw 14 - times 8 dw 2 - times 8 dw 12 - times 8 dw 4 - times 8 dw 10 - times 8 dw 6 - times 16 dw 8 - times 8 dw 6 - times 8 dw 10 - times 8 dw 4 - times 8 dw 12 - times 8 dw 2 - times 8 dw 14 bilin_filter_m_ssse3: times 8 db 16, 0 times 8 db 14, 2 @@ -109,9 +94,6 @@ SECTION .text %if cpuflag(ssse3) %define bilin_filter_m bilin_filter_m_ssse3 %define filter_idx_shift 4 -%else -%define bilin_filter_m bilin_filter_m_sse2 -%define filter_idx_shift 5 %endif ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses ; 11, not 13, if the registers are ordered correctly. May make a minor speed @@ -1449,21 +1431,11 @@ SECTION .text ; location in the sse/2 version, rather than duplicating that code in the ; binary. 
-INIT_XMM sse2 -SUBPEL_VARIANCE 4 -SUBPEL_VARIANCE 8 -SUBPEL_VARIANCE 16 - INIT_XMM ssse3 SUBPEL_VARIANCE 4 SUBPEL_VARIANCE 8 SUBPEL_VARIANCE 16 -INIT_XMM sse2 -SUBPEL_VARIANCE 4, 1 -SUBPEL_VARIANCE 8, 1 -SUBPEL_VARIANCE 16, 1 - INIT_XMM ssse3 SUBPEL_VARIANCE 4, 1 SUBPEL_VARIANCE 8, 1 diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index 6744ec51d0..74318de2e5 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -46,6 +46,25 @@ static INLINE __m128i xx_loadu_128(const void *a) { return _mm_loadu_si128((const __m128i *)a); } + +// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function +// manually on older compilers. +#if !defined(__clang__) && __GNUC_MAJOR__ < 9 +static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) { + __m64 hi_, lo_; + memcpy(&hi_, hi, sizeof(hi_)); + memcpy(&lo_, lo, sizeof(lo_)); + return _mm_set_epi64(hi_, lo_); +} +#else +// Load 64 bits from each of hi and low, and pack into an SSE register +// Since directly loading as `int64_t`s and using _mm_set_epi64 may violate +// the strict aliasing rule, this takes a different approach +static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) { + return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi)); +} +#endif + static INLINE void xx_storel_32(void *const a, const __m128i v) { const int val = _mm_cvtsi128_si32(v); memcpy(a, &val, sizeof(val)); diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h index b729e5f410..7548d4d4f4 100644 --- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h +++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h @@ -43,6 +43,16 @@ static INLINE void yy_storeu_256(void *const a, const __m256i v) { _mm256_storeu_si256((__m256i *)a, v); } +// Fill an AVX register using an interleaved pair of values, ie. set the +// 16 channels to {a, b} repeated 8 times, using the same channel ordering +// as when a register is stored to / loaded from memory. +// +// This is useful for rearranging filter kernels for use with the _mm_madd_epi16 +// instruction +static INLINE __m256i yy_set2_epi16(int16_t a, int16_t b) { + return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b); +} + // The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio // compilers. The following function is equivalent to _mm256_set1_epi64x() // acting on a 32-bit integer. 
@@ -61,11 +71,26 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); } +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +// _mm256_loadu2_m128i has been introduced in GCC 10.1 +#if !defined(__clang__) && GCC_VERSION < 101000 +static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { + __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); + __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); + return _mm256_set_m128i(mhi, mlo); +} +#else static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); return yy_set_m128i(mhi, mlo); } +#endif + +#undef GCC_VERSION static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) { _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1)); diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c index 046d6f10f8..0f872fc392 100644 --- a/third_party/aom/aom_dsp/x86/variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -518,8 +518,8 @@ void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, } } -uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8; __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16; @@ -575,8 +575,9 @@ uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, // In src buffer, each 4x4 block in a 32x32 filter block is stored sequentially. // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame // buffer, thus dstride is a frame level stride. -uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int src_blk_stride, int h) { +static uint64_t mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, + uint16_t *src, int src_blk_stride, + int h) { uint64_t sum = 0; __m128i dst0_16x8, dst1_16x8, dst2_16x8, dst3_16x8; __m256i dst0_16x16, dst1_16x16, dst2_16x16, dst3_16x16; @@ -665,8 +666,8 @@ uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, return sum; } -uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_8x8, dst1_8x8, dst3_16x8; __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16; @@ -715,8 +716,9 @@ uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, // In src buffer, each 8x8 block in a 64x64 filter block is stored sequentially. // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame // buffer, thus dstride is a frame level stride. 
-uint64_t aom_mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int src_blk_stride, int h) { +static uint64_t mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, + uint16_t *src, int src_blk_stride, + int h) { uint64_t sum = 0; __m128i dst0_16x8, dst1_16x8; __m256i dst0_16x16, dst1_16x16; @@ -780,8 +782,8 @@ uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must be satisfied"); switch (w) { - case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_avx2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_avx2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } @@ -795,8 +797,8 @@ uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must be satisfied"); switch (w) { - case 4: return aom_mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h); - case 8: return aom_mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h); + case 4: return mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h); + case 8: return mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h); default: assert(0 && "unsupported width"); return -1; } } diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c index 9e9e70ea01..57a1cee781 100644 --- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c @@ -648,7 +648,7 @@ MAKE_SUB_PIXEL_VAR_16XH(4, 2) #endif #define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \ - int aom_sub_pixel_avg_variance32x##height##_imp_avx2( \ + static int sub_pixel_avg_variance32x##height##_imp_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \ unsigned int *sse) { \ @@ -876,7 +876,7 @@ MAKE_SUB_PIXEL_VAR_16XH(4, 2) const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse, \ const uint8_t *sec_ptr) { \ - const int sum = aom_sub_pixel_avg_variance32x##height##_imp_avx2( \ + const int sum = sub_pixel_avg_variance32x##height##_imp_avx2( \ src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \ sse); \ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \ @@ -899,7 +899,7 @@ MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4) const uint8_t *sec_ptr = sec; \ for (int j = 0; j < (h / hf); ++j) { \ unsigned int sse2; \ - const int se2 = aom_sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \ + const int se2 = sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ sec_ptr, w, &sse2); \ dst_ptr += hf * dst_stride; \ diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index faec9cf73d..81b30072a5 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -415,7 +415,6 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, DECL(8, opt); \ DECL(16, opt) -DECLS(sse2); DECLS(ssse3); #undef DECLS #undef DECL @@ -492,7 +491,6 @@ DECLS(ssse3); FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) #endif -FNS(sse2) FNS(ssse3) #undef FNS @@ -510,7 +508,6 @@ FNS(ssse3) DECL(8, opt); \ DECL(16, opt) -DECLS(sse2); DECLS(ssse3); #undef 
DECL #undef DECLS @@ -591,7 +588,6 @@ DECLS(ssse3); FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) #endif -FNS(sse2) FNS(ssse3) #undef FNS @@ -710,8 +706,8 @@ void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, } } -uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_8x8, dst1_8x8, dst_16x8; __m128i src0_16x4, src1_16x4, src_16x8; @@ -744,8 +740,8 @@ uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, return sum; } -uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst_8x8, dst_16x8; __m128i src_16x8; @@ -781,8 +777,8 @@ uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_sse2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_sse2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } |
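
The VAR_FN macro reindented in the highbd_variance_avx2.c hunk computes block variance as SSE minus the squared sum divided by the block size, with the division done as a right shift because w*h is a power of two (e.g. VAR_FN(128, 128, 16, 14) passes shift 14 = log2(128*128)). The scalar sketch below only illustrates that final identity; the function and variable names are made up for the example, and the bit-depth rescaling that highbd_10_variance_avx2() applies to sse and sum before this step is omitted.

// Scalar reference for the variance identity used by the VAR_FN macro in
// highbd_variance_avx2.c. Hypothetical standalone file; builds with any
// C99 compiler.
#include <stdint.h>
#include <stdio.h>

static uint32_t variance_ref(const uint16_t *src, int src_stride,
                             const uint16_t *ref, int ref_stride,
                             int w, int h, int shift /* log2(w * h) */) {
  int64_t sum = 0, sse = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += d;
      sse += (int64_t)d * d;
    }
  }
  // Same final step as the macro: var = SSE - sum^2 / (w*h), clamped at 0.
  const int64_t var = sse - ((sum * sum) >> shift);
  return var >= 0 ? (uint32_t)var : 0;
}

int main(void) {
  // 4x4 toy block: src exceeds ref by 0..15, so sum = 120 and SSE = 1240.
  uint16_t src[16], ref[16];
  for (int i = 0; i < 16; ++i) {
    src[i] = (uint16_t)(100 + i);
    ref[i] = 100;
  }
  // shift = log2(4 * 4) = 4; expected output: 1240 - 14400/16 = 340.
  printf("%u\n", variance_ref(src, 4, ref, 4, 4, 4, 4));
  return 0;
}

The clamp to zero mirrors the macro's `(var >= 0) ? (uint32_t)var : 0` return, which guards against the rounded SSE dipping below sum^2/N for near-constant blocks.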
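
The synonyms.h hunk adds xx_loadu_2x64() with two build-dependent bodies because, per the patch comment, _mm_loadu_si64() only appeared in GCC 9 and loading the halves directly as int64_t could violate strict aliasing. The sketch below shows the same pattern in isolation under those assumptions; it uses _mm_loadl_epi64(), an older SSE2 intrinsic, rather than either of the patch's variants, and the function name and test data are made up for the example.

// Portable illustration of the xx_loadu_2x64()-style helper: copy each
// 64-bit half through memcpy so unaligned buffers of any element type can
// be read without aliasing trouble, then pack the two halves into one
// SSE register (lo in bytes 0..7, hi in bytes 8..15).
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static __m128i loadu_2x64(const void *hi, const void *lo) {
  uint64_t hi64, lo64;
  memcpy(&hi64, hi, sizeof(hi64));
  memcpy(&lo64, lo, sizeof(lo64));
  const __m128i vhi = _mm_loadl_epi64((const __m128i *)&hi64);
  const __m128i vlo = _mm_loadl_epi64((const __m128i *)&lo64);
  return _mm_unpacklo_epi64(vlo, vhi);
}

int main(void) {
  const uint8_t row0[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  const uint8_t row1[8] = {9, 10, 11, 12, 13, 14, 15, 16};
  uint8_t out[16];
  _mm_storeu_si128((__m128i *)out, loadu_2x64(row1, row0));
  for (int i = 0; i < 16; ++i) printf("%d ", out[i]);  // expect 1..16
  printf("\n");
  return 0;
}

Going through memcpy into a local uint64_t keeps the read well-defined for any source type; with optimisation enabled it generally compiles down to the same unaligned 64-bit load the intrinsic would emit.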
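
The synonyms_avx2.h hunk adds yy_set2_epi16(), which its comment describes as filling all sixteen 16-bit lanes with an interleaved {a, b} pair for use with _mm_madd_epi16. Below is a minimal standalone sketch of that usage, assuming a made-up 2-tap kernel {12, 4}; the kernel values, sample data and helper name are illustrative only and not taken from the library.

// Build with AVX2 enabled, e.g. cc -mavx2 -O2.
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Same idea as yy_set2_epi16(): broadcast an interleaved {a, b} pair into
// all sixteen 16-bit lanes, so one _mm256_madd_epi16() computes
// a*x0 + b*x1 for each adjacent lane pair.
static __m256i set2_epi16(int16_t a, int16_t b) {
  return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
}

int main(void) {
  // Hypothetical 2-tap kernel {12, 4}; the taps sum to 16 so the result
  // can be normalised with a shift by 4.
  const __m256i filt = set2_epi16(12, 4);

  // Eight adjacent sample pairs {src[i], src[i+1]}, interleaved the way
  // the kernel layout expects.
  const int16_t src[9] = {0, 16, 32, 48, 64, 80, 96, 112, 128};
  int16_t pairs[16];
  for (int i = 0; i < 8; ++i) {
    pairs[2 * i + 0] = src[i];
    pairs[2 * i + 1] = src[i + 1];
  }

  const __m256i in = _mm256_loadu_si256((const __m256i *)pairs);
  // madd: eight 32-bit sums of 12*src[i] + 4*src[i+1].
  const __m256i acc = _mm256_madd_epi16(in, filt);
  // Normalise by the tap sum (16).
  const __m256i out = _mm256_srai_epi32(acc, 4);

  int32_t res[8];
  _mm256_storeu_si256((__m256i *)res, out);
  for (int i = 0; i < 8; ++i) printf("%d ", (int)res[i]);  // expect 4 20 36 ...
  printf("\n");
  return 0;
}

Because madd multiplies adjacent lane pairs and adds them into 32-bit accumulators, storing the filter as a repeated {f0, f1} pair turns each interleaved sample pair into f0*x[i] + f1*x[i+1] in a single instruction, which is the usual trick for 2-tap filtering with this instruction.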