diff options
Diffstat (limited to '')
-rw-r--r-- | third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c | 73 |
1 files changed, 27 insertions, 46 deletions
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c index c314c0a192..df6e4d2ab5 100644 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -15,7 +15,6 @@ #include <string.h> #include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" @@ -246,7 +245,6 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; uint8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); @@ -260,31 +258,25 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ + uint8x16_t s0123, s1234, s2345, s3456; transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + uint8x16_t s4567, s5678, s6789, s78910; transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. */ @@ -294,12 +286,12 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_usdot_partial(s0123, s4567, filter); - d1 = convolve8_4_usdot_partial(s1234, s5678, filter); - d2 = convolve8_4_usdot_partial(s2345, s6789, filter); - d3 = convolve8_4_usdot_partial(s3456, s78910, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_usdot_partial(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_usdot_partial(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_usdot_partial(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_usdot_partial(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); @@ -317,29 +309,21 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; do { - height = h; - s = src; - d = dst; + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, @@ -348,16 +332,13 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); @@ -374,14 +355,14 @@ void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filter); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filter); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filter); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filter); + uint8x8_t d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, + s4567_hi, filter); + uint8x8_t d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, + s5678_hi, filter); + uint8x8_t d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, + s6789_hi, filter); + uint8x8_t d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, + s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); |