Diffstat (limited to 'media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm')
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm  486
1 file changed, 486 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
new file mode 100644
index 0000000000..d310a83dad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
@@ -0,0 +1,486 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => wd
+; r3 => ht
+
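+; The stack offsets used below (filter at [sp, #104], y0_q4 at [sp, #116],
+; wd at [sp, #124], ht at [sp, #128]) line up with libvpx's convolve
+; prototype once the 40-byte stmfd and the 64-byte vpush have executed.
+; A sketch, assuming the standard signature:
+;
+;   void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+;                                    uint8_t *dst, ptrdiff_t dst_stride,
+;                                    const InterpKernel *filter, int x0_q4,
+;                                    int x_step_q4, int y0_q4, int y_step_q4,
+;                                    int w, int h);
+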
+ EXPORT |vpx_convolve8_avg_vert_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_vert_filter_type1_neon| PROC
+
+    stmfd             sp!, {r4 - r12, r14}    ;save callee-saved registers
+                                              ; and the return address
+ vpush {d8 - d15} ; stack offset by 64
+    mov               r4, r1                  ;swap r1 and r2 so that r1 = dst
+    mov               r1, r2                  ; and r2 = src_stride
+    mov               r2, r4
+    vmov.i16          q15, #0x4000            ;+0x4000 rounding bias
+    mov               r11, #0xc000            ;-0x4000 accumulator seed
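+    ;(note, inferred from the arithmetic below: every accumulator is seeded
+    ; with -0x4000 via r11; vhadd.s16 with q15 (+0x4000) then yields
+    ; sum >> 1, and the vqrshrun.s16 #6 narrowing completes a rounded shift
+    ; by FILTER_BITS == 7 while keeping the intermediate within 16 bits)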
+ ldr r12, [sp, #104] ;load filter
+ ldr r6, [sp, #116] ;load y0_q4
+ add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+    mov               r6, r3                  ;r6 = dst_stride
+ ldr r5, [sp, #124] ;load wd
+    vld2.8            {d0, d1}, [r12]         ;deinterleave the eight 16-bit
+                                              ; taps; low bytes land in d0
+    sub               r12, r2, r2, lsl #2     ;r12 = -3 * src_strd
+    vabs.s8           d0, d0                  ;coeffabs = vabs_s8(coeff)
+    add               r0, r0, r12             ;step src back 3 rows so the 8
+                                              ; taps cover rows -3 .. +4
+    ldr               r3, [sp, #128]          ;load ht
+    subs              r7, r3, #0              ;r7 = ht, set flags
+ vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0);
+ cmp r5, #8
+ vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1);
+ vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2);
+ vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3);
+ vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4);
+ vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5);
+ vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6);
+ vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7);
+ blt core_loop_wd_4 ;core loop wd 4 jump
+ str r0, [sp, #-4]!
+ str r1, [sp, #-4]!
+ bic r4, r5, #7 ;r5 ->wd
+ rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r4, r2, lsl #2 ;r2->src_strd
+    mov               r3, r5, lsr #3          ;r3 = wd / 8
+    mul               r7, r3                  ;loop count = ht * (wd / 8)
+    sub               r7, #4                  ;reserve 4 rows for the epilog
+
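+;(note: the prolog below primes the eight-row tap window and puts four
+; output rows in flight at once -- accumulators q4..q7 -- so the main loop
+; can overlap loads, multiplies and stores across iterations)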
+prolog
+ and r10, r0, #31
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vdup.16 q4, r11
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ subs r4, r4, #8
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8          q4, d1, d23             ;mul_res1 = vmlsl_u8(mul_res1,
+                                              ; src_tmp2, coeffabs_1);
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8          q4, d2, d24             ;mul_res1 = vmlal_u8(mul_res1,
+                                              ; src_tmp3, coeffabs_2);
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8          q4, d5, d27             ;mul_res1 = vmlal_u8(mul_res1,
+                                              ; src_tmp2, coeffabs_5);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8          q4, d6, d28             ;mul_res1 = vmlsl_u8(mul_res1,
+                                              ; src_tmp3, coeffabs_6);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vdup.16 q5, r11
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8          q5, d2, d23             ;mul_res2 = vmlsl_u8(mul_res2,
+                                              ; src_tmp3, coeffabs_1);
+ addle r0, r0, r8
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+    vmlal.u8          q5, d3, d24             ;mul_res2 = vmlal_u8(mul_res2,
+                                              ; src_tmp4, coeffabs_2);
+ pld [r3]
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ pld [r3, r2]
+ pld [r3, r2, lsl #1]
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r3, r3, r2
+    vmlal.u8          q5, d6, d27             ;mul_res2 = vmlal_u8(mul_res2,
+                                              ; src_tmp3, coeffabs_5);
+ pld [r3, r2, lsl #1]
+    vmlsl.u8          q5, d7, d28             ;mul_res2 = vmlsl_u8(mul_res2,
+                                              ; src_tmp4, coeffabs_6);
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r1]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d3, d23
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d2, d22
+    vrhadd.u8         d8, d8, d20             ;avg with dst: (res + dst + 1) >> 1
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d4, d24
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d5, d25
+ vmlal.u8 q6, d6, d26
+ add r14, r1, r6
+ vmlal.u8 q6, d7, d27
+ vmlsl.u8 q6, d16, d28
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ addle r1, r1, r9
+ vmlsl.u8 q7, d4, d23
+ subs r7, r7, #4
+ vmlsl.u8 q7, d3, d22
+ vmlal.u8 q7, d5, d24
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d6, d25
+ vrhadd.u8 d10, d10, d20
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+ blt epilog_end ;jumps to epilog_end
+
+ beq epilog ;jumps to epilog
+
+main_loop_8
+ subs r4, r4, #8
+    vmlsl.u8          q4, d1, d23             ;mul_res1 = vmlsl_u8(mul_res1,
+                                              ; src_tmp2, coeffabs_1);
+ vld1.u8 {d20}, [r14]
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ addle r0, r0, r8
+ bicle r4, r5, #7 ;r5 ->wd
+    vmlal.u8          q4, d2, d24             ;mul_res1 = vmlal_u8(mul_res1,
+                                              ; src_tmp3, coeffabs_2);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vrhadd.u8 d12, d12, d20
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8          q4, d5, d27             ;mul_res1 = vmlal_u8(mul_res1,
+                                              ; src_tmp2, coeffabs_5);
+    vmlsl.u8          q4, d6, d28             ;mul_res1 = vmlsl_u8(mul_res1,
+                                              ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d14, q7, #6
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+    vmlsl.u8          q5, d2, d23             ;mul_res2 = vmlsl_u8(mul_res2,
+                                              ; src_tmp3, coeffabs_1);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vrhadd.u8 d14, d14, d20
+    vmlal.u8          q5, d3, d24             ;mul_res2 = vmlal_u8(mul_res2,
+                                              ; src_tmp4, coeffabs_2);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vst1.8 {d14}, [r14], r6
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r14, r1, #0
+    vmlal.u8          q5, d6, d27             ;mul_res2 = vmlal_u8(mul_res2,
+                                              ; src_tmp3, coeffabs_5);
+ add r1, r1, #8
+    vmlsl.u8          q5, d7, d28             ;mul_res2 = vmlsl_u8(mul_res2,
+                                              ; src_tmp4, coeffabs_6);
+ addle r1, r1, r9
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vmlsl.u8 q6, d3, d23
+    add               r10, r3, r2, lsl #3     ;point r10 10 rows ahead for pld
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ add r10, r10, r2 ; 11*strd
+ vmlal.u8 q6, d4, d24
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
+ pld [r10] ;11+ 0
+ vmlal.u8 q6, d7, d27
+ pld [r10, r2] ;11+ 1*strd
+ pld [r10, r2, lsl #1] ;11+ 2*strd
+ vmlsl.u8 q6, d16, d28
+ add r10, r10, r2 ;12*strd
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+
+ pld [r10, r2, lsl #1] ;11+ 3*strd
+ vmlsl.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vrhadd.u8 d10, d10, d20
+ subs r7, r7, #4
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vqrshrun.s16 d12, q6, #6
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ bgt main_loop_8 ;jumps to main_loop_8
+
+epilog
+ vld1.u8 {d20}, [r14]
+    vmlsl.u8          q4, d1, d23             ;mul_res1 = vmlsl_u8(mul_res1,
+                                              ; src_tmp2, coeffabs_1);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+    vmlal.u8          q4, d2, d24             ;mul_res1 = vmlal_u8(mul_res1,
+                                              ; src_tmp3, coeffabs_2);
+ vrhadd.u8 d12, d12, d20
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+    vmlal.u8          q4, d5, d27             ;mul_res1 = vmlal_u8(mul_res1,
+                                              ; src_tmp2, coeffabs_5);
+    vmlsl.u8          q4, d6, d28             ;mul_res1 = vmlsl_u8(mul_res1,
+                                              ; src_tmp3, coeffabs_6);
+ vst1.8 {d12}, [r14], r6
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8          q5, d2, d23             ;mul_res2 = vmlsl_u8(mul_res2,
+                                              ; src_tmp3, coeffabs_1);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vrhadd.u8 d14, d14, d20
+    vmlal.u8          q5, d3, d24             ;mul_res2 = vmlal_u8(mul_res2,
+                                              ; src_tmp4, coeffabs_2);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+    vmlal.u8          q5, d6, d27             ;mul_res2 = vmlal_u8(mul_res2,
+                                              ; src_tmp3, coeffabs_5);
+    vmlsl.u8          q5, d7, d28             ;mul_res2 = vmlsl_u8(mul_res2,
+                                              ; src_tmp4, coeffabs_6);
+ vst1.8 {d14}, [r14], r6
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r1]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d3, d23
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ vmlal.u8 q6, d4, d24
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vmlal.u8 q6, d7, d27
+ add r14, r1, r6
+ vmlsl.u8 q6, d16, d28
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vrhadd.u8 d10, d10, d20
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vhadd.s16 q6, q6, q15
+ vmlal.u8 q7, d7, d26
+ vmlal.u8 q7, d16, d27
+ vmlsl.u8 q7, d17, d28
+ vmlsl.u8 q7, d18, d29
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+
+epilog_end
+ vld1.u8 {d20}, [r14]
+ vrhadd.u8 d12, d12, d20
+ vst1.8 {d12}, [r14], r6
+ vhadd.s16 q7, q7, q15
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d20}, [r14]
+ vrhadd.u8 d14, d14, d20
+ vst1.8 {d14}, [r14], r6
+
+end_loops
+ tst r5, #7
+ ldr r1, [sp], #4
+ ldr r0, [sp], #4
+ vpopeq {d8 - d15}
+ ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp
+ mov r5, #4
+ add r0, r0, #8
+ add r1, r1, #8
+ mov r7, #16
+
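+;(note: this path covers wd == 4 and the wd % 8 == 4 remainder from above;
+; each d-register packs two 4-pixel rows, and one inner-loop iteration
+; filters, averages and stores a 4x4 block)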
+core_loop_wd_4
+ rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r5, r2, lsl #2 ;r2->src_strd
+ vmov.i8 d4, #0
+
+outer_loop_wd_4
+    subs              r12, r5, #0             ;r12 = wd, set flags
+    ble               end_inner_loop_wd_4     ;skip the inner loop when wd <= 0
+
+inner_loop_wd_4
+ add r3, r0, r2
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ subs r12, r12, #4
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 0);
+ vdup.16 q0, r11
+    vmlsl.u8          q0, d5, d23             ;mul_res1 = vmlsl_u8(mul_res1,
+                                              ; vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ add r0, r0, #4
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlal.u8          q0, d6, d24             ;mul_res1 = vmlal_u8(mul_res1,
+                                              ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+ vdup.16 q4, r11
+ vmlsl.u8 q4, d7, d23
+ vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
+ ; 1);
+ vmull.u8 q1, d7, d25 ;mul_res2 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ vmlsl.u8 q4, d6, d22
+ vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vmlal.u8 q4, d4, d24
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlal.u8          q1, d5, d27             ;mul_res2 = vmlal_u8(mul_res2,
+                                              ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ vmlal.u8 q4, d5, d25
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8          q0, d6, d28             ;mul_res1 = vmlsl_u8(mul_res1,
+                                              ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vmlal.u8 q4, d6, d26
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+ vdup.u32 d4, d7[1]
+ vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
+ ; mul_res2);
+ vmlal.u8 q4, d7, d27
+ vld1.u32 {d4[1]},[r3], r2
+ vmlsl.u8 q4, d4, d28
+ vdup.u32 d5, d4[1]
+ vhadd.s16 q0, q0, q15
+ vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u32 {d5[1]},[r3]
+ add r3, r1, r6
+ vld1.u32 {d20[0]}, [r1]
+ vld1.u32 {d20[1]}, [r3]
+ vrhadd.u8 d0, d0, d20
+ vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
+ ; vreinterpret_u32_u8(sto_res), 0);
+ vmlsl.u8 q4, d5, d29
+ vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
+ ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+ vhadd.s16 q4, q4, q15
+ vqrshrun.s16 d8, q4, #6
+ mov r4, r3
+ vld1.u32 {d20[0]}, [r4], r6
+ vld1.u32 {d20[1]}, [r4]
+ vrhadd.u8 d8, d8, d20
+ vst1.32 {d8[0]},[r3], r6
+ add r1, r1, #4
+ vst1.32 {d8[1]},[r3]
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4
+ subs r7, r7, #4
+ add r1, r1, r9
+ add r0, r0, r8
+ bgt outer_loop_wd_4
+
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
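
For reference, a scalar sketch of what this kernel computes, assuming
libvpx's usual FILTER_BITS == 7 rounding and the convolve8 argument order;
the helper names below are illustrative, not part of this file:

    #include <stddef.h>
    #include <stdint.h>

    #define FILTER_BITS 7
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

    static uint8_t clip_pixel(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* 8-tap vertical convolution, rounded to 8 bits, then round-averaged
     * with the existing dst pixel (the vrhadd.u8 step in the NEON code). */
    static void convolve8_avg_vert_ref(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_y, int w, int h) {
      src -= 3 * src_stride;                  /* taps cover rows -3 .. +4 */
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          int sum = 0;
          for (int k = 0; k < 8; ++k)
            sum += (int)src[x + k * src_stride] * filter_y[k];
          const int res = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
          dst[x] = (uint8_t)ROUND_POWER_OF_TWO(dst[x] + res, 1);
        }
        src += src_stride;
        dst += dst_stride;
      }
    }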