Diffstat (limited to 'media/ffvpx/libavutil/arm/float_dsp_vfp.S')
-rw-r--r--  media/ffvpx/libavutil/arm/float_dsp_vfp.S | 457
1 file changed, 457 insertions, 0 deletions
diff --git a/media/ffvpx/libavutil/arm/float_dsp_vfp.S b/media/ffvpx/libavutil/arm/float_dsp_vfp.S
new file mode 100644
index 0000000000..7db2452284
--- /dev/null
+++ b/media/ffvpx/libavutil/arm/float_dsp_vfp.S
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+/**
+ * Assume that len is a positive number and a multiple of 8
+ */
+@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
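+@
+@ For reference only: a hedged scalar C sketch of the operation performed
+@ below (the illustrative name vector_fmul_ref is not part of this file):
+@
+@     static void vector_fmul_ref(float *dst, const float *src0,
+@                                 const float *src1, int len)
+@     {
+@         for (int i = 0; i < len; i++)
+@             dst[i] = src0[i] * src1[i];   /* element-wise product */
+@     }
+@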
+function ff_vector_fmul_vfp, export=1
+ vpush {d8-d15}
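+ @ Legacy VFP short-vector mode: FPSCR bits [18:16] form the LEN field
+ @ (vector length = LEN + 1), so writing 3 below selects 4-element short
+ @ vectors; bits [21:20] (STRIDE) stay 0 for a stride of 1.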
+ fmrx r12, fpscr
+ orr r12, r12, #(3 << 16) /* set vector size to 4 */
+ fmxr fpscr, r12
+
+ vldmia r1!, {s0-s3}
+ vldmia r2!, {s8-s11}
+ vldmia r1!, {s4-s7}
+ vldmia r2!, {s12-s15}
+ vmul.f32 s8, s0, s8
+1:
+ subs r3, r3, #16
+ vmul.f32 s12, s4, s12
+ itttt ge
+ vldmiage r1!, {s16-s19}
+ vldmiage r2!, {s24-s27}
+ vldmiage r1!, {s20-s23}
+ vldmiage r2!, {s28-s31}
+ it ge
+ vmulge.f32 s24, s16, s24
+ vstmia r0!, {s8-s11}
+ vstmia r0!, {s12-s15}
+ it ge
+ vmulge.f32 s28, s20, s28
+ itttt gt
+ vldmiagt r1!, {s0-s3}
+ vldmiagt r2!, {s8-s11}
+ vldmiagt r1!, {s4-s7}
+ vldmiagt r2!, {s12-s15}
+ ittt ge
+ vmulge.f32 s8, s0, s8
+ vstmiage r0!, {s24-s27}
+ vstmiage r0!, {s28-s31}
+ bgt 1b
+
+ bic r12, r12, #(7 << 16) /* set vector size back to 1 */
+ fmxr fpscr, r12
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+/**
+ * ARM VFP implementation of 'vector_fmul_window_c' function
+ * Assume that len is a positive non-zero number
+ */
+@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
+@ const float *src1, const float *win, int len)
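+@
+@ For reference only: a hedged scalar C sketch of what this routine computes,
+@ mirroring the vector_fmul_window_c behaviour named above (the name
+@ vector_fmul_window_ref is illustrative, not part of this file):
+@
+@     static void vector_fmul_window_ref(float *dst, const float *src0,
+@                                        const float *src1, const float *win,
+@                                        int len)
+@     {
+@         dst += len; win += len; src0 += len;
+@         for (int i = -len, j = len - 1; i < 0; i++, j--) {
+@             float s0 = src0[i], s1 = src1[j];
+@             float wi = win[i],  wj = win[j];
+@             dst[i] = s0 * wj - s1 * wi;   /* first half of the window */
+@             dst[j] = s0 * wi + s1 * wj;   /* mirrored second half     */
+@         }
+@     }
+@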
+function ff_vector_fmul_window_vfp, export=1
+DST0 .req a1
+SRC0 .req a2
+SRC1 .req a3
+WIN0 .req a4
+LEN .req v1
+DST1 .req v2
+WIN1 .req v3
+OLDFPSCR .req ip
+
+ push {v1-v3,lr}
+ ldr LEN, [sp, #4*4+0]
+ vpush {s16-s31}
+ fmrx OLDFPSCR, FPSCR
+ add DST1, DST0, LEN, lsl #3
+ add SRC1, SRC1, LEN, lsl #2
+ add WIN1, WIN0, LEN, lsl #3
+
+ tst LEN, #7
+ beq 4f @ common case: len is a multiple of 8
+
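+ @ 0x03000000 sets the FPSCR DN and FZ bits (default-NaN + flush-to-zero,
+ @ i.e. VFP "RunFast" operation) with the LEN/STRIDE fields cleared, so the
+ @ following instructions execute as plain scalars.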
+ ldr lr, =0x03000000 @ RunFast mode, scalar mode
+ fmxr FPSCR, lr
+
+ tst LEN, #1
+ beq 1f
+ vldmdb WIN1!, {s0}
+ vldmia SRC0!, {s8}
+ vldmia WIN0!, {s16}
+ vmul.f s24, s0, s8
+ vldmdb SRC1!, {s20}
+ vmul.f s8, s16, s8
+ vmls.f s24, s16, s20
+ vmla.f s8, s0, s20
+ vstmia DST0!, {s24}
+ vstmdb DST1!, {s8}
+1:
+ tst LEN, #2
+ beq 2f
+ vldmdb WIN1!, {s0}
+ vldmdb WIN1!, {s1}
+ vldmia SRC0!, {s8-s9}
+ vldmia WIN0!, {s16-s17}
+ vmul.f s24, s0, s8
+ vmul.f s25, s1, s9
+ vldmdb SRC1!, {s20}
+ vldmdb SRC1!, {s21}
+ vmul.f s8, s16, s8
+ vmul.f s9, s17, s9
+ vmls.f s24, s16, s20
+ vmls.f s25, s17, s21
+ vmla.f s8, s0, s20
+ vmla.f s9, s1, s21
+ vstmia DST0!, {s24-s25}
+ vstmdb DST1!, {s8}
+ vstmdb DST1!, {s9}
+2:
+ tst LEN, #4
+ beq 3f
+ vldmdb WIN1!, {s0}
+ vldmdb WIN1!, {s1}
+ vldmdb WIN1!, {s2}
+ vldmdb WIN1!, {s3}
+ vldmia SRC0!, {s8-s11}
+ vldmia WIN0!, {s16-s19}
+ vmul.f s24, s0, s8
+ vmul.f s25, s1, s9
+ vmul.f s26, s2, s10
+ vmul.f s27, s3, s11
+ vldmdb SRC1!, {s20}
+ vldmdb SRC1!, {s21}
+ vldmdb SRC1!, {s22}
+ vldmdb SRC1!, {s23}
+ vmul.f s8, s16, s8
+ vmul.f s9, s17, s9
+ vmul.f s10, s18, s10
+ vmul.f s11, s19, s11
+ vmls.f s24, s16, s20
+ vmls.f s25, s17, s21
+ vmls.f s26, s18, s22
+ vmls.f s27, s19, s23
+ vmla.f s8, s0, s20
+ vmla.f s9, s1, s21
+ vmla.f s10, s2, s22
+ vmla.f s11, s3, s23
+ vstmia DST0!, {s24-s27}
+ vstmdb DST1!, {s8}
+ vstmdb DST1!, {s9}
+ vstmdb DST1!, {s10}
+ vstmdb DST1!, {s11}
+3:
+ bics LEN, LEN, #7
+ beq 7f
+4:
+ ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
+ fmxr FPSCR, lr
+
+ vldmdb WIN1!, {s0}
+ vldmdb WIN1!, {s1}
+ vldmdb WIN1!, {s2}
+ vldmdb WIN1!, {s3}
+ vldmia SRC0!, {s8-s11}
+ vldmia WIN0!, {s16-s19}
+ vmul.f s24, s0, s8 @ vector * vector
+ vldmdb SRC1!, {s20}
+ vldmdb SRC1!, {s21}
+ vldmdb SRC1!, {s22}
+ vldmdb SRC1!, {s23}
+ vmul.f s8, s16, s8 @ vector * vector
+ vmls.f s24, s16, s20 @ vector * vector
+ vldmdb WIN1!, {s4}
+ vldmdb WIN1!, {s5}
+ vldmdb WIN1!, {s6}
+ vldmdb WIN1!, {s7}
+ vldmia SRC0!, {s12-s13}
+ vmla.f s8, s0, s20 @ vector * vector
+ vldmia SRC0!, {s14-s15}
+ subs LEN, LEN, #8
+ beq 6f
+5: vldmia WIN0!, {s20-s23}
+ vmul.f s28, s4, s12 @ vector * vector
+ vstmia DST0!, {s24-s25}
+ vldmdb SRC1!, {s16}
+ vldmdb SRC1!, {s17}
+ vldmdb SRC1!, {s18}
+ vldmdb SRC1!, {s19}
+ vmul.f s12, s20, s12 @ vector * vector
+ vstmia DST0!, {s26-s27}
+ vstmdb DST1!, {s8}
+ vstmdb DST1!, {s9}
+ vstmdb DST1!, {s10}
+ vstmdb DST1!, {s11}
+ vmls.f s28, s20, s16 @ vector * vector
+ vldmdb WIN1!, {s0}
+ vldmdb WIN1!, {s1}
+ vldmdb WIN1!, {s2}
+ vldmdb WIN1!, {s3}
+ vldmia SRC0!, {s8-s9}
+ vmla.f s12, s4, s16 @ vector * vector
+ vldmia SRC0!, {s10-s11}
+ subs LEN, LEN, #8
+ vldmia WIN0!, {s16-s19}
+ vmul.f s24, s0, s8 @ vector * vector
+ vstmia DST0!, {s28-s29}
+ vldmdb SRC1!, {s20}
+ vldmdb SRC1!, {s21}
+ vldmdb SRC1!, {s22}
+ vldmdb SRC1!, {s23}
+ vmul.f s8, s16, s8 @ vector * vector
+ vstmia DST0!, {s30-s31}
+ vstmdb DST1!, {s12}
+ vstmdb DST1!, {s13}
+ vstmdb DST1!, {s14}
+ vstmdb DST1!, {s15}
+ vmls.f s24, s16, s20 @ vector * vector
+ vldmdb WIN1!, {s4}
+ vldmdb WIN1!, {s5}
+ vldmdb WIN1!, {s6}
+ vldmdb WIN1!, {s7}
+ vldmia SRC0!, {s12-s13}
+ vmla.f s8, s0, s20 @ vector * vector
+ vldmia SRC0!, {s14-s15}
+ bne 5b
+6: vldmia WIN0!, {s20-s23}
+ vmul.f s28, s4, s12 @ vector * vector
+ vstmia DST0!, {s24-s25}
+ vldmdb SRC1!, {s16}
+ vldmdb SRC1!, {s17}
+ vldmdb SRC1!, {s18}
+ vldmdb SRC1!, {s19}
+ vmul.f s12, s20, s12 @ vector * vector
+ vstmia DST0!, {s26-s27}
+ vstmdb DST1!, {s8}
+ vstmdb DST1!, {s9}
+ vstmdb DST1!, {s10}
+ vstmdb DST1!, {s11}
+ vmls.f s28, s20, s16 @ vector * vector
+ vmla.f s12, s4, s16 @ vector * vector
+ vstmia DST0!, {s28-s31}
+ vstmdb DST1!, {s12}
+ vstmdb DST1!, {s13}
+ vstmdb DST1!, {s14}
+ vstmdb DST1!, {s15}
+7:
+ fmxr FPSCR, OLDFPSCR
+ vpop {s16-s31}
+ pop {v1-v3,pc}
+
+ .unreq DST0
+ .unreq SRC0
+ .unreq SRC1
+ .unreq WIN0
+ .unreq LEN
+ .unreq OLDFPSCR
+ .unreq DST1
+ .unreq WIN1
+endfunc
+
+/**
+ * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
+ * Assume that len is a positive number and a multiple of 8
+ */
+@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+@ const float *src1, int len)
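+@
+@ For reference only: a hedged scalar C sketch of the operation, matching the
+@ vector_fmul_reverse_c behaviour named above (the name
+@ vector_fmul_reverse_ref is illustrative, not part of this file):
+@
+@     static void vector_fmul_reverse_ref(float *dst, const float *src0,
+@                                         const float *src1, int len)
+@     {
+@         for (int i = 0; i < len; i++)
+@             dst[i] = src0[i] * src1[len - 1 - i];   /* src1 read backwards */
+@     }
+@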
+function ff_vector_fmul_reverse_vfp, export=1
+ vpush {d8-d15}
+ add r2, r2, r3, lsl #2
+ vldmdb r2!, {s0-s3}
+ vldmia r1!, {s8-s11}
+ vldmdb r2!, {s4-s7}
+ vldmia r1!, {s12-s15}
+ vmul.f32 s8, s3, s8
+ vmul.f32 s9, s2, s9
+ vmul.f32 s10, s1, s10
+ vmul.f32 s11, s0, s11
+1:
+ subs r3, r3, #16
+ it ge
+ vldmdbge r2!, {s16-s19}
+ vmul.f32 s12, s7, s12
+ it ge
+ vldmiage r1!, {s24-s27}
+ vmul.f32 s13, s6, s13
+ it ge
+ vldmdbge r2!, {s20-s23}
+ vmul.f32 s14, s5, s14
+ it ge
+ vldmiage r1!, {s28-s31}
+ vmul.f32 s15, s4, s15
+ it ge
+ vmulge.f32 s24, s19, s24
+ it gt
+ vldmdbgt r2!, {s0-s3}
+ it ge
+ vmulge.f32 s25, s18, s25
+ vstmia r0!, {s8-s13}
+ it ge
+ vmulge.f32 s26, s17, s26
+ it gt
+ vldmiagt r1!, {s8-s11}
+ itt ge
+ vmulge.f32 s27, s16, s27
+ vmulge.f32 s28, s23, s28
+ it gt
+ vldmdbgt r2!, {s4-s7}
+ it ge
+ vmulge.f32 s29, s22, s29
+ vstmia r0!, {s14-s15}
+ ittt ge
+ vmulge.f32 s30, s21, s30
+ vmulge.f32 s31, s20, s31
+ vmulge.f32 s8, s3, s8
+ it gt
+ vldmiagt r1!, {s12-s15}
+ itttt ge
+ vmulge.f32 s9, s2, s9
+ vmulge.f32 s10, s1, s10
+ vstmiage r0!, {s24-s27}
+ vmulge.f32 s11, s0, s11
+ it ge
+ vstmiage r0!, {s28-s31}
+ bgt 1b
+
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+/**
+ * ARM VFP implementation of 'butterflies_float_c' function
+ * Assume that len is a positive non-zero number
+ */
+@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
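+@
+@ For reference only: a hedged scalar C sketch of the butterfly operation,
+@ matching the butterflies_float_c behaviour named above (the name
+@ butterflies_float_ref is illustrative, not part of this file):
+@
+@     static void butterflies_float_ref(float *restrict v1,
+@                                       float *restrict v2, int len)
+@     {
+@         for (int i = 0; i < len; i++) {
+@             float t = v1[i] - v2[i];   /* difference */
+@             v1[i]  += v2[i];           /* sum        */
+@             v2[i]   = t;
+@         }
+@     }
+@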
+function ff_butterflies_float_vfp, export=1
+BASE1 .req a1
+BASE2 .req a2
+LEN .req a3
+OLDFPSCR .req a4
+
+ vpush {s16-s31}
+ fmrx OLDFPSCR, FPSCR
+
+ tst LEN, #7
+ beq 4f @ common case: len is a multiple of 8
+
+ ldr ip, =0x03000000 @ RunFast mode, scalar mode
+ fmxr FPSCR, ip
+
+ tst LEN, #1
+ beq 1f
+ vldmia BASE1!, {s0}
+ vldmia BASE2!, {s8}
+ vadd.f s16, s0, s8
+ vsub.f s24, s0, s8
+ vstr s16, [BASE1, #0-4*1]
+ vstr s24, [BASE2, #0-4*1]
+1:
+ tst LEN, #2
+ beq 2f
+ vldmia BASE1!, {s0-s1}
+ vldmia BASE2!, {s8-s9}
+ vadd.f s16, s0, s8
+ vadd.f s17, s1, s9
+ vsub.f s24, s0, s8
+ vsub.f s25, s1, s9
+ vstr d8, [BASE1, #0-8*1] @ s16,s17
+ vstr d12, [BASE2, #0-8*1] @ s24,s25
+2:
+ tst LEN, #4
+ beq 3f
+ vldmia BASE1!, {s0-s1}
+ vldmia BASE2!, {s8-s9}
+ vldmia BASE1!, {s2-s3}
+ vldmia BASE2!, {s10-s11}
+ vadd.f s16, s0, s8
+ vadd.f s17, s1, s9
+ vsub.f s24, s0, s8
+ vsub.f s25, s1, s9
+ vadd.f s18, s2, s10
+ vadd.f s19, s3, s11
+ vsub.f s26, s2, s10
+ vsub.f s27, s3, s11
+ vstr d8, [BASE1, #0-16*1] @ s16,s17
+ vstr d12, [BASE2, #0-16*1] @ s24,s25
+ vstr d9, [BASE1, #8-16*1] @ s18,s19
+ vstr d13, [BASE2, #8-16*1] @ s26,s27
+3:
+ bics LEN, LEN, #7
+ beq 7f
+4:
+ ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
+ fmxr FPSCR, ip
+
+ vldmia BASE1!, {s0-s1}
+ vldmia BASE2!, {s8-s9}
+ vldmia BASE1!, {s2-s3}
+ vldmia BASE2!, {s10-s11}
+ vadd.f s16, s0, s8
+ vldmia BASE1!, {s4-s5}
+ vldmia BASE2!, {s12-s13}
+ vldmia BASE1!, {s6-s7}
+ vldmia BASE2!, {s14-s15}
+ vsub.f s24, s0, s8
+ vadd.f s20, s4, s12
+ subs LEN, LEN, #8
+ beq 6f
+5: vldmia BASE1!, {s0-s3}
+ vldmia BASE2!, {s8-s11}
+ vsub.f s28, s4, s12
+ vstr d8, [BASE1, #0-16*3] @ s16,s17
+ vstr d9, [BASE1, #8-16*3] @ s18,s19
+ vstr d12, [BASE2, #0-16*3] @ s24,s25
+ vstr d13, [BASE2, #8-16*3] @ s26,s27
+ vadd.f s16, s0, s8
+ vldmia BASE1!, {s4-s7}
+ vldmia BASE2!, {s12-s15}
+ vsub.f s24, s0, s8
+ vstr d10, [BASE1, #0-16*3] @ s20,s21
+ vstr d11, [BASE1, #8-16*3] @ s22,s23
+ vstr d14, [BASE2, #0-16*3] @ s28,s29
+ vstr d15, [BASE2, #8-16*3] @ s30,s31
+ vadd.f s20, s4, s12
+ subs LEN, LEN, #8
+ bne 5b
+6: vsub.f s28, s4, s12
+ vstr d8, [BASE1, #0-16*2] @ s16,s17
+ vstr d9, [BASE1, #8-16*2] @ s18,s19
+ vstr d12, [BASE2, #0-16*2] @ s24,s25
+ vstr d13, [BASE2, #8-16*2] @ s26,s27
+ vstr d10, [BASE1, #0-16*1] @ s20,s21
+ vstr d11, [BASE1, #8-16*1] @ s22,s23
+ vstr d14, [BASE2, #0-16*1] @ s28,s29
+ vstr d15, [BASE2, #8-16*1] @ s30,s31
+7:
+ fmxr FPSCR, OLDFPSCR
+ vpop {s16-s31}
+ bx lr
+
+ .unreq BASE1
+ .unreq BASE2
+ .unreq LEN
+ .unreq OLDFPSCR
+endfunc