summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm297
1 files changed, 297 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
new file mode 100644
index 0000000000..7283c32b8c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
@@ -0,0 +1,297 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro convolve_fn 1-2
+%ifidn %1, avg
+%define AUX_XMM_REGS 4
+%else
+%define AUX_XMM_REGS 0
+%endif
+%ifidn %2, highbd
+%define pavg pavgw
+cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+ dst, dst_stride, \
+ fx, fxs, fy, fys, w, h, bd
+%else
+%define pavg pavgb
+cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+ dst, dst_stride, \
+ fx, fxs, fy, fys, w, h
+%endif
+ mov r4d, dword wm
+%ifidn %2, highbd
+ shl r4d, 1
+ shl srcq, 1
+ shl src_strideq, 1
+ shl dstq, 1
+ shl dst_strideq, 1
+%else
+ cmp r4d, 4
+ je .w4
+%endif
+ cmp r4d, 8
+ je .w8
+ cmp r4d, 16
+ je .w16
+ cmp r4d, 32
+ je .w32
+
+ cmp r4d, 64
+ je .w64
+%ifidn %2, highbd
+ cmp r4d, 128
+ je .w128
+
+.w256:
+ mov r4d, dword hm
+.loop256:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ movu m0, [srcq+128]
+ movu m1, [srcq+128+16]
+ movu m2, [srcq+128+32]
+ movu m3, [srcq+128+48]
+%ifidn %1, avg
+ pavg m0, [dstq+128]
+ pavg m1, [dstq+128+16]
+ pavg m2, [dstq+128+32]
+ pavg m3, [dstq+128+48]
+%endif
+ mova [dstq+128 ], m0
+ mova [dstq+128+16], m1
+ mova [dstq+128+32], m2
+ mova [dstq+128+48], m3
+ movu m0, [srcq+128+64]
+ movu m1, [srcq+128+80]
+ movu m2, [srcq+128+96]
+ movu m3, [srcq+128+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+128+64]
+ pavg m1, [dstq+128+80]
+ pavg m2, [dstq+128+96]
+ pavg m3, [dstq+128+112]
+%endif
+ mova [dstq+128+64], m0
+ mova [dstq+128+80], m1
+ mova [dstq+128+96], m2
+ mova [dstq+128+112], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop256
+ RET
+%endif
+
+.w128:
+ mov r4d, dword hm
+.loop128:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop128
+ RET
+
+.w64:
+ mov r4d, dword hm
+.loop64:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop64
+ RET
+
+.w32:
+ mov r4d, dword hm
+.loop32:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+src_strideq]
+ movu m3, [srcq+src_strideq+16]
+ lea srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq +16]
+ pavg m2, [dstq+dst_strideq]
+ pavg m3, [dstq+dst_strideq+16]
+%endif
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+dst_strideq ], m2
+ mova [dstq+dst_strideq+16], m3
+ lea dstq, [dstq+dst_strideq*2]
+ sub r4d, 2
+ jnz .loop32
+ RET
+
+.w16:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop16:
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+dst_strideq]
+ pavg m2, [dstq+dst_strideq*2]
+ pavg m3, [dstq+r6q]
+%endif
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop16
+ RET
+
+.w8:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop8:
+ movh m0, [srcq]
+ movh m1, [srcq+src_strideq]
+ movh m2, [srcq+src_strideq*2]
+ movh m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ movh m4, [dstq]
+ movh m5, [dstq+dst_strideq]
+ movh m6, [dstq+dst_strideq*2]
+ movh m7, [dstq+r6q]
+ pavg m0, m4
+ pavg m1, m5
+ pavg m2, m6
+ pavg m3, m7
+%endif
+ movh [dstq ], m0
+ movh [dstq+dst_strideq ], m1
+ movh [dstq+dst_strideq*2], m2
+ movh [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop8
+ RET
+
+%ifnidn %2, highbd
+.w4:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop4:
+ movd m0, [srcq]
+ movd m1, [srcq+src_strideq]
+ movd m2, [srcq+src_strideq*2]
+ movd m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ movd m4, [dstq]
+ movd m5, [dstq+dst_strideq]
+ movd m6, [dstq+dst_strideq*2]
+ movd m7, [dstq+r6q]
+ pavg m0, m4
+ pavg m1, m5
+ pavg m2, m6
+ pavg m3, m7
+%endif
+ movd [dstq ], m0
+ movd [dstq+dst_strideq ], m1
+ movd [dstq+dst_strideq*2], m2
+ movd [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop4
+ RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+convolve_fn copy
+convolve_fn avg
+convolve_fn copy, highbd