summaryrefslogtreecommitdiffstats
path: root/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
blob: 938fc173ffec1044db7d0872fb8b0026c9926e26 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

SECTION .text

;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 butterfly transform of 16-bit samples, done entirely in
; xmm0-xmm7 with no loops (the "for loop" comments below refer to the two
; passes, which are fully unrolled across SIMD lanes).
;
;   input  - four rows of four 16-bit values; each row is read with an
;            8-byte load and consecutive rows are `pitch` BYTES apart
;            (pitch is added directly to the byte pointer).
;   output - sixteen 16-bit results written with two movdqa stores, so the
;            address must be 16-byte aligned.
;   pitch  - 32-bit signed, sign-extended to 64 bits before use.
;
; Pass 1, for each input row ip[0..3] (16-bit arithmetic):
;     a1 = (ip[0] + ip[2]) << 2        d1 = (ip[1] + ip[3]) << 2
;     b1 = (ip[0] - ip[2]) << 2        c1 = (ip[1] - ip[3]) << 2
;     op[0] = a1 + d1 + (a1 != 0)      op[1] = b1 + c1
;     op[2] = b1 - c1                  op[3] = a1 - d1
;
; Pass 2, down each column of the pass-1 result r0..r3 (32-bit arithmetic):
;     a1 = r0 + r2    d1 = r1 + r3    b1 = r0 - r2    c1 = r1 - r3
;     a2 = a1 + d1    b2 = b1 + c1    c2 = b1 - c1    d2 = a1 - d1
;     each x in {a2,b2,c2,d2} becomes (x + (x < 0) + 3) >> 3 (arithmetic shift)
;   The results are packed back to 16 bits with signed saturation and stored
;   as output rows a2, b2, c2, d2 in that order.
;
; Lane comments on the instructions list elements from the HIGHEST lane to
; the lowest.  In pass 2 the suffix "xy" means stage x, column y.
;
; NOTE(review): SHADOW_ARGS_TO_STACK / SAVE_XMM / GET_GOT / arg() / GLOBAL()
; come from x86_abi_support.asm and are not visible here; presumably they
; hide the ABI differences (argument location, callee-saved xmm6/xmm7,
; PIC data addressing) - confirm against that file.
globalsym(vp8_short_walsh4x4_sse2)
sym(vp8_short_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov     rsi, arg(0)           ; input
    mov     rdi, arg(1)           ; output
    movsxd  rdx, dword ptr arg(2) ; pitch

    ; first for loop
    ; Load the four input rows (4 words each) into the low halves of xmm0-3.
    movq    xmm0, MMWORD PTR [rsi]           ; load input
    movq    xmm1, MMWORD PTR [rsi + rdx]
    lea     rsi,  [rsi + rdx*2]              ; advance two rows
    movq    xmm2, MMWORD PTR [rsi]
    movq    xmm3, MMWORD PTR [rsi + rdx]

    ; Transpose 4x4: interleave rows 0/1 and rows 2/3 word-wise, then
    ; interleave the results dword-wise so that each 64-bit half holds one
    ; input column (the same ip[k] taken from all four rows).
    punpcklwd xmm0,  xmm1
    punpcklwd xmm2,  xmm3

    movdqa    xmm1, xmm0
    punpckldq xmm0, xmm2           ; ip[1] ip[0]
    punpckhdq xmm1, xmm2           ; ip[3] ip[2]

    ; Sum and difference of the column pairs, for all four rows at once.
    movdqa    xmm2, xmm0
    paddw     xmm0, xmm1
    psubw     xmm2, xmm1

    psllw     xmm0, 2              ; d1  a1
    psllw     xmm2, 2              ; c1  b1

    ; Regroup so the second butterfly pairs a1 with d1 and b1 with c1.
    movdqa    xmm1, xmm0
    punpcklqdq xmm0, xmm2          ; b1  a1
    punpckhqdq xmm1, xmm2          ; c1  d1

    ; Build the (a1 != 0) correction: xmm6 = a1 in the low four words with
    ; the upper four words zero.  pcmpeqw against zero gives 0xffff (-1)
    ; wherever xmm6 is zero; adding the all-ones-words constant c1 turns
    ; that into 0 for zero lanes and 1 for non-zero lanes.  The upper four
    ; lanes are always zero in xmm6, so they always end up as 0.
    pxor      xmm6, xmm6
    movq      xmm6, xmm0
    pxor      xmm7, xmm7
    pcmpeqw   xmm7, xmm6
    paddw     xmm7, [GLOBAL(c1)]

    movdqa    xmm2, xmm0
    paddw     xmm0, xmm1           ; b1+c1  a1+d1
    psubw     xmm2, xmm1           ; b1-c1  a1-d1
    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)

    ; second for loop
    ; The numbers below are element indices (row*4 + k) of the pass-1 result.
    ; input: 13  9  5  1 12  8  4  0 (xmm0)
    ;        14 10  6  2 15 11  7  3 (xmm2)
    ; after shuffle:
    ;        13  5  9  1 12  4  8  0 (xmm0)
    ;        14  6 10  2 15  7 11  3 (xmm1)
    ; 0xd8 selects words in the order 0,2,1,3 within each 64-bit half, which
    ; puts rows (0,2) and rows (1,3) of each column into adjacent word pairs
    ; ready for pmaddwd.
    pshuflw   xmm3, xmm0, 0xd8
    pshufhw   xmm0, xmm3, 0xd8
    pshuflw   xmm3, xmm2, 0xd8
    pshufhw   xmm1, xmm3, 0xd8

    ; pmaddwd with (1,1) yields the 32-bit sum of each adjacent word pair
    ; (a1, d1); with (1,-1) it yields the 32-bit difference (b1, c1).
    movdqa    xmm2, xmm0
    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
    movdqa    xmm3, xmm1
    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13

    ; Gather like terms into the same 64-bit half.  0xd8 reorders dwords as
    ; 0,2,1,3; 0x72 reorders them as 2,0,3,1, which also swaps columns 3
    ; and 2 back into ascending order.
    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12

    ; Line up (a,b) against (d,c) so one add and one subtract produce all
    ; four second-stage terms.
    movdqa    xmm0, xmm4
    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
    movdqa    xmm1, xmm6
    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12

    movdqa    xmm2, xmm0
    paddd     xmm0, xmm4            ; b21 b20 a21 a20
    psubd     xmm2, xmm4            ; c21 c20 d21 d20
    movdqa    xmm3, xmm1
    paddd     xmm1, xmm6            ; b23 b22 a23 a22
    psubd     xmm3, xmm6            ; c23 c22 d23 d22

    ; Rounding: compute (x < 0) per dword.  pcmpgtd 0 > x gives all-ones for
    ; negative lanes; masking with cd1 reduces that to 1.
    pxor      xmm4, xmm4
    movdqa    xmm5, xmm4
    pcmpgtd   xmm4, xmm0
    pcmpgtd   xmm5, xmm2
    pand      xmm4, [GLOBAL(cd1)]
    pand      xmm5, [GLOBAL(cd1)]

    pxor      xmm6, xmm6
    movdqa    xmm7, xmm6
    pcmpgtd   xmm6, xmm1
    pcmpgtd   xmm7, xmm3
    pand      xmm6, [GLOBAL(cd1)]
    pand      xmm7, [GLOBAL(cd1)]

    ; x += (x < 0); x += 3
    paddd     xmm0, xmm4
    paddd     xmm2, xmm5
    paddd     xmm0, [GLOBAL(cd3)]
    paddd     xmm2, [GLOBAL(cd3)]
    paddd     xmm1, xmm6
    paddd     xmm3, xmm7
    paddd     xmm1, [GLOBAL(cd3)]
    paddd     xmm3, [GLOBAL(cd3)]

    ; x >>= 3 (arithmetic, sign-preserving)
    psrad     xmm0, 3
    psrad     xmm1, 3
    psrad     xmm2, 3
    psrad     xmm3, 3

    ; Collect each output row (all four columns of a2, b2, c2, d2) into its
    ; own register.  Note the c/d pair uses high-then-low because xmm2/xmm3
    ; hold d in the low half and c in the high half.
    movdqa    xmm4, xmm0
    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
    movdqa    xmm5, xmm2
    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20

    ; Narrow 32-bit results to 16 bits with signed saturation.
    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20

    ; Aligned stores: output[0..7] then output[8..15].
    movdqa  XMMWORD PTR [rdi], xmm0
    movdqa  XMMWORD PTR [rdi + 16], xmm2

    ; begin epilog
    ; Undo the prolog in reverse order.
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA

; Each table is one 16-byte vector, aligned so it can be used directly as a
; memory operand of the SSE2 instructions in vp8_short_walsh4x4_sse2.

; Eight words of +1: pmaddwd multiplier for pairwise sums, and the +1 used
; when forming the (a1 != 0) correction.
align 16
c1:
    times 8 dw 0x0001

; Alternating +1 / -1 words: pmaddwd multiplier for pairwise differences.
align 16
cn1:
    times 4 dw 0x0001, 0xffff

; Four dwords of 1: mask that turns a pcmpgtd result into 0 or 1.
align 16
cd1:
    times 4 dd 0x00000001

; Four dwords of 3: rounding bias added before the arithmetic shift by 3.
align 16
cd3:
    times 4 dd 0x00000003