Diffstat (limited to 'third_party/dav1d/src/x86/cdef_sse.asm')
-rw-r--r-- | third_party/dav1d/src/x86/cdef_sse.asm | 1357
1 file changed, 1357 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/cdef_sse.asm b/third_party/dav1d/src/x86/cdef_sse.asm new file mode 100644 index 0000000000..1b353121f4 --- /dev/null +++ b/third_party/dav1d/src/x86/cdef_sse.asm @@ -0,0 +1,1357 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; Copyright © 2019, VideoLabs +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +%macro DUP8 1-* + %rep %0 + times 8 db %1 + %rotate 1 + %endrep +%endmacro + +div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105 + dd 420, 210, 140, 105, 105, 105, 105, 105 +div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210 + dw 168, 168, 140, 140, 120, 120, 105, 105 + dw 420, 420, 210, 210, 140, 140, 105, 105 + dw 105, 105, 105, 105, 105, 105, 105, 105 +const shufw_6543210x, \ + db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 +shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pw_8: times 8 dw 8 +pw_128: times 8 dw 128 +pw_256: times 8 dw 256 +pw_2048: times 8 dw 2048 +pw_0x7FFF: times 8 dw 0x7FFF +pw_0x8000: times 8 dw 0x8000 +tap_table: ; masks for 8-bit shift emulation + DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80 + ; weights + DUP8 4, 2, 3, 3, 2, 1 + ; taps indices + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + db 1 * 16 + 0, 2 * 16 + 0 + db 1 * 16 + 0, 2 * 16 - 1 + ; the last 6 are repeats of the first 6 so we don't need to & 7 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + +SECTION .text + +%macro movif32 2 + %if ARCH_X86_32 + mov %1, %2 + %endif +%endmacro + +%macro PMOVZXBW 2-3 0 ; %3 = half + %if cpuflag(sse4) && %3 == 0 + pmovzxbw %1, %2 + %else + %if %3 == 1 + movd %1, %2 + %else + movq %1, %2 + %endif + punpcklbw %1, m7 + %endif +%endmacro + +%macro PSHUFB_0 2 + %if cpuflag(ssse3) + pshufb %1, %2 + %else + punpcklbw %1, %1 + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 + %endif +%endmacro + +%macro MOVDDUP 2 +%if cpuflag(ssse3) + movddup %1, %2 +%else + movq %1, %2 + punpcklqdq 
%1, %1 +%endif +%endmacro + +%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax + ; load p0/p1 + movsx offq, byte [dirq+kq+%1+14*8] ; off1 + %if %6 == 4 + movq m5, [stkq+offq*2+32*0] ; p0 + movhps m5, [stkq+offq*2+32*1] + %else + movu m5, [stkq+offq*2+32*0] ; p0 + %endif + neg offq ; -off1 + %if %6 == 4 + movq m6, [stkq+offq*2+32*0] ; p1 + movhps m6, [stkq+offq*2+32*1] + %else + movu m6, [stkq+offq*2+32*0] ; p1 + %endif + %if %7 + %if cpuflag(sse4) + ; out of bounds values are set to a value that is a both a large unsigned + ; value and a negative signed value. + ; use signed max and unsigned min to remove them + pmaxsw m7, m5 + pminuw m8, m5 + pmaxsw m7, m6 + pminuw m8, m6 + %else + pcmpeqw m3, m14, m5 + pminsw m8, m5 ; min after p0 + pandn m3, m5 + pmaxsw m7, m3 ; max after p0 + pcmpeqw m3, m14, m6 + pminsw m8, m6 ; min after p1 + pandn m3, m6 + pmaxsw m7, m3 ; max after p1 + %endif + %endif + + ; accumulate sum[m13] over p0/p1 + psubw m5, m4 ; diff_p0(p0 - px) + psubw m6, m4 ; diff_p1(p1 - px) + packsswb m5, m6 ; convert pixel diff to 8-bit + %if cpuflag(ssse3) + pshufb m5, m13 ; group diffs p0 and p1 into pairs + pabsb m6, m5 + psignb m3, %5, m5 + %else + movlhps m6, m5 + punpckhbw m6, m5 + pxor m5, m5 + pcmpgtb m5, m6 + paddb m6, m5 + pxor m6, m5 + paddb m3, %5, m5 + pxor m3, m5 + %endif + pand m9, %3, m6 ; emulate 8-bit shift + psrlw m9, %2 + psubusb m5, %4, m9 + pminub m5, m6 ; constrain(diff_p) + %if cpuflag(ssse3) + pmaddubsw m5, m3 ; constrain(diff_p) * taps + %else + psrlw m9, m5, 8 + psraw m6, m3, 8 + psllw m5, 8 + psllw m3, 8 + pmullw m9, m6 + pmulhw m5, m3 + paddw m5, m9 + %endif + paddw m0, m5 +%endmacro + +%macro LOAD_BODY 3 ; dst, src, block_width + %if %3 == 4 + PMOVZXBW m0, [%2+strideq*0] + PMOVZXBW m1, [%2+strideq*1] + PMOVZXBW m2, [%2+strideq*2] + PMOVZXBW m3, [%2+stride3q] + mova [%1+32*0], m0 + mova [%1+32*1], m1 + mova [%1+32*2], m2 + mova [%1+32*3], m3 + %else + movu m0, [%2+strideq*0] + movu m1, [%2+strideq*1] + movu m2, [%2+strideq*2] + movu m3, [%2+stride3q] + punpcklbw m4, m0, m7 + punpckhbw m0, m7 + mova [%1+32*0+ 0], m4 + mova [%1+32*0+16], m0 + punpcklbw m4, m1, m7 + punpckhbw m1, m7 + mova [%1+32*1+ 0], m4 + mova [%1+32*1+16], m1 + punpcklbw m4, m2, m7 + punpckhbw m2, m7 + mova [%1+32*2+ 0], m4 + mova [%1+32*2+16], m2 + punpcklbw m4, m3, m7 + punpckhbw m3, m7 + mova [%1+32*3+ 0], m4 + mova [%1+32*3+16], m3 + %endif +%endmacro + +%macro CDEF_FILTER_END 2 ; w, minmax + pxor m6, m6 + pcmpgtw m6, m0 + paddw m0, m6 + %if cpuflag(ssse3) + pmulhrsw m0, m15 + %else + paddw m0, m15 + psraw m0, 4 + %endif + paddw m4, m0 + %if %2 + pminsw m4, m7 + pmaxsw m4, m8 + %endif + packuswb m4, m4 + %if %1 == 4 + movd [dstq+strideq*0], m4 + psrlq m4, 32 + movd [dstq+strideq*1], m4 + add stkq, 32*2 + lea dstq, [dstq+strideq*2] + %else + movq [dstq], m4 + add stkq, 32 + add dstq, strideq + %endif +%endmacro + +%macro CDEF_FILTER 2 ; w, h + %if ARCH_X86_64 +cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \ + dst, stride, left, top, bot, pri, dst4, edge, \ + stride3 + %define px rsp+3*16+2*32 + %define base 0 + %else +cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ + dst, stride, left, edge, stride3 + %define topq r2 + %define botq r2 + %define dst4q r2 + LEA r5, tap_table + %define px esp+7*16+2*32 + %define base r5-tap_table + %endif + mov edged, r9m + %if cpuflag(sse4) + %define OUT_OF_BOUNDS_MEM [base+pw_0x8000] + %else + %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF] + %endif + mova m6, OUT_OF_BOUNDS_MEM + pxor m7, 
m7 + + ; prepare pixel buffers - body/right + %if %2 == 8 + lea dst4q, [dstq+strideq*4] + %endif + lea stride3q, [strideq*3] + test edgeb, 2 ; have_right + jz .no_right + LOAD_BODY px, dstq, %1 + %if %2 == 8 + LOAD_BODY px+4*32, dst4q, %1 + %endif + jmp .body_done +.no_right: + PMOVZXBW m0, [dstq+strideq*0], %1 == 4 + PMOVZXBW m1, [dstq+strideq*1], %1 == 4 + PMOVZXBW m2, [dstq+strideq*2], %1 == 4 + PMOVZXBW m3, [dstq+stride3q ], %1 == 4 + mova [px+32*0], m0 + mova [px+32*1], m1 + mova [px+32*2], m2 + mova [px+32*3], m3 + movd [px+32*0+%1*2], m6 + movd [px+32*1+%1*2], m6 + movd [px+32*2+%1*2], m6 + movd [px+32*3+%1*2], m6 + %if %2 == 8 + PMOVZXBW m0, [dst4q+strideq*0], %1 == 4 + PMOVZXBW m1, [dst4q+strideq*1], %1 == 4 + PMOVZXBW m2, [dst4q+strideq*2], %1 == 4 + PMOVZXBW m3, [dst4q+stride3q ], %1 == 4 + mova [px+32*4], m0 + mova [px+32*5], m1 + mova [px+32*6], m2 + mova [px+32*7], m3 + movd [px+32*4+%1*2], m6 + movd [px+32*5+%1*2], m6 + movd [px+32*6+%1*2], m6 + movd [px+32*7+%1*2], m6 + %endif +.body_done: + + ; top + movifnidn topq, r3mp + test edgeb, 4 ; have_top + jz .no_top + test edgeb, 1 ; have_left + jz .top_no_left + test edgeb, 2 ; have_right + jz .top_no_right + %if %1 == 4 + PMOVZXBW m0, [topq+strideq*0-2] + PMOVZXBW m1, [topq+strideq*1-2] + %else + movu m0, [topq+strideq*0-4] + movu m1, [topq+strideq*1-4] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movu [px-32*2+8], m2 + movu [px-32*1+8], m3 + %endif + movu [px-32*2-%1], m0 + movu [px-32*1-%1], m1 + jmp .top_done +.top_no_right: + %if %1 == 4 + PMOVZXBW m0, [topq+strideq*0-%1] + PMOVZXBW m1, [topq+strideq*1-%1] + movu [px-32*2-8], m0 + movu [px-32*1-8], m1 + %else + movu m0, [topq+strideq*0-%1] + movu m1, [topq+strideq*1-%2] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px-32*2-16], m0 + mova [px-32*2+ 0], m2 + mova [px-32*1-16], m1 + mova [px-32*1+ 0], m3 + %endif + movd [px-32*2+%1*2], m6 + movd [px-32*1+%1*2], m6 + jmp .top_done +.top_no_left: + test edgeb, 2 ; have_right + jz .top_no_left_right + %if %1 == 4 + PMOVZXBW m0, [topq+strideq*0] + PMOVZXBW m1, [topq+strideq*1] + %else + movu m0, [topq+strideq*0] + movu m1, [topq+strideq*1] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movd [px-32*2+16], m2 + movd [px-32*1+16], m3 + %endif + movd [px-32*2- 4], m6 + movd [px-32*1- 4], m6 + mova [px-32*2+ 0], m0 + mova [px-32*1+ 0], m1 + jmp .top_done +.top_no_left_right: + PMOVZXBW m0, [topq+strideq*0], %1 == 4 + PMOVZXBW m1, [topq+strideq*1], %1 == 4 + movd [px-32*2-4], m6 + movd [px-32*1-4], m6 + mova [px-32*2+0], m0 + mova [px-32*1+0], m1 + movd [px-32*2+%1*2], m6 + movd [px-32*1+%1*2], m6 + jmp .top_done +.no_top: + movu [px-32*2- 4], m6 + movu [px-32*1- 4], m6 + %if %1 == 8 + movq [px-32*2+12], m6 + movq [px-32*1+12], m6 + %endif +.top_done: + + ; left + test edgeb, 1 ; have_left + jz .no_left + movifnidn leftq, leftmp + %if %2 == 4 + movq m0, [leftq] + %else + movu m0, [leftq] + %endif + %if %2 == 4 + punpcklbw m0, m7 + %else + punpckhbw m1, m0, m7 + punpcklbw m0, m7 + movhlps m3, m1 + movd [px+32*4-4], m1 + movd [px+32*6-4], m3 + psrlq m1, 32 + psrlq m3, 32 + movd [px+32*5-4], m1 + movd [px+32*7-4], m3 + %endif + movhlps m2, m0 + movd [px+32*0-4], m0 + movd [px+32*2-4], m2 + psrlq m0, 32 + psrlq m2, 32 + movd [px+32*1-4], m0 + movd [px+32*3-4], m2 + jmp .left_done +.no_left: + movd [px+32*0-4], m6 + movd [px+32*1-4], m6 + movd [px+32*2-4], m6 + movd [px+32*3-4], m6 + %if %2 == 8 + movd [px+32*4-4], 
m6 + movd [px+32*5-4], m6 + movd [px+32*6-4], m6 + movd [px+32*7-4], m6 + %endif +.left_done: + + ; bottom + movifnidn botq, r4mp + test edgeb, 8 ; have_bottom + jz .no_bottom + test edgeb, 1 ; have_left + jz .bottom_no_left + test edgeb, 2 ; have_right + jz .bottom_no_right + %if %1 == 4 + PMOVZXBW m0, [botq+strideq*0-(%1/2)] + PMOVZXBW m1, [botq+strideq*1-(%1/2)] + %else + movu m0, [botq+strideq*0-4] + movu m1, [botq+strideq*1-4] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movu [px+32*(%2+0)+8], m2 + movu [px+32*(%2+1)+8], m3 + %endif + movu [px+32*(%2+0)-%1], m0 + movu [px+32*(%2+1)-%1], m1 + jmp .bottom_done +.bottom_no_right: + %if %1 == 4 + PMOVZXBW m0, [botq+strideq*0-4] + PMOVZXBW m1, [botq+strideq*1-4] + movu [px+32*(%2+0)-8], m0 + movu [px+32*(%2+1)-8], m1 + %else + movu m0, [botq+strideq*0-8] + movu m1, [botq+strideq*1-8] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px+32*(%2+0)-16], m0 + mova [px+32*(%2+0)+ 0], m2 + mova [px+32*(%2+1)-16], m1 + mova [px+32*(%2+1)+ 0], m3 + movd [px+32*(%2-1)+16], m6 ; overwritten by first mova + %endif + movd [px+32*(%2+0)+%1*2], m6 + movd [px+32*(%2+1)+%1*2], m6 + jmp .bottom_done +.bottom_no_left: + test edgeb, 2 ; have_right + jz .bottom_no_left_right + %if %1 == 4 + PMOVZXBW m0, [botq+strideq*0] + PMOVZXBW m1, [botq+strideq*1] + %else + movu m0, [botq+strideq*0] + movu m1, [botq+strideq*1] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px+32*(%2+0)+16], m2 + mova [px+32*(%2+1)+16], m3 + %endif + mova [px+32*(%2+0)+ 0], m0 + mova [px+32*(%2+1)+ 0], m1 + movd [px+32*(%2+0)- 4], m6 + movd [px+32*(%2+1)- 4], m6 + jmp .bottom_done +.bottom_no_left_right: + PMOVZXBW m0, [botq+strideq*0], %1 == 4 + PMOVZXBW m1, [botq+strideq*1], %1 == 4 + mova [px+32*(%2+0)+ 0], m0 + mova [px+32*(%2+1)+ 0], m1 + movd [px+32*(%2+0)+%1*2], m6 + movd [px+32*(%2+1)+%1*2], m6 + movd [px+32*(%2+0)- 4], m6 + movd [px+32*(%2+1)- 4], m6 + jmp .bottom_done +.no_bottom: + movu [px+32*(%2+0)- 4], m6 + movu [px+32*(%2+1)- 4], m6 + %if %1 == 8 + movq [px+32*(%2+0)+12], m6 + movq [px+32*(%2+1)+12], m6 + %endif +.bottom_done: + + ; actual filter + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec + mova m13, [shufb_lohi] + %if cpuflag(ssse3) + mova m15, [pw_2048] + %else + mova m15, [pw_8] + %endif + mova m14, m6 + %else + DEFINE_ARGS dst, pridmp, sec, damping, pri, tap + %xdefine m8 m1 + %xdefine m9 m2 + %xdefine m10 m0 + %xdefine m13 [base+shufb_lohi] + %xdefine m14 OUT_OF_BOUNDS_MEM + %if cpuflag(ssse3) + %xdefine m15 [base+pw_2048] + %else + %xdefine m15 [base+pw_8] + %endif + %endif + movifnidn prid, r5m + movifnidn secd, r6m + mov dampingd, r8m + movif32 [esp+0x3C], r1d + test prid, prid + jz .sec_only + movd m1, r5m + bsr pridmpd, prid + test secd, secd + jz .pri_only + movd m10, r6m + tzcnt secd, secd + and prid, 1 + sub pridmpd, dampingd + sub secd, dampingd + xor dampingd, dampingd + add prid, prid + neg pridmpd + cmovs pridmpd, dampingd + neg secd + PSHUFB_0 m1, m7 + PSHUFB_0 m10, m7 + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec + lea tapq, [tap_table] + MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask + MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask + mov [rsp+0x00], pridmpq ; pri_shift + mov [rsp+0x10], secq ; sec_shift + DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off + %else + MOVDDUP m2, [tapq+pridmpq*8] + MOVDDUP m3, [tapq+secq*8] + mov [esp+0x04], dampingd ; zero upper 32 bits of 
psrlw + mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP + mov [esp+0x00], pridmpd + mov [esp+0x30], secd + DEFINE_ARGS dst, stride, dir, stk, pri, tap, h + %define offq dstq + %define kd strided + %define kq strideq + mova [esp+0x10], m2 + mova [esp+0x40], m3 + mova [esp+0x20], m1 + mova [esp+0x50], m10 + %endif + mov dird, r7m + lea stkq, [px] + lea priq, [tapq+8*8+priq*8] ; pri_taps + mov hd, %1*%2/8 + lea dirq, [tapq+dirq*2] +.v_loop: + movif32 [esp+0x38], dstd + mov kd, 1 + %if %1 == 4 + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] + %else + mova m4, [stkq+32*0] ; px + %endif + pxor m0, m0 ; sum + mova m7, m4 ; max + mova m8, m4 ; min +.k_loop: + MOVDDUP m2, [priq+kq*8] + %if ARCH_X86_64 + ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1 + ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1 + %else + ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 + %endif + dec kd + jge .k_loop + movif32 dstq, [esp+0x38] + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 1 + dec hd + jg .v_loop + RET + +.pri_only: +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap + lea tapq, [tap_table] + %else + DEFINE_ARGS dst, pridmp, zero, damping, pri, tap + %endif + and prid, 1 + xor zerod, zerod + sub dampingd, pridmpd + cmovs dampingd, zerod + add prid, prid + PSHUFB_0 m1, m7 + MOVDDUP m7, [tapq+dampingq*8] + mov [rsp+0x00], dampingq + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off + %else + mov [rsp+0x04], zerod + DEFINE_ARGS dst, stride, dir, stk, pri, tap, h + %endif + mov dird, r7m + lea stkq, [px] + lea priq, [tapq+8*8+priq*8] + mov hd, %1*%2/8 + lea dirq, [tapq+dirq*2] +.pri_v_loop: + movif32 [esp+0x38], dstd + mov kd, 1 + %if %1 == 4 + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] + %else + mova m4, [stkq+32*0] + %endif + pxor m0, m0 +.pri_k_loop: + MOVDDUP m2, [priq+kq*8] + ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0 + dec kd + jge .pri_k_loop + movif32 dstq, [esp+0x38] + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 0 + dec hd + jg .pri_v_loop + RET + +.sec_only: +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec +%else + DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero +%endif + movd m1, r6m + tzcnt secd, secd + mov dird, r7m + xor zerod, zerod + sub dampingd, secd + cmovs dampingd, zerod + PSHUFB_0 m1, m7 + %if ARCH_X86_64 + lea tapq, [tap_table] + %else + mov [rsp+0x04], zerod + %endif + mov [rsp+0x00], dampingq + MOVDDUP m7, [tapq+dampingq*8] + lea dirq, [tapq+dirq*2] + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k + %else + DEFINE_ARGS dst, stride, off, stk, dir, tap, h + %endif + lea stkq, [px] + mov hd, %1*%2/8 +.sec_v_loop: + mov kd, 1 + %if %1 == 4 + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] + %else + mova m4, [stkq+32*0] + %endif + pxor m0, m0 +.sec_k_loop: + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0 + %if ARCH_X86_32 + MOVDDUP m2, [tapq+12*8+kq*8] + %endif + ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0 + dec kd + jge .sec_k_loop + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 0 + dec hd + jg .sec_v_loop + RET +%endmacro + +%macro MULLD 2 + %if cpuflag(sse4) + pmulld %1, %2 + %else + %if ARCH_X86_32 + %define m15 m1 + %endif + pmulhuw 
m15, %1, %2 + pmullw %1, %2 + pslld m15, 16 + paddd %1, m15 + %endif +%endmacro + +%macro CDEF_DIR 0 + %if ARCH_X86_64 +cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var + lea r6, [strideq*3] + movq m1, [srcq+strideq*0] + movhps m1, [srcq+strideq*1] + movq m3, [srcq+strideq*2] + movhps m3, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+r6 ] + + pxor m8, m8 + psadbw m9, m1, m8 + psadbw m2, m3, m8 + psadbw m4, m5, m8 + psadbw m6, m7, m8 + packssdw m9, m2 + packssdw m4, m6 + packssdw m9, m4 + + punpcklbw m0, m1, m8 + punpckhbw m1, m8 + punpcklbw m2, m3, m8 + punpckhbw m3, m8 + punpcklbw m4, m5, m8 + punpckhbw m5, m8 + punpcklbw m6, m7, m8 + punpckhbw m7, m8 +cglobal_label .main + mova m8, [pw_128] + psubw m0, m8 + psubw m1, m8 + psubw m2, m8 + psubw m3, m8 + psubw m4, m8 + psubw m5, m8 + psubw m6, m8 + psubw m7, m8 + psllw m8, 3 + psubw m9, m8 ; partial_sum_hv[0] + + paddw m8, m0, m1 + paddw m10, m2, m3 + paddw m8, m4 + paddw m10, m5 + paddw m8, m6 + paddw m10, m7 + paddw m8, m10 ; partial_sum_hv[1] + + pmaddwd m8, m8 + pmaddwd m9, m9 + phaddd m9, m8 + SWAP m8, m9 + MULLD m8, [div_table%+SUFFIX+48] + + pslldq m9, m1, 2 + psrldq m10, m1, 14 + pslldq m11, m2, 4 + psrldq m12, m2, 12 + pslldq m13, m3, 6 + psrldq m14, m3, 10 + paddw m9, m0 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 ; partial_sum_diag[0] top/right half + paddw m9, m11 ; partial_sum_diag[0] top/left half + pslldq m11, m4, 8 + psrldq m12, m4, 8 + pslldq m13, m5, 10 + psrldq m14, m5, 6 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 + pslldq m11, m6, 12 + psrldq m12, m6, 4 + pslldq m13, m7, 14 + psrldq m14, m7, 2 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 ; partial_sum_diag[0][0-7] + paddw m10, m14 ; partial_sum_diag[0][8-14,zero] + pshufb m10, [shufw_6543210x] + punpckhwd m11, m9, m10 + punpcklwd m9, m10 + pmaddwd m11, m11 + pmaddwd m9, m9 + MULLD m11, [div_table%+SUFFIX+16] + MULLD m9, [div_table%+SUFFIX+0] + paddd m9, m11 ; cost[0a-d] + + pslldq m10, m0, 14 + psrldq m11, m0, 2 + pslldq m12, m1, 12 + psrldq m13, m1, 4 + pslldq m14, m2, 10 + psrldq m15, m2, 6 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 + pslldq m12, m3, 8 + psrldq m13, m3, 8 + pslldq m14, m4, 6 + psrldq m15, m4, 10 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 + pslldq m12, m5, 4 + psrldq m13, m5, 12 + pslldq m14, m6, 2 + psrldq m15, m6, 14 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 ; partial_sum_diag[1][8-14,zero] + paddw m10, m7 ; partial_sum_diag[1][0-7] + pshufb m11, [shufw_6543210x] + punpckhwd m12, m10, m11 + punpcklwd m10, m11 + pmaddwd m12, m12 + pmaddwd m10, m10 + MULLD m12, [div_table%+SUFFIX+16] + MULLD m10, [div_table%+SUFFIX+0] + paddd m10, m12 ; cost[4a-d] + phaddd m9, m10 ; cost[0a/b,4a/b] + + paddw m10, m0, m1 + paddw m11, m2, m3 + paddw m12, m4, m5 + paddw m13, m6, m7 + phaddw m0, m4 + phaddw m1, m5 + phaddw m2, m6 + phaddw m3, m7 + + ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1) + pslldq m4, m11, 2 + psrldq m5, m11, 14 + pslldq m6, m12, 4 + psrldq m7, m12, 12 + pslldq m14, m13, 6 + psrldq m15, m13, 10 + paddw m4, m10 + paddw m5, m7 + paddw m4, m6 + paddw m5, m15 ; partial_sum_alt[3] right + paddw m4, m14 ; partial_sum_alt[3] left + pshuflw m6, m5, q3012 + punpckhwd m5, m4 + punpcklwd m4, m6 + pmaddwd m5, m5 + pmaddwd m4, m4 + MULLD m5, [div_table%+SUFFIX+48] + MULLD m4, [div_table%+SUFFIX+32] + paddd m4, m5 ; cost[7a-d] + + 
pslldq m5, m10, 6 + psrldq m6, m10, 10 + pslldq m7, m11, 4 + psrldq m10, m11, 12 + pslldq m11, m12, 2 + psrldq m12, 14 + paddw m5, m7 + paddw m6, m10 + paddw m5, m11 + paddw m6, m12 + paddw m5, m13 + pshuflw m7, m6, q3012 + punpckhwd m6, m5 + punpcklwd m5, m7 + pmaddwd m6, m6 + pmaddwd m5, m5 + MULLD m6, [div_table%+SUFFIX+48] + MULLD m5, [div_table%+SUFFIX+32] + paddd m5, m6 ; cost[5a-d] + + pslldq m6, m1, 2 + psrldq m7, m1, 14 + pslldq m10, m2, 4 + psrldq m11, m2, 12 + pslldq m12, m3, 6 + psrldq m13, m3, 10 + paddw m6, m0 + paddw m7, m11 + paddw m6, m10 + paddw m7, m13 ; partial_sum_alt[3] right + paddw m6, m12 ; partial_sum_alt[3] left + pshuflw m10, m7, q3012 + punpckhwd m7, m6 + punpcklwd m6, m10 + pmaddwd m7, m7 + pmaddwd m6, m6 + MULLD m7, [div_table%+SUFFIX+48] + MULLD m6, [div_table%+SUFFIX+32] + paddd m6, m7 ; cost[1a-d] + + pshufd m0, m0, q1032 + pshufd m1, m1, q1032 + pshufd m2, m2, q1032 + pshufd m3, m3, q1032 + + pslldq m10, m0, 6 + psrldq m11, m0, 10 + pslldq m12, m1, 4 + psrldq m13, m1, 12 + pslldq m14, m2, 2 + psrldq m2, 14 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m2 + paddw m10, m3 + pshuflw m12, m11, q3012 + punpckhwd m11, m10 + punpcklwd m10, m12 + pmaddwd m11, m11 + pmaddwd m10, m10 + MULLD m11, [div_table%+SUFFIX+48] + MULLD m10, [div_table%+SUFFIX+32] + paddd m10, m11 ; cost[3a-d] + + phaddd m9, m8 ; cost[0,4,2,6] + phaddd m6, m10 + phaddd m5, m4 + phaddd m6, m5 ; cost[1,3,5,7] + pshufd m4, m9, q3120 + + ; now find the best cost + %if cpuflag(sse4) + pmaxsd m9, m6 + pshufd m0, m9, q1032 + pmaxsd m0, m9 + pshufd m1, m0, q2301 + pmaxsd m0, m1 ; best cost + %else + pcmpgtd m0, m9, m6 + pand m9, m0 + pandn m0, m6 + por m9, m0 + pshufd m1, m9, q1032 + pcmpgtd m0, m9, m1 + pand m9, m0 + pandn m0, m1 + por m9, m0 + pshufd m1, m9, q2301 + pcmpgtd m0, m9, m1 + pand m9, m0 + pandn m0, m1 + por m0, m9 + %endif + + ; get direction and variance + punpckhdq m1, m4, m6 + punpckldq m4, m6 + psubd m2, m0, m1 + psubd m3, m0, m4 +%if WIN64 + WIN64_RESTORE_XMM + %define tmp rsp+stack_offset+8 +%else + %define tmp rsp-40 +%endif + mova [tmp+0x00], m2 ; emulate ymm in stack + mova [tmp+0x10], m3 + pcmpeqd m1, m0 ; compute best cost mask + pcmpeqd m4, m0 + packssdw m4, m1 + pmovmskb eax, m4 ; get byte-idx from mask + tzcnt eax, eax + mov r1d, [tmp+rax*2] ; get idx^4 complement from emulated ymm + shr eax, 1 ; get direction by converting byte-idx to word-idx + shr r1d, 10 + mov [varq], r1d + %else +cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3 +%define base r2-shufw_6543210x + LEA r2, shufw_6543210x + pxor m0, m0 + lea stride3q, [strideq*3] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+stride3q] + mova m1, [base+pw_128] + psadbw m2, m5, m0 + psadbw m3, m7, m0 + packssdw m2, m3 + punpcklbw m4, m5, m0 + punpckhbw m5, m0 + punpcklbw m6, m7, m0 + punpckhbw m7, m0 + psubw m4, m1 + psubw m5, m1 + psubw m6, m1 + psubw m7, m1 + + mova [esp+0x00], m4 + mova [esp+0x10], m5 + mova [esp+0x20], m6 + mova [esp+0x50], m7 + + lea srcq, [srcq+strideq*4] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+stride3q] + psadbw m3, m5, m0 + psadbw m0, m7 + packssdw m3, m0 + pxor m0, m0 + punpcklbw m4, m5, m0 + punpckhbw m5, m0 + punpcklbw m6, m7, m0 + punpckhbw m7, m0 +cglobal_label .main + psubw m4, m1 + psubw m5, m1 + psubw m6, m1 + psubw m7, m1 + packssdw m2, m3 + psllw m1, 3 + psubw m2, m1 ; partial_sum_hv[0] + pmaddwd m2, m2 + + mova m3, [esp+0x50] + mova m0, 
[esp+0x00] + paddw m0, [esp+0x10] + paddw m1, m3, [esp+0x20] + paddw m0, m4 + paddw m1, m5 + paddw m0, m6 + paddw m1, m7 + paddw m0, m1 ; partial_sum_hv[1] + pmaddwd m0, m0 + + phaddd m2, m0 + MULLD m2, [base+div_table%+SUFFIX+48] + mova [esp+0x30], m2 + + mova m1, [esp+0x10] + pslldq m0, m1, 2 + psrldq m1, 14 + paddw m0, [esp+0x00] + pslldq m2, m3, 6 + psrldq m3, 10 + paddw m0, m2 + paddw m1, m3 + mova m3, [esp+0x20] + pslldq m2, m3, 4 + psrldq m3, 12 + paddw m0, m2 ; partial_sum_diag[0] top/left half + paddw m1, m3 ; partial_sum_diag[0] top/right half + pslldq m2, m4, 8 + psrldq m3, m4, 8 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m5, 10 + psrldq m3, m5, 6 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m6, 12 + psrldq m3, m6, 4 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m7, 14 + psrldq m3, m7, 2 + paddw m0, m2 ; partial_sum_diag[0][0-7] + paddw m1, m3 ; partial_sum_diag[0][8-14,zero] + mova m3, [esp+0x50] + pshufb m1, [base+shufw_6543210x] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [base+div_table%+SUFFIX+16] + MULLD m0, [base+div_table%+SUFFIX+ 0] + paddd m0, m2 ; cost[0a-d] + mova [esp+0x40], m0 + + mova m1, [esp+0x00] + pslldq m0, m1, 14 + psrldq m1, 2 + paddw m0, m7 + pslldq m2, m3, 8 + psrldq m3, 8 + paddw m0, m2 + paddw m1, m3 + mova m3, [esp+0x20] + pslldq m2, m3, 10 + psrldq m3, 6 + paddw m0, m2 + paddw m1, m3 + mova m3, [esp+0x10] + pslldq m2, m3, 12 + psrldq m3, 4 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m4, 6 + psrldq m3, m4, 10 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m5, 4 + psrldq m3, m5, 12 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m6, 2 + psrldq m3, m6, 14 + paddw m0, m2 ; partial_sum_diag[1][0-7] + paddw m1, m3 ; partial_sum_diag[1][8-14,zero] + mova m3, [esp+0x50] + pshufb m1, [base+shufw_6543210x] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [base+div_table%+SUFFIX+16] + MULLD m0, [base+div_table%+SUFFIX+ 0] + paddd m0, m2 ; cost[4a-d] + phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b] + phaddd m1, [esp+0x30] ; cost[0,4,2,6] + mova [esp+0x30], m1 + + phaddw m0, [esp+0x00], m4 + phaddw m1, [esp+0x10], m5 + paddw m4, m5 + mova m2, [esp+0x20] + paddw m5, m2, m3 + phaddw m2, m6 + paddw m6, m7 + phaddw m3, m7 + mova m7, [esp+0x00] + paddw m7, [esp+0x10] + mova [esp+0x00], m0 + mova [esp+0x10], m1 + mova [esp+0x20], m2 + + pslldq m1, m4, 4 + pslldq m2, m6, 6 + pslldq m0, m5, 2 + paddw m1, m2 + paddw m0, m7 + psrldq m2, m5, 14 + paddw m0, m1 ; partial_sum_alt[3] left + psrldq m1, m4, 12 + paddw m1, m2 + psrldq m2, m6, 10 + paddw m1, m2 ; partial_sum_alt[3] right + pshuflw m1, m1, q3012 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] + paddd m0, m2 ; cost[7a-d] + mova [esp+0x40], m0 + + pslldq m0, m7, 6 + psrldq m7, 10 + pslldq m1, m5, 4 + psrldq m5, 12 + pslldq m2, m4, 2 + psrldq m4, 14 + paddw m0, m6 + paddw m7, m5 + paddw m0, m1 + paddw m7, m4 + paddw m0, m2 + pshuflw m2, m7, q3012 + punpckhwd m7, m0 + punpcklwd m0, m2 + pmaddwd m7, m7 + pmaddwd m0, m0 + MULLD m7, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] + paddd m0, m7 ; cost[5a-d] + mova [esp+0x50], m0 + + mova m7, [esp+0x10] + mova m2, [esp+0x20] + pslldq m0, m7, 2 + psrldq m7, 14 + pslldq m4, m2, 4 + psrldq m2, 12 + pslldq m5, m3, 6 + psrldq m6, m3, 10 + paddw m0, [esp+0x00] + paddw m7, m2 + paddw m4, m5 + paddw m7, m6 ; partial_sum_alt[3] right + paddw m0, m4 ; partial_sum_alt[3] left + pshuflw m2, m7, 
q3012 + punpckhwd m7, m0 + punpcklwd m0, m2 + pmaddwd m7, m7 + pmaddwd m0, m0 + MULLD m7, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] + paddd m0, m7 ; cost[1a-d] + SWAP m0, m4 + + pshufd m0, [esp+0x00], q1032 + pshufd m1, [esp+0x10], q1032 + pshufd m2, [esp+0x20], q1032 + pshufd m3, m3, q1032 + mova [esp+0x00], m4 + + pslldq m4, m0, 6 + psrldq m0, 10 + pslldq m5, m1, 4 + psrldq m1, 12 + pslldq m6, m2, 2 + psrldq m2, 14 + paddw m4, m3 + paddw m0, m1 + paddw m5, m6 + paddw m0, m2 + paddw m4, m5 + pshuflw m2, m0, q3012 + punpckhwd m0, m4 + punpcklwd m4, m2 + pmaddwd m0, m0 + pmaddwd m4, m4 + MULLD m0, [base+div_table%+SUFFIX+48] + MULLD m4, [base+div_table%+SUFFIX+32] + paddd m4, m0 ; cost[3a-d] + + mova m1, [esp+0x00] + mova m2, [esp+0x50] + mova m0, [esp+0x30] ; cost[0,4,2,6] + phaddd m1, m4 + phaddd m2, [esp+0x40] ; cost[1,3,5,7] + phaddd m1, m2 + pshufd m2, m0, q3120 + + ; now find the best cost + %if cpuflag(sse4) + pmaxsd m0, m1 + pshufd m3, m0, q1032 + pmaxsd m3, m0 + pshufd m0, m3, q2301 + pmaxsd m0, m3 + %else + pcmpgtd m3, m0, m1 + pand m0, m3 + pandn m3, m1 + por m0, m3 + pshufd m4, m0, q1032 + pcmpgtd m3, m0, m4 + pand m0, m3 + pandn m3, m4 + por m0, m3 + pshufd m4, m0, q2301 + pcmpgtd m3, m0, m4 + pand m0, m3 + pandn m3, m4 + por m0, m3 + %endif + + ; get direction and variance + mov vard, varm + punpckhdq m3, m2, m1 + punpckldq m2, m1 + psubd m1, m0, m3 + psubd m4, m0, m2 + mova [esp+0x00], m1 ; emulate ymm in stack + mova [esp+0x10], m4 + pcmpeqd m3, m0 ; compute best cost mask + pcmpeqd m2, m0 + packssdw m2, m3 + pmovmskb eax, m2 ; get byte-idx from mask + tzcnt eax, eax + mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm + shr eax, 1 ; get direction by converting byte-idx to word-idx + shr r1d, 10 + mov [vard], r1d + %endif + + RET +%endmacro + +INIT_XMM sse4 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 +CDEF_DIR + +INIT_XMM ssse3 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 +CDEF_DIR + +INIT_XMM sse2 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 |
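
For reference, the per-tap clamping that ACCUMULATE_TAP emulates with byte-saturating arithmetic (pand/psrlw for the byte shift, psubusb/pminub for the clamp, pmaddubsw for the tap weights) corresponds to the standard AV1 CDEF constrain() primitive; the weighted tap sums are then rounded back into the pixel with (sum + 8) >> 4, which CDEF_FILTER_END performs via pmulhrsw against pw_2048 on SSSE3+ and via paddw pw_8 / psraw 4 otherwise. Below is a minimal C sketch of that primitive, assuming the spec-level definition rather than dav1d's exact reference code; the shift argument is the precomputed max(0, damping - log2(strength)) that the bsr/tzcnt and cmovs sequences above set up.

    /* Illustrative sketch only -- not the exact dav1d reference implementation. */
    #include <stdlib.h>

    static int constrain(int diff, int strength, int shift)
    {
        if (!strength) return 0;
        int adiff = abs(diff);
        int limit = strength - (adiff >> shift);   /* taper off for large diffs */
        if (limit < 0) limit = 0;                  /* psubusb saturates at zero */
        int mag = adiff < limit ? adiff : limit;   /* min(|diff|, limit), cf. pminub */
        return diff < 0 ? -mag : mag;              /* reapply the sign of diff */
    }

Each filtered pixel accumulates constrain(p - px, strength, shift) * tap_weight over the primary and secondary tap positions indexed from tap_table, which is what the k_loop above computes eight (or four) pixels at a time.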