Diffstat (limited to 'third_party/dav1d/src/x86/cdef_avx2.asm')
-rw-r--r--  third_party/dav1d/src/x86/cdef_avx2.asm  1772
1 file changed, 1772 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/cdef_avx2.asm b/third_party/dav1d/src/x86/cdef_avx2.asm
new file mode 100644
index 0000000000..1f30f8a3b7
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef_avx2.asm
@@ -0,0 +1,1772 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+%macro JMP_TABLE 2-*
+ %xdefine %1_jmptable %%table
+ %xdefine %%base mangle(private_prefix %+ _%1_avx2)
+ %%table:
+ %rep %0 - 1
+ dd %%base %+ .%2 - %%table
+ %rotate 1
+ %endrep
+%endmacro
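+; each entry stores a label as a 32-bit offset relative to the table itself,
+; keeping the table position-independent; the filter rebuilds the absolute
+; address at run time (movsxd of the entry + add of the table base, see
+; ACCUMULATE_TAP_BYTE) before calling it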
+
+%macro CDEF_FILTER_JMP_TABLE 1
+JMP_TABLE cdef_filter_%1_8bpc, \
+ d6k0, d6k1, d7k0, d7k1, \
+ d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
+ d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
+ d0k0, d0k1, d1k0, d1k1
+%endmacro
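+; 16 core entries (8 directions x k=0/1) padded with 4 duplicate entries on
+; either side, so the dir+2 and dir-2 lookups in the filter never have to
+; wrap the direction index modulo 8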
+
+SECTION_RODATA 32
+
+pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
+blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
+ dd 0x80, 0x00, 0x00
+blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ dd 0x00, 0x00
+blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+ dd 0x0000
+blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+ dd 0x0000, 0x0000
+blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
+blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
+div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
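+; 840/n for n = 1..8 followed by 840/(2n) for n = 1..4; 840 = lcm(1..8), so
+; cdef_dir can score cost[d] += partial_sum^2 * (840/len) in pure integers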
+shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pw_128: times 2 dw 128
+pw_2048: times 2 dw 2048
+tap_table: ; masks for 8 bit shifts
+ db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
+ ; weights
+ db 4, 2, 3, 3, 2, 1
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
+ db 1 * 16 + 0, 2 * 16 + 0
+ db 1 * 16 + 0, 2 * 16 - 1
+ ; the last 6 are repeats of the first 6 so we don't need to & 7
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
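+; layout (byte offsets from tap_table): 0-7 shift masks (0xff >> s),
+; 8-11 the pri tap pairs {4,2}/{3,3} picked by pri_strength & 1, 12-13 the
+; sec taps {2,1}, 14.. the direction offset pairs above, encoded as y*16 + x
+; word offsets into the border path's 32-byte-stride (16-word) row buffer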
+
+CDEF_FILTER_JMP_TABLE 4x4
+CDEF_FILTER_JMP_TABLE 4x8
+CDEF_FILTER_JMP_TABLE 8x8
+
+SECTION .text
+
+%macro PREP_REGS 2 ; w, h
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r7m
+ lea tableq, [cdef_filter_%1x%2_8bpc_jmptable]
+ lea dirq, [tableq+dirq*2*4]
+%if %1 == 4
+ %if %2 == 4
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
+ table, dir, dirjmp, stride3, k
+ %else
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
+ table, dir, dirjmp, dst4, stride3, k
+ lea dst4q, [dstq+strideq*4]
+ %endif
+%else
+ DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
+ table, dir, dirjmp, top2, stride3, k
+ mov hq, -8
+ lea top1q, [top1q+strideq*0]
+ lea top2q, [top1q+strideq*1]
+%endif
+%if %1 == 4
+ lea stride3q, [strideq*3]
+%endif
+%endmacro
+
+%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+ mov kd, 1
+ pxor m15, m15 ; sum
+%if %2 == 8
+ pxor m12, m12
+ %if %1 == 4
+ movd xm4, [dstq +strideq*0]
+ movd xm6, [dstq +strideq*1]
+ movd xm5, [dstq +strideq*2]
+ movd xm7, [dstq +stride3q ]
+ vinserti128 m4, [dst4q+strideq*0], 1
+ vinserti128 m6, [dst4q+strideq*1], 1
+ vinserti128 m5, [dst4q+strideq*2], 1
+ vinserti128 m7, [dst4q+stride3q ], 1
+ punpckldq m4, m6
+ punpckldq m5, m7
+ %else
+ movq xm4, [dstq+strideq*0]
+ movq xm5, [dstq+strideq*1]
+ vinserti128 m4, [dstq+strideq*2], 1
+ vinserti128 m5, [dstq+stride3q ], 1
+ %endif
+ punpcklqdq m4, m5
+%else
+ movd xm4, [dstq+strideq*0]
+ movd xm5, [dstq+strideq*1]
+ vinserti128 m4, [dstq+strideq*2], 1
+ vinserti128 m5, [dstq+stride3q ], 1
+ punpckldq m4, m5
+%endif
+%if %3 == 1
+    mova            m7, m4                      ; max
+    mova            m8, m4                      ; min
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, h, clip
+ ; load p0/p1
+ movsxd dirjmpq, [dirq+kq*4+%1*2*4]
+ add dirjmpq, tableq
+ call dirjmpq
+
+%if %8 == 1
+ pmaxub m7, m5
+ pminub m8, m5
+ pmaxub m7, m6
+ pminub m8, m6
+%endif
+
+ ; accumulate sum[m15] over p0/p1
+%if %7 == 4
+ punpcklbw m5, m6
+ punpcklbw m6, m4, m4
+ psubusb m9, m5, m6
+ psubusb m5, m6, m5
+ por m9, m5 ; abs_diff_p01(p01 - px)
+ pcmpeqb m5, m9
+ por m5, %5
+ psignb m6, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+ pminub m5, m9
+ pmaddubsw m5, m6
+ paddw m15, m5
+%else
+ psubusb m9, m5, m4
+ psubusb m5, m4, m5
+ psubusb m11, m6, m4
+ psubusb m6, m4, m6
+ por m9, m5 ; abs_diff_p0(p0 - px)
+ por m11, m6 ; abs_diff_p1(p1 - px)
+ pcmpeqb m5, m9
+ pcmpeqb m6, m11
+ punpckhbw m10, m9, m11
+ punpcklbw m9, m11
+ por m5, %5
+ por m11, m6, %5
+ punpckhbw m6, m5, m11
+ punpcklbw m5, m11
+ psignb m11, %5, m6
+ psrlw m6, m10, %2 ; emulate 8-bit shift
+ pand m6, %3
+ psubusb m6, %4, m6
+ pminub m6, m10
+ pmaddubsw m6, m11
+ paddw m12, m6
+ psignb m11, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+ pminub m5, m9
+ pmaddubsw m5, m11
+ paddw m15, m5
+%endif
+%endmacro
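+; the psrlw/pand pairs above emulate a per-byte shift (AVX2 has no 8-bit
+; SIMD shift): shift the 16-bit lanes, then mask off the bits dragged in
+; from the neighboring byte using the 0xff >> s masks from tap_table.
+; per tap this computes the usual cdef constraint; roughly, as an editor's
+; scalar sketch (not dav1d code):
+;   adiff = abs(p - px)
+;   sum  += tap * sign(p - px) * min(adiff, max(0, strength - (adiff >> shift)))
+; psubusb provides the max(0, ..) clamp, pminub the min(..), and pmaddubsw
+; multiplies the constrained unsigned bytes with the signed taps in pairs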
+
+%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
+%if %2 == 4
+ %if %5 == 1
+ punpcklbw m4, %3
+ %endif
+ pcmpgtw %3, m15
+ paddw m15, %3
+ pmulhrsw m15, %4
+ %if %5 == 0
+ packsswb m15, m15
+ paddb m4, m15
+ %else
+ paddw m4, m15
+ packuswb m4, m4 ; clip px in [0x0,0xff]
+ pminub m4, m7
+ pmaxub m4, m8
+ %endif
+ vextracti128 xm5, m4, 1
+ movd [dstq+strideq*0], xm4
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+strideq*1], xm4, 1
+ pextrd [dstq+stride3q ], xm5, 1
+%else
+ pcmpgtw m6, %3, m12
+ pcmpgtw m5, %3, m15
+ paddw m12, m6
+ paddw m15, m5
+ %if %5 == 1
+ punpckhbw m5, m4, %3
+ punpcklbw m4, %3
+ %endif
+ pmulhrsw m12, %4
+ pmulhrsw m15, %4
+ %if %5 == 0
+ packsswb m15, m12
+ paddb m4, m15
+ %else
+ paddw m5, m12
+ paddw m4, m15
+ packuswb m4, m5 ; clip px in [0x0,0xff]
+ pminub m4, m7
+ pmaxub m4, m8
+ %endif
+ vextracti128 xm5, m4, 1
+ %if %1 == 4
+ movd [dstq +strideq*0], xm4
+ movd [dst4q+strideq*0], xm5
+ pextrd [dstq +strideq*1], xm4, 1
+ pextrd [dst4q+strideq*1], xm5, 1
+ pextrd [dstq +strideq*2], xm4, 2
+ pextrd [dst4q+strideq*2], xm5, 2
+ pextrd [dstq +stride3q ], xm4, 3
+ pextrd [dst4q+stride3q ], xm5, 3
+ %else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm5
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+stride3q ], xm5
+ %endif
+%endif
+%endmacro
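+; pmulhrsw with pw_2048 computes (sum*2048 + 0x4000) >> 15 = (sum + 8) >> 4,
+; i.e. the cdef rounding shift; the preceding pcmpgtw/paddw subtracts 1 from
+; negative sums so the shift rounds toward zero:
+;   dst = clip(px + ((sum - (sum < 0) + 8) >> 4))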
+
+%macro BORDER_PREP_REGS 2 ; w, h
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r7m
+ lea dirq, [tableq+dirq*2+14]
+%if %1*%2*2/mmsize > 1
+ %if %1 == 4
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
+ %else
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
+ %endif
+ mov hd, %1*%2*2/mmsize
+%else
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
+%endif
+ lea stkq, [px]
+ pxor m11, m11
+%endmacro
+
+%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+ mov kd, 1
+%if %1 == 4
+ movq xm4, [stkq+32*0]
+ movhps xm4, [stkq+32*1]
+ movq xm5, [stkq+32*2]
+ movhps xm5, [stkq+32*3]
+ vinserti128 m4, xm5, 1
+%else
+ mova xm4, [stkq+32*0] ; px
+ vinserti128 m4, [stkq+32*1], 1
+%endif
+ pxor m15, m15 ; sum
+%if %3 == 1
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, clip
+ ; load p0/p1
+ movsx offq, byte [dirq+kq+%1] ; off1
+%if %6 == 4
+ movq xm5, [stkq+offq*2+32*0] ; p0
+ movq xm6, [stkq+offq*2+32*2]
+ movhps xm5, [stkq+offq*2+32*1]
+ movhps xm6, [stkq+offq*2+32*3]
+ vinserti128 m5, xm6, 1
+%else
+ movu xm5, [stkq+offq*2+32*0] ; p0
+ vinserti128 m5, [stkq+offq*2+32*1], 1
+%endif
+ neg offq ; -off1
+%if %6 == 4
+ movq xm6, [stkq+offq*2+32*0] ; p1
+ movq xm9, [stkq+offq*2+32*2]
+ movhps xm6, [stkq+offq*2+32*1]
+ movhps xm9, [stkq+offq*2+32*3]
+ vinserti128 m6, xm9, 1
+%else
+ movu xm6, [stkq+offq*2+32*0] ; p1
+ vinserti128 m6, [stkq+offq*2+32*1], 1
+%endif
+%if %7 == 1
+    ; out of bounds values are set to a value that is both a large unsigned
+ ; value and a negative signed value.
+ ; use signed max and unsigned min to remove them
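+    ; (0x8000 is -32768 as a signed word but 32768 unsigned, so pmaxsw never
+    ; selects it as a maximum and pminuw never selects it as a minimum)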
+ pmaxsw m7, m5 ; max after p0
+ pminuw m8, m5 ; min after p0
+ pmaxsw m7, m6 ; max after p1
+ pminuw m8, m6 ; min after p1
+%endif
+
+ ; accumulate sum[m15] over p0/p1
+ ; calculate difference before converting
+ psubw m5, m4 ; diff_p0(p0 - px)
+ psubw m6, m4 ; diff_p1(p1 - px)
+
+ ; convert to 8-bits with signed saturation
+ ; saturating to large diffs has no impact on the results
+ packsswb m5, m6
+
+ ; group into pairs so we can accumulate using maddubsw
+ pshufb m5, m12
+ pabsb m9, m5
+ psignb m10, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+
+ ; use unsigned min since abs diff can equal 0x80
+ pminub m5, m9
+ pmaddubsw m5, m10
+ paddw m15, m5
+%endmacro
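+; same constraint math as ACCUMULATE_TAP_BYTE, but on 16-bit pixels: the
+; word diffs are packed back to bytes with signed saturation (harmless, per
+; the comment above) and shufb_lohi interleaves the p0/p1 diffs so a single
+; pmaddubsw accumulates tap*constrain(p0-px) + tap*constrain(p1-px) per word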
+
+%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
+ pcmpgtw m9, m11, m15
+ paddw m15, m9
+ pmulhrsw m15, %2
+ paddw m4, m15
+%if %3 == 1
+ pminsw m4, m7
+ pmaxsw m4, m8
+%endif
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+%if %1 == 4
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+stride3q ], xm5, 1
+%else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+%endif
+%endmacro
+
+%macro CDEF_FILTER 2 ; w, h
+INIT_YMM avx2
+cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%assign stack_offset_entry stack_offset
+ mov edged, edgem
+ cmp edged, 0xf
+ jne .border_block
+
+ PUSH r11
+ PUSH r12
+%if %2 == 4
+%assign regs_used 13
+ ALLOC_STACK 0x60, 16
+ pmovzxbw xm0, [leftq+1]
+ vpermq m0, m0, q0110
+ psrldq m1, m0, 4
+ vpalignr m2, m0, m0, 12
+ movu [rsp+0x10], m0
+ movu [rsp+0x28], m1
+ movu [rsp+0x40], m2
+%elif %1 == 4
+%assign regs_used 14
+ PUSH r13
+ ALLOC_STACK 8*2+%1*%2*1, 16
+ pmovzxwd m0, [leftq]
+ mova [rsp+0x10], m0
+%else
+%assign regs_used 15
+ PUSH r13
+ PUSH r14
+ ALLOC_STACK 8*4+%1*%2*2+32, 16
+ lea r11, [strideq*3]
+ movu xm4, [dstq+strideq*2]
+ pmovzxwq m0, [leftq+0]
+ pmovzxwq m1, [leftq+8]
+ vinserti128 m4, [dstq+r11], 1
+ pmovzxbd m2, [leftq+1]
+ pmovzxbd m3, [leftq+9]
+ mov [rsp+16], botq
+ mova [rsp+0x20], m0
+ mova [rsp+0x40], m1
+ mova [rsp+0x60], m2
+ mova [rsp+0x80], m3
+ mova [rsp+0xa0], m4
+ lea botq, [dstq+strideq*4]
+%endif
+
+ DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
+ mov dampingd, r8m
+ xor zerod, zerod
+ movifnidn prid, prim
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ test prid, prid
+ jz .sec_only
+ movd xm0, prid
+ lzcnt pridmpd, prid
+ add pridmpd, dampingd
+ cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ test secdmpd, secdmpd
+ jz .pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+
+ DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir
+ vpbroadcastb m0, xm0 ; pri_strength
+ vpbroadcastb m1, xm1 ; sec_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ lea secq, [tableq+12] ; sec_taps
+
+ PREP_REGS %1, %2
+%if %1*%2 > mmsize
+.v_loop:
+%endif
+ LOAD_BLOCK %1, %2, 1
+.k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
+ dec kq
+ jge .k_loop
+
+ vpbroadcastd m10, [pw_2048]
+ pxor m9, m9
+ ADJUST_PIXEL %1, %2, m9, m10, 1
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .v_loop
+%endif
+ RET
+
+.pri_only:
+ DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m3, [pw_2048]
+ pxor m1, m1
+%if %1*%2 > mmsize
+.pri_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
+ dec kq
+ jge .pri_k_loop
+ ADJUST_PIXEL %1, %2, m1, m3
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .pri_v_loop
+%endif
+ RET
+
+.sec_only:
+ DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m2, [pw_2048]
+ pxor m0, m0
+%if %1*%2 > mmsize
+.sec_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
+ dec kq
+ jge .sec_k_loop
+ ADJUST_PIXEL %1, %2, m0, m2
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .sec_v_loop
+%endif
+ RET
+
+.d0k0:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m6, [dstq+strideq*1-1]
+ vpbroadcastq m10, [dstq+strideq*2-1]
+ movd xm5, [topq+strideq*1+1]
+ movd xm9, [dstq+strideq*0+1]
+ psrldq m11, m6, 2
+ psrldq m12, m10, 2
+ vinserti128 m6, [dstq+stride3q -1], 1
+ vinserti128 m10, [botq -1], 1
+ vpblendd m5, m11, 0x10
+ vpblendd m9, m12, 0x10
+ movu m11, [blend_4x4+16]
+ punpckldq m6, m10
+ punpckldq m5, m9
+ vpblendvb m6, [rsp+gprsize+0x28], m11
+ %else
+ movd xm5, [topq +strideq*1+1]
+ movq xm6, [dstq +strideq*1-1]
+ movq xm10, [dstq +stride3q -1]
+ movq xm11, [dst4q+strideq*1-1]
+ pinsrd xm5, [dstq +strideq*0+1], 1
+ movhps xm6, [dstq +strideq*2-1]
+ movhps xm10, [dst4q+strideq*0-1]
+ movhps xm11, [dst4q+strideq*2-1]
+ psrldq xm9, xm6, 2
+ shufps xm5, xm9, q2010 ; -1 +0 +1 +2
+ shufps xm6, xm10, q2020 ; +1 +2 +3 +4
+ psrldq xm9, xm11, 2
+ psrldq xm10, 2
+ shufps xm10, xm9, q2020 ; +3 +4 +5 +6
+ movd xm9, [dst4q+stride3q -1]
+ pinsrd xm9, [botq -1], 1
+ shufps xm11, xm9, q1020 ; +5 +6 +7 +8
+ pmovzxbw m9, [leftq+3]
+ vinserti128 m6, xm11, 1
+ movu m11, [blend_4x8_0+4]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m9, m11
+ %endif
+%else
+ lea r13, [blend_8x8_0+16]
+ movq xm5, [top2q +1]
+ vbroadcasti128 m10, [dstq+strideq*1-1]
+ vbroadcasti128 m11, [dstq+strideq*2-1]
+ movhps xm5, [dstq+strideq*0+1]
+ vinserti128 m6, m10, [dstq+stride3q-1], 1
+ vinserti128 m9, m11, [botq -1], 1
+ psrldq m10, 2
+ psrldq m11, 2
+ punpcklqdq m6, m9
+ movu m9, [r13+hq*2*1+16*1]
+ punpcklqdq m10, m11
+ vpblendd m5, m10, 0xF0
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9
+%endif
+ ret
+.d1k0:
+.d2k0:
+.d3k0:
+%if %1 == 4
+ %if %2 == 4
+ movq xm6, [dstq+strideq*0-1]
+ movq xm9, [dstq+strideq*1-1]
+ vinserti128 m6, [dstq+strideq*2-1], 1
+ vinserti128 m9, [dstq+stride3q -1], 1
+ movu m11, [rsp+gprsize+0x10]
+ pcmpeqd m12, m12
+ psrldq m5, m6, 2
+ psrldq m10, m9, 2
+ psrld m12, 24
+ punpckldq m6, m9
+ punpckldq m5, m10
+ vpblendvb m6, m11, m12
+ %else
+ movq xm6, [dstq +strideq*0-1]
+ movq xm9, [dstq +strideq*2-1]
+ movhps xm6, [dstq +strideq*1-1]
+ movhps xm9, [dstq +stride3q -1]
+ movq xm10, [dst4q+strideq*0-1]
+ movhps xm10, [dst4q+strideq*1-1]
+ psrldq xm5, xm6, 2
+ psrldq xm11, xm9, 2
+ shufps xm5, xm11, q2020
+ movq xm11, [dst4q+strideq*2-1]
+ movhps xm11, [dst4q+stride3q -1]
+ shufps xm6, xm9, q2020
+ shufps xm9, xm10, xm11, q2020
+ vinserti128 m6, xm9, 1
+ pmovzxbw m9, [leftq+1]
+ psrldq xm10, 2
+ psrldq xm11, 2
+ shufps xm10, xm11, q2020
+ vpbroadcastd m11, [blend_4x8_0+4]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m9, m11
+ %endif
+%else
+ movu xm5, [dstq+strideq*0-1]
+ movu xm9, [dstq+strideq*1-1]
+ vinserti128 m5, [dstq+strideq*2-1], 1
+ vinserti128 m9, [dstq+stride3q -1], 1
+ movu m10, [blend_8x8_0+16]
+ punpcklqdq m6, m5, m9
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64], m10
+ psrldq m5, 2
+ psrldq m9, 2
+ punpcklqdq m5, m9
+%endif
+ ret
+.d4k0:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m10, [dstq+strideq*1-1]
+ vpbroadcastq m11, [dstq+strideq*2-1]
+ movd xm6, [topq+strideq*1-1]
+ movd xm9, [dstq+strideq*0-1]
+ psrldq m5, m10, 2
+ psrldq m12, m11, 2
+ vpblendd m6, m10, 0x10
+ vpblendd m9, m11, 0x10
+ movu m10, [blend_4x4]
+ vinserti128 m5, [dstq+stride3q +1], 1
+ vinserti128 m12, [botq +1], 1
+ punpckldq m6, m9
+ punpckldq m5, m12
+ vpblendvb m6, [rsp+gprsize+0x40], m10
+ %else
+ movd xm6, [topq +strideq*1-1]
+ movq xm9, [dstq +strideq*1-1]
+ movq xm10, [dstq +stride3q -1]
+ movq xm11, [dst4q+strideq*1-1]
+ pinsrd xm6, [dstq +strideq*0-1], 1
+ movhps xm9, [dstq +strideq*2-1]
+ movhps xm10, [dst4q+strideq*0-1]
+ movhps xm11, [dst4q+strideq*2-1]
+ psrldq xm5, xm9, 2
+ shufps xm6, xm9, q2010
+ psrldq xm9, xm10, 2
+ shufps xm5, xm9, q2020
+ shufps xm10, xm11, q2020
+ movd xm9, [dst4q+stride3q +1]
+ vinserti128 m6, xm10, 1
+ pinsrd xm9, [botq +1], 1
+ psrldq xm11, 2
+ pmovzxbw m10, [leftq-1]
+ shufps xm11, xm9, q1020
+ movu m9, [blend_4x8_0]
+ vinserti128 m5, xm11, 1
+ vpblendvb m6, m10, m9
+ %endif
+%else
+ lea r13, [blend_8x8_0+8]
+ movq xm6, [top2q -1]
+ vbroadcasti128 m5, [dstq+strideq*1-1]
+ vbroadcasti128 m9, [dstq+strideq*2-1]
+ movhps xm6, [dstq+strideq*0-1]
+ movu m11, [r13+hq*2*1+16*1]
+ punpcklqdq m10, m5, m9
+ vinserti128 m5, [dstq+stride3q -1], 1
+ vinserti128 m9, [botq -1], 1
+ vpblendd m6, m10, 0xF0
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11
+ psrldq m5, 2
+ psrldq m9, 2
+ punpcklqdq m5, m9
+%endif
+ ret
+.d5k0:
+.d6k0:
+.d7k0:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*1 ]
+ vpbroadcastd m5, [dstq+strideq*1 ]
+ vpbroadcastd m9, [dstq+strideq*2 ]
+ vpblendd xm6, [dstq+strideq*0-4], 0x2
+ vpblendd m5, m9, 0x22
+ vpblendd m6, m5, 0x30
+ vinserti128 m5, [dstq+stride3q ], 1
+ vpblendd m5, [botq -20], 0x20
+ %else
+ movd xm6, [topq +strideq*1]
+ movd xm5, [dstq +strideq*1]
+ movd xm9, [dstq +stride3q ]
+ movd xm10, [dst4q+strideq*1]
+ movd xm11, [dst4q+stride3q ]
+ pinsrd xm6, [dstq +strideq*0], 1
+ pinsrd xm5, [dstq +strideq*2], 1
+ pinsrd xm9, [dst4q+strideq*0], 1
+ pinsrd xm10, [dst4q+strideq*2], 1
+ pinsrd xm11, [botq ], 1
+ punpcklqdq xm6, xm5
+ punpcklqdq xm5, xm9
+ punpcklqdq xm9, xm10
+ punpcklqdq xm10, xm11
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ %endif
+%else
+ movq xm6, [top2q ]
+ movq xm5, [dstq+strideq*1]
+ movq xm9, [dstq+stride3q ]
+ movhps xm6, [dstq+strideq*0]
+ movhps xm5, [dstq+strideq*2]
+ movhps xm9, [botq ]
+ vinserti128 m6, xm5, 1
+ vinserti128 m5, xm9, 1
+%endif
+ ret
+.d0k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [dstq+strideq*2-2]
+ movd xm9, [dstq+stride3q -2]
+ movd xm5, [topq+strideq*0+2]
+ movd xm10, [topq+strideq*1+2]
+ pinsrw xm6, [leftq+4], 0
+ pinsrw xm9, [leftq+6], 0
+ vinserti128 m5, [dstq+strideq*0+2], 1
+ vinserti128 m10, [dstq+strideq*1+2], 1
+ vinserti128 m6, [botq+strideq*0-2], 1
+ vinserti128 m9, [botq+strideq*1-2], 1
+ punpckldq m5, m10
+ punpckldq m6, m9
+ %else
+ movq xm6, [dstq +strideq*2-2]
+ movd xm10, [dst4q+strideq*2-2]
+ movd xm5, [topq +strideq*0+2]
+ movq xm9, [dst4q+strideq*0-2]
+ movhps xm6, [dstq +stride3q -2]
+ pinsrw xm10, [dst4q+stride3q ], 3
+ pinsrd xm5, [topq +strideq*1+2], 1
+ movhps xm9, [dst4q+strideq*1-2]
+ pinsrd xm10, [botq +strideq*0-2], 2
+ pinsrd xm5, [dstq +strideq*0+2], 2
+ pinsrd xm10, [botq +strideq*1-2], 3
+ pinsrd xm5, [dstq +strideq*1+2], 3
+ shufps xm11, xm6, xm9, q3131
+ shufps xm6, xm9, q2020
+ movu m9, [blend_4x8_3+8]
+ vinserti128 m6, xm10, 1
+ vinserti128 m5, xm11, 1
+ vpblendvb m6, [rsp+gprsize+0x10+8], m9
+ %endif
+%else
+ lea r13, [blend_8x8_1+16]
+ movq xm6, [dstq+strideq*2-2]
+ movq xm9, [dstq+stride3q -2]
+ movq xm5, [top1q +2]
+ movq xm10, [top2q +2]
+ movu m11, [r13+hq*2*2+16*2]
+ vinserti128 m6, [botq+strideq*0-2], 1
+ vinserti128 m9, [botq+strideq*1-2], 1
+ vinserti128 m5, [dstq+strideq*0+2], 1
+ vinserti128 m10, [dstq+strideq*1+2], 1
+ punpcklqdq m6, m9
+ punpcklqdq m5, m10
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11
+%endif
+ ret
+.d1k1:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m6, [dstq+strideq*1-2]
+ vpbroadcastq m9, [dstq+strideq*2-2]
+ movd xm5, [topq+strideq*1+2]
+ movd xm10, [dstq+strideq*0+2]
+ psrldq m11, m6, 4
+ psrldq m12, m9, 4
+ vpblendd m5, m11, 0x10
+ movq xm11, [leftq+2]
+ vinserti128 m6, [dstq+stride3q-2], 1
+ punpckldq xm11, xm11
+ vpblendd m10, m12, 0x10
+ pcmpeqd m12, m12
+ pmovzxwd m11, xm11
+ psrld m12, 16
+ punpckldq m6, m9
+ vpbroadcastd m9, [botq-2]
+ vpblendvb m6, m11, m12
+ punpckldq m5, m10
+ vpblendd m6, m9, 0x20
+ %else
+ movd xm5, [topq +strideq*1+2]
+ movq xm6, [dstq +strideq*1-2]
+ movq xm9, [dstq +stride3q -2]
+ movq xm10, [dst4q+strideq*1-2]
+ movd xm11, [dst4q+stride3q -2]
+ pinsrd xm5, [dstq +strideq*0+2], 1
+ movhps xm6, [dstq +strideq*2-2]
+ movhps xm9, [dst4q+strideq*0-2]
+ movhps xm10, [dst4q+strideq*2-2]
+ pinsrd xm11, [botq -2], 1
+ shufps xm5, xm6, q3110
+ shufps xm6, xm9, q2020
+ shufps xm9, xm10, q3131
+ shufps xm10, xm11, q1020
+ movu m11, [blend_4x8_2+4]
+ vinserti128 m6, xm10, 1
+ vinserti128 m5, xm9, 1
+ vpblendvb m6, [rsp+gprsize+0x10+4], m11
+ %endif
+%else
+ lea r13, [blend_8x8_1+16]
+ movq xm5, [top2q +2]
+ vbroadcasti128 m6, [dstq+strideq*1-2]
+ vbroadcasti128 m9, [dstq+strideq*2-2]
+ movhps xm5, [dstq+strideq*0+2]
+ shufps m10, m6, m9, q2121
+ vinserti128 m6, [dstq+stride3q -2], 1
+ vinserti128 m9, [botq -2], 1
+ movu m11, [r13+hq*2*1+16*1]
+ vpblendd m5, m10, 0xF0
+ punpcklqdq m6, m9
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11
+%endif
+ ret
+.d2k1:
+%if %1 == 4
+ %if %2 == 4
+ movq xm11, [leftq]
+ movq xm6, [dstq+strideq*0-2]
+ movq xm9, [dstq+strideq*1-2]
+ vinserti128 m6, [dstq+strideq*2-2], 1
+ vinserti128 m9, [dstq+stride3q -2], 1
+ punpckldq xm11, xm11
+ psrldq m5, m6, 4
+ psrldq m10, m9, 4
+ pmovzxwd m11, xm11
+ punpckldq m6, m9
+ punpckldq m5, m10
+ pblendw m6, m11, 0x05
+ %else
+ movq xm5, [dstq +strideq*0-2]
+ movq xm9, [dstq +strideq*2-2]
+ movq xm10, [dst4q+strideq*0-2]
+ movq xm11, [dst4q+strideq*2-2]
+ movhps xm5, [dstq +strideq*1-2]
+ movhps xm9, [dstq +stride3q -2]
+ movhps xm10, [dst4q+strideq*1-2]
+ movhps xm11, [dst4q+stride3q -2]
+ shufps xm6, xm5, xm9, q2020
+ shufps xm5, xm9, q3131
+ shufps xm9, xm10, xm11, q2020
+ shufps xm10, xm11, q3131
+ pmovzxwd m11, [leftq]
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ pblendw m6, m11, 0x55
+ %endif
+%else
+ mova m11, [rsp+gprsize+0x20+hq*8+64]
+ movu xm5, [dstq+strideq*0-2]
+ movu xm9, [dstq+strideq*1-2]
+ vinserti128 m5, [dstq+strideq*2-2], 1
+ vinserti128 m9, [dstq+stride3q -2], 1
+ shufps m6, m5, m9, q1010
+ shufps m5, m9, q2121
+ pblendw m6, m11, 0x11
+%endif
+ ret
+.d3k1:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m11, [dstq+strideq*1-2]
+ vpbroadcastq m12, [dstq+strideq*2-2]
+ movd xm6, [topq+strideq*1-2]
+ movd xm9, [dstq+strideq*0-2]
+ pblendw m11, [leftq-16+2], 0x01
+ pblendw m12, [leftq-16+4], 0x01
+ pinsrw xm9, [leftq- 0+0], 0
+ psrldq m5, m11, 4
+ psrldq m10, m12, 4
+ vinserti128 m5, [dstq+stride3q +2], 1
+ vinserti128 m10, [botq +2], 1
+ vpblendd m6, m11, 0x10
+ vpblendd m9, m12, 0x10
+ punpckldq m6, m9
+ punpckldq m5, m10
+ %else
+ movd xm6, [topq +strideq*1-2]
+ movq xm5, [dstq +strideq*1-2]
+ movq xm9, [dstq +stride3q -2]
+ movq xm10, [dst4q+strideq*1-2]
+ movd xm11, [dst4q+stride3q +2]
+ pinsrw xm6, [dstq +strideq*0 ], 3
+ movhps xm5, [dstq +strideq*2-2]
+ movhps xm9, [dst4q+strideq*0-2]
+ movhps xm10, [dst4q+strideq*2-2]
+ pinsrd xm11, [botq +2], 1
+ shufps xm6, xm5, q2010
+ shufps xm5, xm9, q3131
+ shufps xm9, xm10, q2020
+ shufps xm10, xm11, q1031
+ movu m11, [blend_4x8_2]
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, [rsp+gprsize+0x10-4], m11
+ %endif
+%else
+ lea r13, [blend_8x8_1+8]
+ movq xm6, [top2q -2]
+ vbroadcasti128 m5, [dstq+strideq*1-2]
+ vbroadcasti128 m10, [dstq+strideq*2-2]
+ movhps xm6, [dstq+strideq*0-2]
+ punpcklqdq m9, m5, m10
+ vinserti128 m5, [dstq+stride3q -2], 1
+ vinserti128 m10, [botq -2], 1
+ movu m11, [r13+hq*2*1+16*1]
+ vpblendd m6, m9, 0xF0
+ shufps m5, m10, q2121
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11
+%endif
+ ret
+.d4k1:
+%if %1 == 4
+ %if %2 == 4
+ vinserti128 m6, [dstq+strideq*0-2], 1
+ vinserti128 m9, [dstq+strideq*1-2], 1
+ movd xm5, [dstq+strideq*2+2]
+ movd xm10, [dstq+stride3q +2]
+ pblendw m6, [leftq-16+0], 0x01
+ pblendw m9, [leftq-16+2], 0x01
+ vinserti128 m5, [botq+strideq*0+2], 1
+ vinserti128 m10, [botq+strideq*1+2], 1
+ vpblendd m6, [topq+strideq*0-2], 0x01
+ vpblendd m9, [topq+strideq*1-2], 0x01
+ punpckldq m5, m10
+ punpckldq m6, m9
+ %else
+ movd xm6, [topq +strideq*0-2]
+ movq xm5, [dstq +strideq*2-2]
+ movq xm9, [dst4q+strideq*0-2]
+ movd xm10, [dst4q+strideq*2+2]
+ pinsrd xm6, [topq +strideq*1-2], 1
+ movhps xm5, [dstq +stride3q -2]
+ movhps xm9, [dst4q+strideq*1-2]
+ pinsrd xm10, [dst4q+stride3q +2], 1
+ pinsrd xm6, [dstq +strideq*0-2], 2
+ pinsrd xm10, [botq +strideq*0+2], 2
+ pinsrd xm6, [dstq +strideq*1-2], 3
+ pinsrd xm10, [botq +strideq*1+2], 3
+ shufps xm11, xm5, xm9, q2020
+ shufps xm5, xm9, q3131
+ movu m9, [blend_4x8_3]
+ vinserti128 m6, xm11, 1
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, [rsp+gprsize+0x10-8], m9
+ %endif
+%else
+ lea r13, [blend_8x8_1]
+ movu m11, [r13+hq*2*2+16*2]
+ movq xm6, [top1q -2]
+ movq xm9, [top2q -2]
+ movq xm5, [dstq+strideq*2+2]
+ movq xm10, [dstq+stride3q +2]
+ vinserti128 m6, [dstq+strideq*0-2], 1
+ vinserti128 m9, [dstq+strideq*1-2], 1
+ vinserti128 m5, [botq+strideq*0+2], 1
+ vinserti128 m10, [botq+strideq*1+2], 1
+ punpcklqdq m6, m9
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11
+ punpcklqdq m5, m10
+%endif
+ ret
+.d5k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*0-1]
+ movd xm9, [topq+strideq*1-1]
+ movd xm5, [dstq+strideq*2+1]
+ movd xm10, [dstq+stride3q +1]
+ pcmpeqd m12, m12
+ pmovzxbw m11, [leftq-8+1]
+ psrld m12, 24
+ vinserti128 m6, [dstq+strideq*0-1], 1
+ vinserti128 m9, [dstq+strideq*1-1], 1
+ vinserti128 m5, [botq+strideq*0+1], 1
+ vinserti128 m10, [botq+strideq*1+1], 1
+ punpckldq m6, m9
+ pxor m9, m9
+ vpblendd m12, m9, 0x0F
+ punpckldq m5, m10
+ vpblendvb m6, m11, m12
+ %else
+ movd xm6, [topq +strideq*0-1]
+ movq xm5, [dstq +strideq*2-1]
+ movq xm9, [dst4q+strideq*0-1]
+ movd xm10, [dst4q+strideq*2+1]
+ pinsrd xm6, [topq +strideq*1-1], 1
+ movhps xm5, [dstq +stride3q -1]
+ movhps xm9, [dst4q+strideq*1-1]
+ pinsrd xm10, [dst4q+stride3q +1], 1
+ pinsrd xm6, [dstq +strideq*0-1], 2
+ pinsrd xm10, [botq +strideq*0+1], 2
+ pinsrd xm6, [dstq +strideq*1-1], 3
+ pinsrd xm10, [botq +strideq*1+1], 3
+ shufps xm11, xm5, xm9, q2020
+ vinserti128 m6, xm11, 1
+ pmovzxbw m11, [leftq-3]
+ psrldq xm5, 2
+ psrldq xm9, 2
+ shufps xm5, xm9, q2020
+ movu m9, [blend_4x8_1]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m11, m9
+ %endif
+%else
+ lea r13, [blend_8x8_0]
+ movu m11, [r13+hq*2*2+16*2]
+ movq xm6, [top1q -1]
+ movq xm9, [top2q -1]
+ movq xm5, [dstq+strideq*2+1]
+ movq xm10, [dstq+stride3q +1]
+ vinserti128 m6, [dstq+strideq*0-1], 1
+ vinserti128 m9, [dstq+strideq*1-1], 1
+ vinserti128 m5, [botq+strideq*0+1], 1
+ vinserti128 m10, [botq+strideq*1+1], 1
+ punpcklqdq m6, m9
+ punpcklqdq m5, m10
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11
+%endif
+ ret
+.d6k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*0]
+ movd xm9, [topq+strideq*1]
+ movd xm5, [dstq+strideq*2]
+ movd xm10, [dstq+stride3q ]
+ vinserti128 m6, [dstq+strideq*0], 1
+ vinserti128 m9, [dstq+strideq*1], 1
+ vinserti128 m5, [botq+strideq*0], 1
+ vinserti128 m10, [botq+strideq*1], 1
+ punpckldq m6, m9
+ punpckldq m5, m10
+ %else
+ movd xm5, [dstq +strideq*2]
+ movd xm6, [topq +strideq*0]
+ movd xm9, [dst4q+strideq*2]
+ pinsrd xm5, [dstq +stride3q ], 1
+ pinsrd xm6, [topq +strideq*1], 1
+ pinsrd xm9, [dst4q+stride3q ], 1
+ pinsrd xm5, [dst4q+strideq*0], 2
+ pinsrd xm6, [dstq +strideq*0], 2
+ pinsrd xm9, [botq +strideq*0], 2
+ pinsrd xm5, [dst4q+strideq*1], 3
+ pinsrd xm6, [dstq +strideq*1], 3
+ pinsrd xm9, [botq +strideq*1], 3
+ vinserti128 m6, xm5, 1
+ vinserti128 m5, xm9, 1
+ %endif
+%else
+ movq xm5, [dstq+strideq*2]
+ movq xm9, [botq+strideq*0]
+ movq xm6, [top1q ]
+ movq xm10, [dstq+strideq*0]
+ movhps xm5, [dstq+stride3q ]
+ movhps xm9, [botq+strideq*1]
+ movhps xm6, [top2q ]
+ movhps xm10, [dstq+strideq*1]
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm10, 1
+%endif
+ ret
+.d7k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm5, [dstq+strideq*2-1]
+ movd xm9, [dstq+stride3q -1]
+ movd xm6, [topq+strideq*0+1]
+ movd xm10, [topq+strideq*1+1]
+ pinsrb xm5, [leftq+ 5], 0
+ pinsrb xm9, [leftq+ 7], 0
+ vinserti128 m6, [dstq+strideq*0+1], 1
+ vinserti128 m10, [dstq+strideq*1+1], 1
+ vinserti128 m5, [botq+strideq*0-1], 1
+ vinserti128 m9, [botq+strideq*1-1], 1
+ punpckldq m6, m10
+ punpckldq m5, m9
+ %else
+ movd xm6, [topq +strideq*0+1]
+ movq xm9, [dstq +strideq*2-1]
+ movq xm10, [dst4q+strideq*0-1]
+ movd xm11, [dst4q+strideq*2-1]
+ pinsrd xm6, [topq +strideq*1+1], 1
+ movhps xm9, [dstq +stride3q -1]
+ movhps xm10, [dst4q+strideq*1-1]
+ pinsrd xm11, [dst4q+stride3q -1], 1
+ pinsrd xm6, [dstq +strideq*0+1], 2
+ pinsrd xm11, [botq +strideq*0-1], 2
+ pinsrd xm6, [dstq +strideq*1+1], 3
+ pinsrd xm11, [botq +strideq*1-1], 3
+ shufps xm5, xm9, xm10, q2020
+ vinserti128 m5, xm11, 1
+ pmovzxbw m11, [leftq+5]
+ psrldq xm9, 2
+ psrldq xm10, 2
+ shufps xm9, xm10, q2020
+ movu m10, [blend_4x8_1+8]
+ vinserti128 m6, xm9, 1
+ vpblendvb m5, m11, m10
+ %endif
+%else
+ lea r13, [blend_8x8_0+16]
+ movq xm5, [dstq+strideq*2-1]
+ movq xm9, [botq+strideq*0-1]
+ movq xm6, [top1q +1]
+ movq xm10, [dstq+strideq*0+1]
+ movhps xm5, [dstq+stride3q -1]
+ movhps xm9, [botq+strideq*1-1]
+ movhps xm6, [top2q +1]
+ movhps xm10, [dstq+strideq*1+1]
+ movu m11, [r13+hq*2*2+16*2]
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm10, 1
+ vpblendvb m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11
+%endif
+ ret
+
+.border_block:
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
+%define rstk rsp
+%assign stack_offset stack_offset_entry
+%assign regs_used 11
+ ALLOC_STACK 2*16+(%2+4)*32, 16
+%define px rsp+2*16+2*32
+
+ pcmpeqw m14, m14
+ psllw m14, 15 ; 0x8000
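+    ; all-ones + psllw materializes the 0x8000 sentinel without a memory
+    ; constant; it is stored into every px-buffer word that lies outside the
+    ; available edges (rows are 32 bytes / 16 words, with 2 guard rows on
+    ; top/bottom and 2 guard words on each side of the block)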
+
+ ; prepare pixel buffers - body/right
+%if %1 == 4
+ INIT_XMM avx2
+%endif
+%if %2 == 8
+ lea dst4q, [dstq+strideq*4]
+%endif
+ lea stride3q, [strideq*3]
+ test edgeb, 2 ; have_right
+ jz .no_right
+ pmovzxbw m1, [dstq+strideq*0]
+ pmovzxbw m2, [dstq+strideq*1]
+ pmovzxbw m3, [dstq+strideq*2]
+ pmovzxbw m4, [dstq+stride3q]
+ mova [px+0*32], m1
+ mova [px+1*32], m2
+ mova [px+2*32], m3
+ mova [px+3*32], m4
+%if %2 == 8
+ pmovzxbw m1, [dst4q+strideq*0]
+ pmovzxbw m2, [dst4q+strideq*1]
+ pmovzxbw m3, [dst4q+strideq*2]
+ pmovzxbw m4, [dst4q+stride3q]
+ mova [px+4*32], m1
+ mova [px+5*32], m2
+ mova [px+6*32], m3
+ mova [px+7*32], m4
+%endif
+ jmp .body_done
+.no_right:
+%if %1 == 4
+ movd xm1, [dstq+strideq*0]
+ movd xm2, [dstq+strideq*1]
+ movd xm3, [dstq+strideq*2]
+ movd xm4, [dstq+stride3q]
+ pmovzxbw xm1, xm1
+ pmovzxbw xm2, xm2
+ pmovzxbw xm3, xm3
+ pmovzxbw xm4, xm4
+ movq [px+0*32], xm1
+ movq [px+1*32], xm2
+ movq [px+2*32], xm3
+ movq [px+3*32], xm4
+%else
+ pmovzxbw xm1, [dstq+strideq*0]
+ pmovzxbw xm2, [dstq+strideq*1]
+ pmovzxbw xm3, [dstq+strideq*2]
+ pmovzxbw xm4, [dstq+stride3q]
+ mova [px+0*32], xm1
+ mova [px+1*32], xm2
+ mova [px+2*32], xm3
+ mova [px+3*32], xm4
+%endif
+ movd [px+0*32+%1*2], xm14
+ movd [px+1*32+%1*2], xm14
+ movd [px+2*32+%1*2], xm14
+ movd [px+3*32+%1*2], xm14
+%if %2 == 8
+ %if %1 == 4
+ movd xm1, [dst4q+strideq*0]
+ movd xm2, [dst4q+strideq*1]
+ movd xm3, [dst4q+strideq*2]
+ movd xm4, [dst4q+stride3q]
+ pmovzxbw xm1, xm1
+ pmovzxbw xm2, xm2
+ pmovzxbw xm3, xm3
+ pmovzxbw xm4, xm4
+ movq [px+4*32], xm1
+ movq [px+5*32], xm2
+ movq [px+6*32], xm3
+ movq [px+7*32], xm4
+ %else
+ pmovzxbw xm1, [dst4q+strideq*0]
+ pmovzxbw xm2, [dst4q+strideq*1]
+ pmovzxbw xm3, [dst4q+strideq*2]
+ pmovzxbw xm4, [dst4q+stride3q]
+ mova [px+4*32], xm1
+ mova [px+5*32], xm2
+ mova [px+6*32], xm3
+ mova [px+7*32], xm4
+ %endif
+ movd [px+4*32+%1*2], xm14
+ movd [px+5*32+%1*2], xm14
+ movd [px+6*32+%1*2], xm14
+ movd [px+7*32+%1*2], xm14
+%endif
+.body_done:
+
+ ; top
+ test edgeb, 4 ; have_top
+ jz .no_top
+ test edgeb, 1 ; have_left
+ jz .top_no_left
+ test edgeb, 2 ; have_right
+ jz .top_no_right
+ pmovzxbw m1, [topq+strideq*0-(%1/2)]
+ pmovzxbw m2, [topq+strideq*1-(%1/2)]
+ movu [px-2*32-%1], m1
+ movu [px-1*32-%1], m2
+ jmp .top_done
+.top_no_right:
+ pmovzxbw m1, [topq+strideq*0-%1]
+ pmovzxbw m2, [topq+strideq*1-%1]
+ movu [px-2*32-%1*2], m1
+ movu [px-1*32-%1*2], m2
+ movd [px-2*32+%1*2], xm14
+ movd [px-1*32+%1*2], xm14
+ jmp .top_done
+.top_no_left:
+ test edgeb, 2 ; have_right
+ jz .top_no_left_right
+ pmovzxbw m1, [topq+strideq*0]
+ pmovzxbw m2, [topq+strideq*1]
+ mova [px-2*32+0], m1
+ mova [px-1*32+0], m2
+ movd [px-2*32-4], xm14
+ movd [px-1*32-4], xm14
+ jmp .top_done
+.top_no_left_right:
+%if %1 == 4
+ movd xm1, [topq+strideq*0]
+ pinsrd xm1, [topq+strideq*1], 1
+ pmovzxbw xm1, xm1
+ movq [px-2*32+0], xm1
+ movhps [px-1*32+0], xm1
+%else
+ pmovzxbw xm1, [topq+strideq*0]
+ pmovzxbw xm2, [topq+strideq*1]
+ mova [px-2*32+0], xm1
+ mova [px-1*32+0], xm2
+%endif
+ movd [px-2*32-4], xm14
+ movd [px-1*32-4], xm14
+ movd [px-2*32+%1*2], xm14
+ movd [px-1*32+%1*2], xm14
+ jmp .top_done
+.no_top:
+ movu [px-2*32-%1], m14
+ movu [px-1*32-%1], m14
+.top_done:
+
+ ; left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ pmovzxbw xm1, [leftq+ 0]
+%if %2 == 8
+ pmovzxbw xm2, [leftq+ 8]
+%endif
+ movd [px+0*32-4], xm1
+ pextrd [px+1*32-4], xm1, 1
+ pextrd [px+2*32-4], xm1, 2
+ pextrd [px+3*32-4], xm1, 3
+%if %2 == 8
+ movd [px+4*32-4], xm2
+ pextrd [px+5*32-4], xm2, 1
+ pextrd [px+6*32-4], xm2, 2
+ pextrd [px+7*32-4], xm2, 3
+%endif
+ jmp .left_done
+.no_left:
+ movd [px+0*32-4], xm14
+ movd [px+1*32-4], xm14
+ movd [px+2*32-4], xm14
+ movd [px+3*32-4], xm14
+%if %2 == 8
+ movd [px+4*32-4], xm14
+ movd [px+5*32-4], xm14
+ movd [px+6*32-4], xm14
+ movd [px+7*32-4], xm14
+%endif
+.left_done:
+
+ ; bottom
+ DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge
+ test edgeb, 8 ; have_bottom
+ jz .no_bottom
+ test edgeb, 1 ; have_left
+ jz .bottom_no_left
+ test edgeb, 2 ; have_right
+ jz .bottom_no_right
+ pmovzxbw m1, [botq+strideq*0-(%1/2)]
+ pmovzxbw m2, [botq+strideq*1-(%1/2)]
+ movu [px+(%2+0)*32-%1], m1
+ movu [px+(%2+1)*32-%1], m2
+ jmp .bottom_done
+.bottom_no_right:
+ pmovzxbw m1, [botq+strideq*0-%1]
+ pmovzxbw m2, [botq+strideq*1-%1]
+ movu [px+(%2+0)*32-%1*2], m1
+ movu [px+(%2+1)*32-%1*2], m2
+%if %1 == 8
+ movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu
+%endif
+ movd [px+(%2+0)*32+%1*2], xm14
+ movd [px+(%2+1)*32+%1*2], xm14
+ jmp .bottom_done
+.bottom_no_left:
+ test edgeb, 2 ; have_right
+ jz .bottom_no_left_right
+ pmovzxbw m1, [botq+strideq*0]
+ pmovzxbw m2, [botq+strideq*1]
+ mova [px+(%2+0)*32+0], m1
+ mova [px+(%2+1)*32+0], m2
+ movd [px+(%2+0)*32-4], xm14
+ movd [px+(%2+1)*32-4], xm14
+ jmp .bottom_done
+.bottom_no_left_right:
+%if %1 == 4
+ movd xm1, [botq+strideq*0]
+ pinsrd xm1, [botq+strideq*1], 1
+ pmovzxbw xm1, xm1
+ movq [px+(%2+0)*32+0], xm1
+ movhps [px+(%2+1)*32+0], xm1
+%else
+ pmovzxbw xm1, [botq+strideq*0]
+ pmovzxbw xm2, [botq+strideq*1]
+ mova [px+(%2+0)*32+0], xm1
+ mova [px+(%2+1)*32+0], xm2
+%endif
+ movd [px+(%2+0)*32-4], xm14
+ movd [px+(%2+1)*32-4], xm14
+ movd [px+(%2+0)*32+%1*2], xm14
+ movd [px+(%2+1)*32+%1*2], xm14
+ jmp .bottom_done
+.no_bottom:
+ movu [px+(%2+0)*32-%1], m14
+ movu [px+(%2+1)*32-%1], m14
+.bottom_done:
+
+ ; actual filter
+ INIT_YMM avx2
+ DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero
+%undef edged
+ ; register to shuffle values into after packing
+ vbroadcasti128 m12, [shufb_lohi]
+
+ mov dampingd, r8m
+ xor zerod, zerod
+ movifnidn prid, prim
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ test prid, prid
+ jz .border_sec_only
+ movd xm0, prid
+ lzcnt pridmpd, prid
+ add pridmpd, dampingd
+ cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ test secdmpd, secdmpd
+ jz .border_pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+
+ DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3
+ vpbroadcastb m0, xm0 ; pri_strength
+ vpbroadcastb m1, xm1 ; sec_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ lea secq, [tableq+12] ; sec_taps
+
+ BORDER_PREP_REGS %1, %2
+%if %1*%2*2/mmsize > 1
+.border_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2, 1
+.border_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
+ dec kq
+ jge .border_k_loop
+
+ vpbroadcastd m10, [pw_2048]
+ BORDER_ADJUST_PIXEL %1, m10, 1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_v_loop
+%endif
+ RET
+
+.border_pri_only:
+ DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m1, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_pri_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
+ dec kq
+ jge .border_pri_k_loop
+ BORDER_ADJUST_PIXEL %1, m1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_pri_v_loop
+%endif
+ RET
+
+.border_sec_only:
+ DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m0, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_sec_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
+ dec kq
+ jge .border_sec_k_loop
+ BORDER_ADJUST_PIXEL %1, m0
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_sec_v_loop
+%endif
+ RET
+%endmacro
+
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+
+INIT_YMM avx2
+cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
+ lea stride3q, [strideq*3]
+ movq xm0, [srcq+strideq*0]
+ movq xm1, [srcq+strideq*1]
+ movq xm2, [srcq+strideq*2]
+ movq xm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m4, [srcq+stride3q ]
+ vpbroadcastq m5, [srcq+strideq*2]
+ vpblendd m0, m4, 0xf0
+ vpblendd m1, m5, 0xf0
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m5, [srcq+strideq*0]
+ vpblendd m2, m4, 0xf0
+ vpblendd m3, m5, 0xf0
+ pxor m4, m4
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+cglobal_label .main
+ vpbroadcastd m4, [pw_128]
+ PROLOGUE 3, 4, 15
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
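+    ; removing the 128 bias recenters the u8 pixels around zero, so the
+    ; 8-element partial sums below fit comfortably in int16 before pmaddwd
+    ; squares them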
+
+ ; shuffle registers to generate partial_sum_diag[0-1] together
+ vperm2i128 m7, m0, m0, 0x01
+ vperm2i128 m6, m1, m1, 0x01
+ vperm2i128 m5, m2, m2, 0x01
+ vperm2i128 m4, m3, m3, 0x01
+
+ ; start with partial_sum_hv[0-1]
+ paddw m8, m0, m1
+ paddw m9, m2, m3
+ phaddw m10, m0, m1
+ phaddw m11, m2, m3
+ paddw m8, m9
+ phaddw m10, m11
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ paddw xm8, xm9 ; partial_sum_hv[1]
+ phaddw xm10, xm11 ; partial_sum_hv[0]
+ vinserti128 m8, xm10, 1
+ vpbroadcastd m9, [div_table+44]
+ pmaddwd m8, m8
+ pmulld m8, m9 ; cost6[2a-d] | cost2[a-d]
+
+ ; create aggregates [lower half]:
+ ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
+ ; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
+ ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
+ ; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
+ ; and [upper half]:
+ ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
+ ; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
+ ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
+ ; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
+ ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m9, m1, 2
+ psrldq m10, m1, 14
+ pslldq m11, m2, 4
+ psrldq m12, m2, 12
+ pslldq m13, m3, 6
+ psrldq m14, m3, 10
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m4, 8
+ psrldq m12, m4, 8
+ pslldq m13, m5, 10
+ psrldq m14, m5, 6
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m6, 12
+ psrldq m12, m6, 4
+ pslldq m13, m7, 14
+ psrldq m14, m7, 2
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero]
+ vbroadcasti128 m14, [shufw_6543210x]
+ vbroadcasti128 m13, [div_table+16]
+ vbroadcasti128 m12, [div_table+0]
+ paddw m9, m0 ; partial_sum_diag[0/1][0-7]
+ pshufb m10, m14
+ punpckhwd m11, m9, m10
+ punpcklwd m9, m10
+ pmaddwd m11, m11
+ pmaddwd m9, m9
+ pmulld m11, m13
+ pmulld m9, m12
+ paddd m9, m11 ; cost0[a-d] | cost4[a-d]
+
+ ; merge horizontally and vertically for partial_sum_alt[0-3]
+ paddw m10, m0, m1
+ paddw m11, m2, m3
+ paddw m12, m4, m5
+ paddw m13, m6, m7
+ phaddw m0, m4
+ phaddw m1, m5
+ phaddw m2, m6
+ phaddw m3, m7
+
+ ; create aggregates [lower half]:
+ ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
+ ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
+ ; and [upper half]:
+ ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
+ ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
+ ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m4, m11, 2
+ psrldq m11, 14
+ pslldq m5, m12, 4
+ psrldq m12, 12
+ pslldq m6, m13, 6
+ psrldq m13, 10
+ paddw m4, m10
+ paddw m11, m12
+ vpbroadcastd m12, [div_table+44]
+ paddw m5, m6
+ paddw m11, m13 ; partial_sum_alt[3/2] right
+ vbroadcasti128 m13, [div_table+32]
+ paddw m4, m5 ; partial_sum_alt[3/2] left
+ pshuflw m5, m11, q3012
+ punpckhwd m6, m11, m4
+ punpcklwd m4, m5
+ pmaddwd m6, m6
+ pmaddwd m4, m4
+ pmulld m6, m12
+ pmulld m4, m13
+ paddd m4, m6 ; cost7[a-d] | cost5[a-d]
+
+ ; create aggregates [lower half]:
+ ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
+ ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
+ ; and [upper half]:
+ ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
+ ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
+ ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m5, m1, 2
+ psrldq m1, 14
+ pslldq m6, m2, 4
+ psrldq m2, 12
+ pslldq m7, m3, 6
+ psrldq m3, 10
+ paddw m5, m0
+ paddw m1, m2
+ paddw m6, m7
+ paddw m1, m3 ; partial_sum_alt[0/1] right
+ paddw m5, m6 ; partial_sum_alt[0/1] left
+ pshuflw m0, m1, q3012
+ punpckhwd m1, m5
+ punpcklwd m5, m0
+ pmaddwd m1, m1
+ pmaddwd m5, m5
+ pmulld m1, m12
+ pmulld m5, m13
+ paddd m5, m1 ; cost1[a-d] | cost3[a-d]
+
+ mova xm0, [pd_47130256+ 16]
+ mova m1, [pd_47130256]
+ phaddd m9, m8
+ phaddd m5, m4
+ phaddd m9, m5
+ vpermd m0, m9 ; cost[0-3]
+ vpermd m1, m9 ; cost[4-7] | cost[0-3]
+
+ ; now find the best cost
+ pmaxsd xm2, xm0, xm1
+ pshufd xm3, xm2, q1032
+ pmaxsd xm2, xm3
+ pshufd xm3, xm2, q2301
+ pmaxsd xm2, xm3 ; best cost
+
+ ; find the idx using minpos
+ ; make everything other than the best cost negative via subtraction
+ ; find the min of unsigned 16-bit ints to sort out the negative values
+ psubd xm4, xm1, xm2
+ psubd xm3, xm0, xm2
+ packssdw xm3, xm4
+ phminposuw xm3, xm3
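+    ; the best cost packs to 0 while all others pack to negative words, which
+    ; compare as huge unsigned values; phminposuw thus returns 0 in word 0 and
+    ; the best direction index in word 1 (extracted by the psrld below)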
+
+ ; convert idx to 32-bits
+ psrld xm3, 16
+ movd eax, xm3
+
+ ; get idx^4 complement
+ vpermd m3, m1
+ psubd xm2, xm3
+ psrld xm2, 10
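+    ; the psubd/psrld above compute the reported direction variance:
+    ;   var = (best_cost - cost[best_dir ^ 4]) >> 10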
+ movd [varq], xm2
+ RET
+
+%endif ; ARCH_X86_64