diff options
Diffstat (limited to 'third_party/dav1d/src/x86/cdef16_avx512.asm')
-rw-r--r-- | third_party/dav1d/src/x86/cdef16_avx512.asm | 622 |
1 files changed, 622 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/cdef16_avx512.asm b/third_party/dav1d/src/x86/cdef16_avx512.asm new file mode 100644 index 0000000000..6d625a02a0 --- /dev/null +++ b/third_party/dav1d/src/x86/cdef16_avx512.asm @@ -0,0 +1,622 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +cdef_perm: db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21 + db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29 + db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37 + db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45 +end_perm4: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 +edge_mask4: dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111 + dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011 + dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111 +pri_taps4: dw 64, 32, 48, 48 ; left-shifted by 4 +cdef_dirs4: dw 8, 16, 8, 15, -7,-14, 1, -6 + dw 1, 2, 1, 10, 9, 18, 8, 17 + dw 8, 16, 8, 15, -7,-14, 1, -6 +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +cdef_dirs8: db 32, 64, 32, 62,-30,-60, 2,-28 + db 2, 4, 2, 36, 34, 68, 32, 66 + db 32, 64, 32, 62,-30,-60, 2,-28 +pri_taps8: dw 4, 4, 2, 2, 3, 3, 3, 3 +sec_taps4: dw 32, 16 +pw_m16384: times 2 dw -16384 +pw_2048: times 2 dw 2048 +pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4) +edge_mask8: dw 0x2121, 0x2020, 0x0101 + +SECTION .text + +%macro CONSTRAIN 7 ; dst, p, px, zero, tresh, shift, tmp + psubw %1, %2, %3 + pabsw %1, %1 + vpcmpgtw k1, %3, %2 + vpsrlvw %7, %1, %6 + psubusw %7, %5, %7 + pminsw %1, %7 + vpsubw %1{k1}, %4, %1 +%endmacro + +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 +; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7 +; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7 + +INIT_ZMM avx512icl +cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%define base r6-cdef_dirs4 + lea r6, [cdef_dirs4] + movu xm3, [dstq+strideq*0] + vinserti32x4 ym3, [dstq+strideq*1], 1 + mova xm2, [leftq] + lea r2, [dstq+strideq*2] + vinserti32x4 m3, [r2+strideq*0], 2 + mova m5, [base+cdef_perm] + vinserti32x4 m3, [r2+strideq*1], 3 + vpermt2d m2, m5, m3 + vinserti32x4 m1, m2, [topq+strideq*0-4], 0 + vinserti32x4 m1, [topq+strideq*1-4], 1 + mov r3d, edgem + movifnidn prid, prim + punpcklwd m3, m3 ; px + psrlw m5, 8 + vpbroadcastd m0, [base+pd_268435568] + pxor m12, m12 + cmp r3d, 0x0f + jne .mask_edges + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 +.main: + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + vpbroadcastd m15, [base+pri_taps4+priq] + xor prid, prid + add r4d, r3d + cmovns prid, r4d ; pri_shift + mov r4d, dirm + vpbroadcastw m14, prid + mov r5d, secm + vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] + call .constrain + test r5d, r5d + jz .end_no_clip + lzcnt r5d, r5d + vpbroadcastw m13, secm + add r3d, r5d + pminuw m6, m3, m8 + pmaxsw m7, m3, m8 + pminuw m6, m9 + pmaxsw m7, m9 + call .constrain_sec + pminuw m6, m8 + pmaxsw m7, m8 + pminuw m6, m9 + pmaxsw m7, m9 + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain + pminuw m6, m8 + pmaxsw m7, m8 + pminuw m6, m9 + pmaxsw m7, m9 + psrldq m8, m6, 2 + vpshldd m3, m0, 8 + psrldq m9, m7, 2 + paddd m0, m3 + pminuw m6, m8 + psrldq m0, 1 + pmaxsw m7, m9 + pmaxsw m0, m6 + pminsw m0, m7 + vpmovdw ym0, m0 + jmp .end +.sec_only: + tzcnt r5d, secm + mov r3d, dampingm + vpbroadcastw m13, secm + mov r4d, dirm + sub r3d, r5d ; sec_shift + call .constrain_sec + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain +.end_no_clip: + mova ym1, [base+end_perm4] + vpshldd m3, m0, 8 ; (px << 8) + ((sum > -8) << 4) + paddd m0, m3 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + vpermb m0, m1, m0 +.end: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm0, ym0, 1 + movq [r2+strideq*0], xm0 + movhps [r2+strideq*1], xm0 + RET +.mask_edges: + vpbroadcastd m6, [base+pw_m16384] + test r3b, 0x08 + jz .mask_edges_no_bottom ; avoid buffer overread + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 + kmovw k1, [base+edge_mask4-8+r3*2] + jmp .mask_edges_main +.mask_edges_no_bottom: + kmovw k1, [base+edge_mask4+8+r3*2] +.mask_edges_main: + or r3d, 0x04 + vmovdqa32 m1{k1}, m6 ; edge pixels = -16384 + kmovw k1, [base+edge_mask4-8+r3*2] + vmovdqa32 m2{k1}, m6 + jmp .main +.constrain_sec: + vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] + vpbroadcastw m14, r3d + vpbroadcastd m15, [base+sec_taps4] +.constrain: + paddw m8, m5, m9 + vpermi2w m8, m1, m2 ; k0p0 k1p0 + psubw m9, m5, m9 + vpermi2w m9, m1, m2 ; k0p1 k1p1 + CONSTRAIN m10, m8, m3, m12, m13, m14, m11 + vpdpwssd m0, m10, m15 + CONSTRAIN m10, m9, m3, m12, m13, m14, m11 + vpdpwssd m0, m10, m15 + ret + +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 +; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7 +; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7 + +cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge + lea r6, [cdef_dirs4] + movu xm18, [dstq+strideq*0] + vinserti128 ym18, [dstq+strideq*1], 1 + mova xm1, [leftq+16*0] + mova xm2, [leftq+16*1] + lea r2, [strideq*3] + vinserti32x4 m18, [dstq+strideq*2], 2 + mova m5, [base+cdef_perm] + vinserti32x4 m18, [dstq+r2 ], 3 + vpermt2d m1, m5, m18 + vinserti32x4 m0, m1, [topq+strideq*0-4], 0 + vinserti32x4 m0, [topq+strideq*1-4], 1 + lea r3, [dstq+strideq*4] + movu xm19, [r3+strideq*0] + vinserti128 ym19, [r3+strideq*1], 1 + vinserti32x4 m19, [r3+strideq*2], 2 + vinserti32x4 m19, [r3+r2 ], 3 + mov r3d, edgem + movifnidn prid, prim + vpermt2d m2, m5, m19 + vpbroadcastd m16, [base+pd_268435568] + pxor m12, m12 + punpcklwd m18, m18 ; px (top) + psrlw m5, 8 + punpcklwd m19, m19 ; px (bottom) + mova m17, m16 + vshufi32x4 m1, m2, q3210 + cmp r3d, 0x0f + jne .mask_edges + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 +.main: + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + vpbroadcastd m15, [base+pri_taps4+priq] + xor prid, prid + add r4d, r3d + cmovns prid, r4d ; pri_shift + mov r4d, dirm + vpbroadcastw m14, prid + mov r5d, secm + vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] + call .constrain + test r5d, r5d + jz .end_no_clip + lzcnt r5d, r5d + vpbroadcastw m13, secm + add r3d, r5d + pminuw m3, m18, m6 + pmaxsw m4, m18, m6 + pminuw m20, m19, m7 + pmaxsw m21, m19, m7 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + call .constrain_sec + pminuw m3, m6 + pmaxsw m4, m6 + pminuw m20, m7 + pmaxsw m21, m7 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain + pminuw m3, m6 + pmaxsw m4, m6 + mov r3, 0xcccccccccccccccc + pminuw m20, m7 + pmaxsw m21, m7 + kmovq k1, r3 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + vbroadcasti32x4 m0, [base+deint_shuf] + vpshldd m6, m20, m3, 16 + vmovdqu8 m3{k1}, m20 + vpshldd m18, m16, 8 + vpshldd m7, m21, m4, 16 + vmovdqu8 m4{k1}, m21 + vpshldd m19, m17, 8 + pminuw m3, m6 + paddd m16, m18 + pmaxsw m4, m7 + paddd m17, m19 + psrldq m16, 1 + palignr m16{k1}, m17, m17, 15 + lea r6, [dstq+strideq*4] + pmaxsw m16, m3 + pminsw m16, m4 + pshufb m16, m0 + movq [dstq+strideq*0], xm16 + movhps [r6 +strideq*0], xm16 + vextracti128 xm17, ym16, 1 + movq [dstq+strideq*1], xm17 + movhps [r6 +strideq*1], xm17 + vextracti32x4 xm17, m16, 2 + movq [dstq+strideq*2], xm17 + movhps [r6 +strideq*2], xm17 + vextracti32x4 xm16, m16, 3 + movq [dstq+r2 ], xm16 + movhps [r6 +r2 ], xm16 + RET +.sec_only: + mov r4d, dirm + tzcnt r5d, secm + mov r3d, dampingm + vpbroadcastw m13, secm + sub r3d, r5d ; sec_shift + call .constrain_sec + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain +.end_no_clip: + mova ym20, [base+end_perm4] + vpshldd m18, m16, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m19, m17, 8 + paddd m16, m18 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddd m17, m19 + vpermb m16, m20, m16 + vpermb m17, m20, m17 + movq [dstq+strideq*0], xm16 + movhps [dstq+strideq*1], xm16 + vextracti128 xm16, ym16, 1 + movq [dstq+strideq*2], xm16 + movhps [dstq+r2 ], xm16 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm17 + movhps [dstq+strideq*1], xm17 + vextracti128 xm17, ym17, 1 + movq [dstq+strideq*2], xm17 + movhps [dstq+r2 ], xm17 + RET +.mask_edges: + vpbroadcastd m6, [base+pw_m16384] + test r3b, 0x08 + jz .mask_edges_no_bottom ; avoid buffer overread + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 + kmovw k1, [base+edge_mask4-8+r3*2] + jmp .mask_edges_main +.mask_edges_no_bottom: + kmovw k1, [base+edge_mask4+8+r3*2] +.mask_edges_main: + mov r4d, r3d + or r3d, 0x0c + vmovdqa32 m0{k1}, m6 ; edge pixels = -16384 + kmovw k1, [base+edge_mask4-8+r3*2] + or r4d, 0x04 + vmovdqa32 m1{k1}, m6 + kmovw k1, [base+edge_mask4-8+r4*2] + vmovdqa32 m2{k1}, m6 + jmp .main +.constrain_sec: + vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] + vpbroadcastw m14, r3d + vpbroadcastd m15, [base+sec_taps4] +.constrain: + paddw m7, m5, m9 + mova m6, m0 + vpermt2w m6, m7, m1 ; k0p0 k1p0 (top) + psubw m9, m5, m9 + mova m8, m0 + vpermi2w m7, m1, m2 ; k0p0 k1p0 (bottom) + CONSTRAIN m10, m6, m18, m12, m13, m14, m11 + vpermt2w m8, m9, m1 ; k0p1 k1p1 (top) + vpdpwssd m16, m10, m15 + CONSTRAIN m10, m7, m19, m12, m13, m14, m11 + vpermi2w m9, m1, m2 ; k0p1 k1p1 (bottom) + vpdpwssd m17, m10, m15 + CONSTRAIN m10, m8, m18, m12, m13, m14, m11 + vpdpwssd m16, m10, m15 + CONSTRAIN m10, m9, m19, m12, m13, m14, m11 + vpdpwssd m17, m10, m15 + ret + +cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%define base r6-cdef_dirs8 + lea r6, [cdef_dirs8] + movu ym17, [dstq+strideq*0] + vinserti32x8 m17, [dstq+strideq*1], 1 + movq xm4, [leftq+8*0] + movq xm5, [leftq+8*1] + psrld m2, [base+cdef_perm], 16 + movq xm6, [leftq+8*2] + movq xm7, [leftq+8*3] + lea r2, [strideq*3] + movu ym16, [topq+strideq*0-4] + vinserti32x8 m16, [topq+strideq*1-4], 1 + lea r3, [dstq+strideq*4] + movu ym18, [dstq+strideq*2] + vinserti32x8 m18, [dstq+r2 ], 1 + movu ym19, [r3+strideq*0] + vinserti32x8 m19, [r3+strideq*1], 1 + movu ym20, [r3+strideq*2] + vinserti32x8 m20, [r3+r2 ], 1 + vshufi32x4 m0, m17, m18, q2020 ; px (top) + mov r3d, edgem + vshufi32x4 m1, m19, m20, q2020 ; px (bottom) + movifnidn prid, prim + vpermt2d m17, m2, m4 + vpermt2d m18, m2, m5 + pxor m12, m12 + vpermt2d m19, m2, m6 + vpermt2d m20, m2, m7 + cmp r3d, 0x0f + jne .mask_edges + movu ym21, [botq+strideq*0-4] + vinserti32x8 m21, [botq+strideq*1-4], 1 +.main: + mova [rsp+64*0], m16 ; top + mova [rsp+64*1], m17 ; 0 1 + mova [rsp+64*2], m18 ; 2 3 + mova [rsp+64*3], m19 ; 4 5 + mova [rsp+64*4], m20 ; 6 7 + mova [rsp+64*5], m21 ; bottom + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + add r4d, r3d ; pri_shift + vpbroadcastw m14, r4d + mov r4d, dirm + vpbroadcastd m2, [base+pri_taps8+priq*2+0] + vpbroadcastd m3, [base+pri_taps8+priq*2+4] + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1 + pmaxsw m14, m12 + call .constrain + mov r5d, secm + pmullw m16, m8, m2 + pmullw m17, m9, m2 + test r5d, r5d + jnz .pri_sec + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 + call .constrain + pmullw m8, m3 + pmullw m9, m3 + jmp .end_no_clip +.pri_sec: + lzcnt r5d, r5d + add r3d, r5d ; sec_shift + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 + pminuw m18, m0, m4 + pmaxsw m19, m0, m4 + pminuw m20, m1, m5 + pmaxsw m21, m1, m5 + call .min_max_constrain2 + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2 + pmullw m8, m3 + pmullw m9, m3 + vpbroadcastw m13, secm + vpbroadcastw m14, r3d + paddw m16, m8 + paddw m17, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3 + mova m2, m8 + mova m3, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2 + paddw m2, m8 + paddw m3, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3 + paddw m2, m2 + paddw m3, m3 + paddw m16, m8 + paddw m17, m9 + call .min_max_constrain + vpbroadcastd m10, [base+pw_2048] + paddw m16, m2 + paddw m17, m3 + paddw m16, m8 + paddw m17, m9 + psraw m8, m16, 15 + psraw m9, m17, 15 + paddw m16, m8 + paddw m17, m9 + pmulhrsw m16, m10 + pmulhrsw m17, m10 + pminuw m18, m4 + pmaxsw m19, m4 + pminuw m20, m5 + pmaxsw m21, m5 + pminuw m18, m6 + pmaxsw m19, m6 + pminuw m20, m7 + pmaxsw m21, m7 + paddw m16, m0 + paddw m17, m1 + pmaxsw m16, m18 + pmaxsw m17, m20 + pminsw m16, m19 + pminsw m17, m21 + jmp .end +.sec_only: + tzcnt r5d, secm + mov r4d, dirm + mov r3d, dampingm + vpbroadcastw m13, secm + sub r3d, r5d + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] + vpbroadcastw m14, r3d + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] + mova m16, m8 + mova m17, m9 + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] + paddw m16, m8 + paddw m17, m9 + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] + paddw m16, m16 + paddw m17, m17 + paddw m16, m8 + paddw m17, m9 + call .constrain +.end_no_clip: + vpbroadcastd m10, [base+pw_2048] + paddw m16, m8 + paddw m17, m9 + psraw m8, m16, 15 + psraw m9, m17, 15 + paddw m16, m8 + paddw m17, m9 + pmulhrsw m16, m10 + pmulhrsw m17, m10 + paddw m16, m0 + paddw m17, m1 +.end: + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm17 + vextracti128 [dstq+strideq*1], ym17, 1 + vextracti32x4 [dstq+strideq*2], m17, 2 + vextracti32x4 [dstq+r2 ], m17, 3 + RET +.mask_edges: + vpbroadcastd m2, [base+pw_m16384] + test r3b, 0x08 + jz .mask_edges_no_bottom ; avoid buffer overread + movu ym21, [botq+strideq*0-4] + vinserti32x8 m21, [botq+strideq*1-4], 1 + jmp .mask_edges_top +.mask_edges_no_bottom: + mova m21, m2 +.mask_edges_top: + test r3b, 0x04 + jnz .mask_edges_main + mova m16, m2 +.mask_edges_main: + and r3d, 0x03 + cmp r3d, 0x03 + je .main + kmovw k1, [base+edge_mask8+r3*2] + vmovdqa32 m16{k1}, m2 ; edge pixels = -16384 + vmovdqa32 m17{k1}, m2 + vmovdqa32 m18{k1}, m2 + vmovdqa32 m19{k1}, m2 + vmovdqa32 m20{k1}, m2 + vmovdqa32 m21{k1}, m2 + jmp .main +ALIGN function_align +.min_max_constrain: + pminuw m18, m4 + pmaxsw m19, m4 + pminuw m20, m5 + pmaxsw m21, m5 +.min_max_constrain2: + pminuw m18, m6 + pmaxsw m19, m6 + pminuw m20, m7 + pmaxsw m21, m7 +.constrain: + %define tmp rsp+gprsize+68 + movu m4, [tmp+r5+64*0] + vshufi32x4 m4, [tmp+r5+64*1], q2020 ; k0p0 (top) + movu m5, [tmp+r5+64*2] + vshufi32x4 m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom) + neg r5 + movu m6, [tmp+r5+64*0] + vshufi32x4 m6, [tmp+r5+64*1], q2020 ; k0p1 (top) + movu m7, [tmp+r5+64*2] + vshufi32x4 m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom) + CONSTRAIN m8, m4, m0, m12, m13, m14, m15 + CONSTRAIN m9, m5, m1, m12, m13, m14, m15 + CONSTRAIN m10, m6, m0, m12, m13, m14, m15 + CONSTRAIN m11, m7, m1, m12, m13, m14, m15 + paddw m8, m10 + paddw m9, m11 + ret + +%endif ; ARCH_X86_64 |