; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; Byte-permute index tables (loaded into m11/m12 by the chroma functions).
; They feed vpermi2b/vpermt2b to gather the even resp. odd bytes of two
; 64-byte luma vectors, which are then averaged with pavgb.
pb_even: db   0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
         db  32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
         db  64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
         db  96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
pb_odd:  db   1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
         db  33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
         db  65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
         db  97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
; pshufb control: pairs byte i+8 with byte i inside every 16-byte lane, so a
; single shuffled scaling vector can be multiplied (pmaddubsw) against both the
; "low half" and the "high half" grain vectors (see .add_noise).
interleave_hl:  db  8,  0,  9,  1, 10,  2, 11,  3, 12,  4, 13,  5, 14,  6, 15,  7
; pmaddubsw weights applied to { left, cur } grain byte pairs: two blended
; pixels followed by two (0, 32) pairs; with the pw_1024 pmulhrsw rounding,
; (0, 32) returns cur unchanged.
pb_27_17_17_27: db 27, 17, 17, 27,  0, 32,  0, 32
pb_23_22_0_32:  db 23, 22,  0, 32,  0, 32,  0, 32
pb_27_17:       times 2 db 27, 17
pb_23_22:       times 2 db 23, 22
pw_8:           times 2 dw 8
; pmulhrsw by 1024 computes (x*1024 + 0x4000) >> 15 == (x + 16) >> 5
pw_1024:        times 2 dw 1024
pb_17_27:       times 2 db 17, 27
; Upper/lower clip bounds, selected by clip_to_restricted_range (and, for the
; chroma functions, by is_id); see the index computations in the prologues.
fg_max:         times 4 db 255
                times 4 db 240
                times 4 db 235
fg_min:         times 4 db 0
                times 4 db 16
; pmulhrsw multipliers indexed by scaling_shift*4-32, i.e. the first entry is
; used for scaling_shift == 8 (NOTE(review): the valid range of scaling_shift
; is not visible here -- presumably 8..11, confirm against the caller).
noise_rnd:      times 2 dw 128
                times 2 dw 64
                times 2 dw 32
                times 2 dw 16

SECTION .text

;------------------------------------------------------------------------------
; fgy_32x32xn_8bpc: add film grain noise to 8-bit luma.
;
; Works on columns that are 32 pixels wide (wq advances by 32) and on two rows
; per .add_noise call (hb decreases by 2). For every pixel:
;   dst = clip(src + round(scaling[src] * grain), fg_min, fg_max)
; where the rounding multiplier comes from noise_rnd[scaling_shift].
;
; Register roles after the prologue:
;   m0-m3   256-byte scaling table (4 x 64 bytes)
;   m4      interleave_hl shuffle       m5   zero
;   m6      noise rounding multiplier   m7   fg_min   m8   fg_max
;   m9      pw_1024                     m10  horizontal overlap weights
;   m12     vertical overlap weights    k1   h_overlap mask
;   r11     base pointer for constant addressing (see %define base)
;
; The grain_lut row stride is 82 bytes (see "add grain_lutq, 82*2").
;------------------------------------------------------------------------------
INIT_ZMM avx512icl
cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, see, overlap
%define base r11-fg_min
    lea             r11, [fg_min]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov            sbyd, sbym
    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    mov             r12, 0x0000000f0000000f ; h_overlap mask
    mova             m0, [scalingq+64*0]
    mova             m1, [scalingq+64*1]
    mova             m2, [scalingq+64*2]
    mova             m3, [scalingq+64*3]
    kmovq            k1, r12
    vbroadcasti32x4  m4, [base+interleave_hl]
    vpbroadcastd   ym16, [base+pb_27_17]
    vpbroadcastd    m12, [base+pb_17_27]
    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
    test           sbyd, sbyd
    setnz           r6b                    ; r6b = (sby != 0)
    vpbroadcastd     m7, [base+fg_min+r7*4]
    vpbroadcastd     m8, [base+fg_max+r7*8] ; r7*8 skips the 240 entry: 255 or 235
    pxor             m5, m5
    vpbroadcastd     m9, [base+pw_1024]
    vpbroadcastq    m10, [base+pb_27_17_17_27]
    ; as a qword mask, k1 bits 0-3 select the low 256 bits: the first row gets
    ; the (27, 17) weights, the second row keeps (17, 27)
    vmovdqa64   m12{k1}, m16
    test            r6b, overlapb
    jnz .v_overlap                         ; sby != 0 and overlap enabled

    ; initial 16-bit seed derived from sby and FGData.seed
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    lea        src_bakq, [srcq+wq]         ; end-of-row pointer
    neg              wq                    ; w counts up from -w towards 0
    sub            dstq, srcq              ; dst is kept as an offset from src
.loop_x:
    ; LFSR step: after the OR, the parity flag of (seeb & seeh) depends only on
    ; seed bits 0, 1, 3 and 12; the new top bit (0x8000) is set on odd parity
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx

    ; offx's register now carries offxy; offy's register is reused for grain_lut
    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y:
    ; two grain rows of 32 bytes: low half = row 0, high half = row 1
    movu           ym21, [grain_lutq+offxyq-82]
    vinserti32x8    m21, [grain_lutq+offxyq+ 0], 1
    call .add_noise
    sub              hb, 2
    jg .loop_y
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]
    test       overlapd, overlapd
    jz .loop_x
    test           sbyd, sbyd
    jnz .hv_overlap

    ; horizontal overlap only: blend the left edge of this column with grain
    ; taken 32 bytes to the right of the previous column's offset
.loop_x_h_overlap:
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy

    rorx          offyd, seed, 8
    mov     left_offxyd, offxd             ; previous column's offy*stride
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y_h_overlap:
    movu           ym20, [grain_lutq+offxyq-82]
    vinserti32x8    m20, [grain_lutq+offxyq+ 0], 1
    ; -50 == -82+32 and +32 == 0+32: same two rows, 32 bytes further right
    movd           xm19, [grain_lutq+left_offxyq-50]
    vinserti32x4    m19, [grain_lutq+left_offxyq+32], 2
    punpcklbw       m19, m20               ; { left, cur } byte pairs
    pmaddubsw       m19, m10, m19
    pmulhrsw        m19, m9
    punpckhbw       m21, m20, m5           ; high-half grain, as .add_noise would
    packsswb    m20{k1}, m19, m19          ; replace the first 4 bytes of each row
    punpcklbw       m20, m5, m20
    call .add_noise_h
    sub              hb, 2
    jg .loop_y_h_overlap
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]
    test           sbyd, sbyd
    jnz .hv_overlap
    jmp .loop_x_h_overlap

    ; vertical overlap: two seeds are tracked at once, packed as
    ; (cur_seed << 16) | top_seed
.v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
                h, sby, see, overlap

    movzx           r6d, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, r6d, 173 * 0x00010001
    imul            r6d, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             r6d, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             r6d, 0xff00ff00
    xor            seed, r7d
    xor            seed, r6d               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    lea        src_bakq, [srcq+wq]
    neg              wq
    sub            dstq, srcq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                    ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                    ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1            ; updated (cur_seed << 16) | top_seed
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap, top_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw            ; split the packed offsets
    shr          offxyd, 16
    movu           ym19, [grain_lutq+offxyq-82]
    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
    movu           ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
    punpckhbw       m20, m21, m19          ; { top, cur } byte pairs
    punpcklbw       m21, m19
    call .add_noise_v
    ; only the first two rows are blended; the rest use the plain loop
    sub              hb, 2
    jg .loop_y
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to .v_overlap, and instead always fall-through to h+v overlap
.hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                    ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                    ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    ; remember the previous column's offsets before computing the new ones
    mov  topleft_offxyd, top_offxyd
    rorx          offyd, seed, 8
    mov     left_offxyd, offxd
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
    movu           ym19, [grain_lutq+offxyq-82]
    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
    movd           xm16, [grain_lutq+left_offxyq-50]
    vinserti32x4    m16, [grain_lutq+left_offxyq+32], 2
    movu           ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
    movd           xm17, [grain_lutq+topleft_offxyq-50]
    vinserti32x4    m17, [grain_lutq+topleft_offxyq+32], 2
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m16, m19
    pmaddubsw       m16, m10, m16
    punpcklbw       m17, m21
    pmaddubsw       m17, m10, m17
    punpckhbw       m20, m21, m19          ; high halves need no h blending
    pmulhrsw        m16, m9
    pmulhrsw        m17, m9
    packsswb    m19{k1}, m16, m16
    packsswb    m21{k1}, m17, m17
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw       m21, m19
    call .add_noise_v
    sub              hb, 2
    jg .loop_y_h_overlap
    add              wq, 32
    lea            srcq, [src_bakq+wq]     ; lea keeps the flags from the add
    jl .hv_overlap
.end:
    RET
ALIGN function_align
; Local helpers. Entry points, from most to least preprocessing:
;   .add_noise_v  m20/m21 = { top, cur } byte pairs (high/low halves):
;                 apply the vertical overlap weights in m12, then fall through
;   .add_noise    m21 = grain bytes: expand into m20 (grain in the odd bytes,
;                 low halves) and m21 (grain in the even bytes, high halves)
;   .add_noise_h  m20/m21 already expanded as above
; Processes two rows at srcq, writes them to dstq+srcq, and advances srcq by
; two rows and grain_lutq by 82*2. Clobbers m16-m21 and k2.
.add_noise_v:
    pmaddubsw       m20, m12, m20
    pmaddubsw       m21, m12, m21
    pmulhrsw        m20, m9
    pmulhrsw        m21, m9
    packsswb        m21, m20
.add_noise:
    punpcklbw       m20, m5, m21
    punpckhbw       m21, m5
.add_noise_h:
    mova           ym18, [srcq+strideq*0]
    vinserti32x8    m18, [srcq+strideq*1], 1
    mova            m19, m0
    punpcklbw       m16, m18, m5           ; src pixels widened to words
    vpermt2b        m19, m18, m1 ; scaling[ 0..127]
    vpmovb2m         k2, m18               ; k2 = top bit of each src byte
    punpckhbw       m17, m18, m5
    vpermi2b        m18, m2, m3 ; scaling[128..255]
    vmovdqu8    m19{k2}, m18 ; scaling[src]
    pshufb          m19, m4                ; pair scaling[i+8] with scaling[i]
    ; the zero bytes in m20/m21 cancel one product of every pair, leaving
    ; scaling * grain for the low (m18) resp. high (m19) 8 pixels of each lane
    pmaddubsw       m18, m19, m20
    pmaddubsw       m19, m21
    add      grain_lutq, 82*2
    pmulhrsw        m18, m6 ; noise
    pmulhrsw        m19, m6
    paddw           m16, m18
    paddw           m17, m19
    packuswb        m16, m17
    pmaxub          m16, m7
    pminub          m16, m8
    mova    [dstq+srcq], ym16
    add            srcq, strideq
    vextracti32x8 [dstq+srcq], m16, 1
    add            srcq, strideq
    ret

;------------------------------------------------------------------------------
; FGUV_FN name, ss_hor, ss_ver
; Emits fguv_32x32xn_i<name>_8bpc, which adds film grain noise to 8-bit chroma.
; ss_hor / ss_ver are 1 when chroma is subsampled in that direction.
;
; With ss_hor the function handles 16 chroma pixels by 4 rows per call of
; %%add_noise, otherwise 32 pixels by 2 rows. The scaling lookup uses the
; (horizontally averaged, when ss_hor) luma value, which is additionally
; combined with the chroma value via uv_luma_mult/uv_mult/uv_offset unless
; FGData.chroma_scaling_from_luma is set (the .csfl path).
;
; Register roles match fgy_32x32xn_8bpc, except that m11/m12 hold
; pb_even/pb_odd, m13 holds the vertical overlap weights, m14/m15 hold the uv
; multipliers/offset, and k3 is the vertical overlap byte mask.
;------------------------------------------------------------------------------
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
                                             scaling, grain_lut, h, sby, luma, \
                                             overlap, uv_pl, is_id, _, stride3
    lea             r11, [fg_min]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r9d, is_idm
    mov            sbyd, sbym
    mov        overlapd, [fg_dataq+FGData.overlap_flag]
%if %2
    mov             r12, 0x000f000f000f000f ; h_overlap mask
    vpbroadcastq    m10, [base+pb_23_22_0_32]
    lea        stride3q, [strideq*3]
%else
    mov             r12, 0x0000000f0000000f
    vpbroadcastq    m10, [base+pb_27_17_17_27]
%endif
    mova             m0, [scalingq+64*0]
    mova             m1, [scalingq+64*1]
    mova             m2, [scalingq+64*2]
    mova             m3, [scalingq+64*3]
    kmovq            k1, r12
    vbroadcasti32x4  m4, [base+interleave_hl]
    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
    vpbroadcastd     m7, [base+fg_min+r7*4]
    shlx            r7d, r7d, r9d          ; clip << is_id: fg_max entry 0, 1 or 2
    vpbroadcastd     m8, [base+fg_max+r7*4]
    test           sbyd, sbyd
    setnz           r7b                    ; r7b = (sby != 0)
    vpbroadcastd     m9, [base+pw_1024]
    mova            m11, [base+pb_even]
    mova            m12, [base+pb_odd]
    pxor             m5, m5
    mov              r5, r10mp ; lstride
    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

; Loop body shared by both variants; it is expanded twice below, first with
; not-csfl=1 and then (after the .csfl label) with not-csfl=0.
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
                h, sby, see, overlap, uv_pl, _, _, stride3
%if %1
    mov             r6d, uv_plm
    vpbroadcastd    m16, [base+pw_8]
    vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4]
    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
    ; pw_8 as a byte shuffle picks bytes 8 and 0 of the 16-byte load
    pshufb          m14, m16 ; uv_luma_mult, uv_mult
%endif
    test            r7b, overlapb
    jnz %%v_overlap

    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, overlap, _, _, _, stride3

    mov           lumaq, r9mp
    ; end-of-row pointers; src/dst ends are spilled to the r11m/r12m stack
    ; slots (the uv_pl/is_id arguments, which have been read by now) and the
    ; luma end stays in r13. From here on r11 no longer holds `base`.
    lea             r11, [srcq+wq]
    lea             r12, [dstq+wq]
    lea             r13, [lumaq+wq*(1+%2)]
    mov           r11mp, r11
    mov           r12mp, r12
    neg              wq

%%loop_x:
    ; same LFSR step as in fgy_32x32xn_8bpc
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    ; offy's register now carries offxy; offx's register is reused for h
    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, overlap, _, _, _, stride3

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
%%loop_y:
%if %2
    ; four grain rows of 16 bytes
    movu           xm21, [grain_lutq+offxyq+82*0]
    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
%else
    ; two grain rows of 32 bytes
    movu           ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
%endif
    call %%add_noise
    sub              hb, 2<<%2
    jg %%loop_y
    add              wq, 32>>%2
    jge .end
    ; rewind to the top of the next column
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r13+wq*(1<<%2)]
    add            srcq, wq
    add            dstq, wq
    test       overlapd, overlapd
    jz %%loop_x
    ; the sby register has been reused (offxy), so test the stack copy
    cmp       dword r8m, 0 ; sby
    jne %%hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d               ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, left_offxy, _, _, _, stride3

    lea     left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, left_offxy, _, _, _, stride3

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
%%loop_y_h_overlap:
%if %2
    movu           xm20, [grain_lutq+offxyq +82*0]
    movd           xm19, [grain_lutq+left_offxyq+82*0]
    vinserti32x4   ym20, [grain_lutq+offxyq +82*1], 1
    vinserti32x4   ym19, [grain_lutq+left_offxyq+82*1], 1
    vinserti32x4    m20, [grain_lutq+offxyq +82*2], 2
    vinserti32x4    m19, [grain_lutq+left_offxyq+82*2], 2
    vinserti32x4    m20, [grain_lutq+offxyq +82*3], 3
    vinserti32x4    m19, [grain_lutq+left_offxyq+82*3], 3
%else
    movu           ym20, [grain_lutq+offxyq + 0]
    movd           xm19, [grain_lutq+left_offxyq+ 0]
    vinserti32x8    m20, [grain_lutq+offxyq +82], 1
    vinserti32x4    m19, [grain_lutq+left_offxyq+82], 2
%endif
    punpcklbw       m19, m20               ; { left, cur } byte pairs
    pmaddubsw       m19, m10, m19
    punpckhbw       m21, m20, m5
    pmulhrsw        m19, m9
    vpacksswb   m20{k1}, m19, m19          ; replace the first 4 bytes of each row
    punpcklbw       m20, m5, m20
    call %%add_noise_h
    sub              hb, 2<<%2
    jg %%loop_y_h_overlap
    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r13+wq*(1<<%2)]
    add            srcq, wq
    add            dstq, wq
    cmp       dword r8m, 0 ; sby
    jne %%hv_overlap
    jmp %%loop_x_h_overlap

    ; vertical overlap: seeds are packed as (cur_seed << 16) | top_seed
%%v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
                _, sby, see, overlap, _, _, _, stride3

    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd ; (cur_seed << 16) | top_seed

    ; m13 = vertical overlap weights, k3 = bytes of m21 that get blended
%if %3
    vpbroadcastd    m13, [base+pb_23_22]
    kxnorw           k3, k3, k3 ; v_overlap mask
%elif %2
    vbroadcasti32x8 m13, [base+pb_27_17]
    kxnord           k3, k3, k3
    pshufd          m13, m13, q0000 ; 8x27_17, 8x17_27
%else
    vpbroadcastd   ym16, [base+pb_27_17]
    vpbroadcastd    m13, [base+pb_17_27]
    vmovdqa64   m13{k1}, m16
%endif

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, overlap, top_offxy, _, _, stride3

    mov           lumaq, r9mp
    lea             r11, [srcq+wq]
    lea             r12, [dstq+wq]
    lea             r13, [lumaq+wq*(1<<%2)]
    mov           r11mp, r11
    mov           r12mp, r12
    neg              wq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                    ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                    ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1            ; updated (cur_seed << 16) | top_seed
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0x000f000f
    and           offxd, 0x000f000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, overlap, top_offxy, _, _, stride3

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
%if %3
    movu           xm18, [grain_lutq+offxyq+82*0]
    movu           xm20, [grain_lutq+top_offxyq+82*0]
    ; only interpolate first line, insert remaining line unmodified
    vbroadcasti128 ym21, [grain_lutq+offxyq+82*1]
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    punpcklbw      xm19, xm20, xm18
    punpckhbw      xm20, xm18
%elif %2
    ; interpolate the first two lines, insert lines 2 and 3 unmodified
    movu           xm18, [grain_lutq+offxyq+82*0]
    vinserti128    ym18, [grain_lutq+offxyq+82*1], 1
    movu           xm20, [grain_lutq+top_offxyq+82*0]
    vinserti32x4   ym20, [grain_lutq+top_offxyq+82*1], 1
    vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    punpcklbw      ym19, ym20, ym18
    punpckhbw      ym20, ym18
%else
    ; m21 = cur, m20 = top; %%add_noise_v interleaves them itself
    movu           ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
    movu           ym20, [grain_lutq+top_offxyq+82*0]
    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
%endif
    call %%add_noise_v
    sub              hb, 2<<%2
    jg %%loop_y
    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r13+wq*(1<<%2)]
    add            srcq, wq
    add            dstq, wq

%%hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                    ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                    ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1            ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3

    ; previous column's offsets, moved right by one column width
    lea  topleft_offxyd, [top_offxyq+(32>>%2)]
    lea     left_offxyd, [offyq+(32>>%2)]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0x000f000f
    and           offxd, 0x000f000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
%if %2
    movu           xm21, [grain_lutq+offxyq+82*0]
    movd           xm16, [grain_lutq+left_offxyq+82*0]
    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
    vinserti128    ym16, [grain_lutq+left_offxyq+82*1], 1
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*3], 3
    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
    movu           xm20, [grain_lutq+top_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m16, m21
%if %3
    punpcklbw      xm18, xm20
%else
    vinserti128    ym18, [grain_lutq+topleft_offxyq+82*1], 1
    vinserti128    ym20, [grain_lutq+top_offxyq+82*1], 1
    punpcklbw      ym18, ym20
%endif
    ; low qword of each lane = { left, cur }, high qword = { topleft, top }
    punpcklqdq      m16, m18
    pmaddubsw       m16, m10, m16
    pmulhrsw        m16, m9
    packsswb        m16, m16
    vmovdqu8    m21{k1}, m16               ; blended cur
%if %3
    vpalignr   xm20{k1}, xm16, xm16, 4     ; blended top
    punpcklbw      xm19, xm20, xm21
    punpckhbw      xm20, xm21
%else
    vpalignr   ym20{k1}, ym16, ym16, 4     ; blended top
    punpcklbw      ym19, ym20, ym21
    punpckhbw      ym20, ym21
%endif
%else
    movu           ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
    movd           xm16, [grain_lutq+left_offxyq+82*0]
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*1], 2
    movu           ym20, [grain_lutq+top_offxyq+82*0]
    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
    vinserti32x4    m18, [grain_lutq+topleft_offxyq+82*1], 2
    punpcklbw       m16, m21
    punpcklbw       m18, m20
    punpcklqdq      m16, m18
    pmaddubsw       m16, m10, m16
    pmulhrsw        m16, m9
    packsswb        m16, m16
    vpalignr    m20{k1}, m16, m16, 4       ; blended top
    vmovdqu8    m21{k1}, m16               ; blended cur
%endif
    call %%add_noise_v
    sub              hb, 2<<%2
    jg %%loop_y_h_overlap
    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r13+wq*(1<<%2)]
    add            srcq, wq
    add            dstq, wq
    jmp %%hv_overlap
ALIGN function_align
; Local helpers, same layering as in fgy_32x32xn_8bpc:
;   %%add_noise_v  blend top/cur grain with the m13 weights into m21
;   %%add_noise    expand the grain bytes of m21 into m20/m21
;   %%add_noise_h  m20/m21 already expanded
; Advances grain_lutq, lumaq, srcq and dstq past the rows that were processed.
%%add_noise_v:
%if %3
    pmaddubsw      xm19, xm13, xm19
    pmaddubsw      xm20, xm13, xm20
    pmulhrsw       xm19, xm9
    pmulhrsw       xm20, xm9
    vpacksswb   m21{k3}, m19, m20          ; only the first line is replaced
%elif %2
    pmaddubsw      ym19, ym13, ym19
    pmaddubsw      ym20, ym13, ym20
    pmulhrsw       ym19, ym9
    pmulhrsw       ym20, ym9
    vpacksswb   m21{k3}, m19, m20          ; only the first two lines are replaced
%else
    punpcklbw       m19, m20, m21          ; { top, cur } byte pairs
    punpckhbw       m20, m21
    pmaddubsw       m19, m13, m19
    pmaddubsw       m20, m13, m20
    pmulhrsw        m19, m9
    pmulhrsw        m20, m9
    packsswb        m21, m19, m20
%endif
%%add_noise:
    punpcklbw       m20, m5, m21
    punpckhbw       m21, m5
%%add_noise_h:
    mova           ym18, [lumaq+lstrideq*(0<<%3)]
    vinserti32x8    m18, [lumaq+lstrideq*(1<<%3)], 1
%if %2
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
    mova           ym16, [lumaq+lstrideq*(0<<%3)]
    vinserti32x8    m16, [lumaq+lstrideq*(1<<%3)], 1
    mova           xm17, [srcq+strideq*0]
    mova            m19, m11
    vpermi2b        m19, m18, m16          ; even luma bytes
    vinserti128    ym17, [srcq+strideq*1], 1
    vpermt2b        m18, m12, m16          ; odd luma bytes
    vinserti32x4    m17, [srcq+strideq*2], 2
    pavgb           m18, m19               ; average horizontal luma pairs
    vinserti32x4    m17, [srcq+stride3q ], 3
%else
    mova           ym17, [srcq+strideq*0]
    vinserti32x8    m17, [srcq+strideq*1], 1
%endif
%if %1
    ; not csfl: scaling index = clip8(((luma, chroma) . m14 >> 6) + m15)
    punpckhbw       m19, m18, m17
    punpcklbw       m18, m17 ; { luma, chroma }
    pmaddubsw       m19, m14
    pmaddubsw       m18, m14
    psraw           m19, 6
    psraw           m18, 6
    paddw           m19, m15
    paddw           m18, m15
    packuswb        m18, m19
    ; shared tail; the csfl expansion jumps here with m18 = luma
.add_noise_main:
    mova            m19, m0
    vpermt2b        m19, m18, m1 ; scaling[ 0..127]
    vpmovb2m         k2, m18
    vpermi2b        m18, m2, m3 ; scaling[128..255]
    vmovdqu8    m19{k2}, m18 ; scaling[src]
    pshufb          m19, m4
    pmaddubsw       m18, m19, m20
    pmaddubsw       m19, m21
    add      grain_lutq, 82*2<<%2
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
    lea            srcq, [srcq+strideq*(2<<%2)]
    pmulhrsw        m18, m6 ; noise
    pmulhrsw        m19, m6
    punpcklbw       m16, m17, m5 ; chroma
    punpckhbw       m17, m5
    paddw           m16, m18
    paddw           m17, m19
    packuswb        m16, m17
    pmaxub          m16, m7
    pminub          m16, m8
%if %2
    mova          [dstq+strideq*0], xm16
    vextracti128  [dstq+strideq*1], ym16, 1
    vextracti32x4 [dstq+strideq*2], m16, 2
    vextracti32x4 [dstq+stride3q ], m16, 3
%else
    mova          [dstq+strideq*0], ym16
    vextracti32x8 [dstq+strideq*1], m16, 1
%endif
    lea            dstq, [dstq+strideq*(2<<%2)]
    ret
%else
    jmp .add_noise_main
%endif
%endmacro

    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0

%endif ; ARCH_X86_64