https://gitlab.freedesktop.org/pixman/pixman/-/merge_requests/71

diff --git a/gfx/cairo/libpixman/src/pixman-arm-asm.h b/gfx/cairo/libpixman/src/pixman-arm-asm.h
--- a/gfx/cairo/libpixman/src/pixman-arm-asm.h
+++ b/gfx/cairo/libpixman/src/pixman-arm-asm.h
@@ -21,17 +21,33 @@
  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  * SOFTWARE.
  *
  * Author: Jeff Muizelaar (jeff@infidigm.net)
  *
  */
 /* Supplementary macro for setting function attributes */
-.macro pixman_asm_function fname
-        .func fname
-        .global fname
+.macro pixman_asm_function_impl fname
+#ifdef ASM_HAVE_FUNC_DIRECTIVE
+        .func \fname
+#endif
+        .global \fname
 #ifdef __ELF__
-        .hidden fname
-        .type fname, %function
+        .hidden \fname
+        .type \fname, %function
 #endif
-fname:
+\fname:
 .endm
+
+.macro pixman_asm_function fname
+#ifdef ASM_LEADING_UNDERSCORE
+        pixman_asm_function_impl _\fname
+#else
+        pixman_asm_function_impl \fname
+#endif
+.endm
+
+.macro pixman_end_asm_function
+#ifdef ASM_HAVE_FUNC_DIRECTIVE
+        .endfunc
+#endif
+.endm
diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
--- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
+++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
@@ -72,219 +72,219 @@
  * format conversion, and interpolation as separate macros which can be used
  * as the basic building blocks for constructing bilinear scanline functions.
  */
 .macro bilinear_load_8888 reg1, reg2, tmp
     asr       WTMP1, X, #16
     add       X, X, UX
     add       TMP1, TOP, TMP1, lsl #2
-    ld1       {&reg1&.2s}, [TMP1], STRIDE
-    ld1       {&reg2&.2s}, [TMP1]
+    ld1       {\()\reg1\().2s}, [TMP1], STRIDE
+    ld1       {\()\reg2\().2s}, [TMP1]
 .endm
 .macro bilinear_load_0565 reg1, reg2, tmp
     asr       WTMP1, X, #16
     add       X, X, UX
     add       TMP1, TOP, TMP1, lsl #1
-    ld1       {&reg2&.s}[0], [TMP1], STRIDE
-    ld1       {&reg2&.s}[1], [TMP1]
-    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+    ld1       {\()\reg2\().s}[0], [TMP1], STRIDE
+    ld1       {\()\reg2\().s}[1], [TMP1]
+    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
 .endm
 .macro bilinear_load_and_vertical_interpolate_two_8888 \
                     acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
-    bilinear_load_8888 reg1, reg2, tmp1
-    umull     &acc1&.8h, &reg1&.8b, v28.8b
-    umlal     &acc1&.8h, &reg2&.8b, v29.8b
-    bilinear_load_8888 reg3, reg4, tmp2
-    umull     &acc2&.8h, &reg3&.8b, v28.8b
-    umlal     &acc2&.8h, &reg4&.8b, v29.8b
+    bilinear_load_8888 \reg1, \reg2, \tmp1
+    umull     \()\acc1\().8h, \()\reg1\().8b, v28.8b
+    umlal     \()\acc1\().8h, \()\reg2\().8b, v29.8b
+    bilinear_load_8888 \reg3, \reg4, \tmp2
+    umull     \()\acc2\().8h, \()\reg3\().8b, v28.8b
+    umlal     \()\acc2\().8h, \()\reg4\().8b, v29.8b
 .endm
 .macro bilinear_load_and_vertical_interpolate_four_8888 \
-                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
     bilinear_load_and_vertical_interpolate_two_8888 \
-                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, xacc2hi
     bilinear_load_and_vertical_interpolate_two_8888 \
-                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+                \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
 .endm
 .macro vzip reg1, reg2
-    zip1      v24.8b, reg1, reg2
-    zip2      reg2, reg1, reg2
-    mov       reg1, v24.8b
+    zip1      v24.8b, \reg1, \reg2
+    zip2      \reg2, \reg1, \reg2
+    mov       \reg1, v24.8b
 .endm
 .macro vuzp reg1, reg2
-    uzp1      v24.8b, reg1, reg2
-    uzp2      reg2, reg1, reg2
-    mov       reg1, v24.8b
+    uzp1      v24.8b, \reg1, \reg2
+    uzp2      \reg2, \reg1, 
\reg2 + mov \reg1, v24.8b .endm .macro bilinear_load_and_vertical_interpolate_two_0565 \ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi asr WTMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #1 asr WTMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #1 - ld1 {&acc2&.s}[0], [TMP1], STRIDE - ld1 {&acc2&.s}[2], [TMP2], STRIDE - ld1 {&acc2&.s}[1], [TMP1] - ld1 {&acc2&.s}[3], [TMP2] - convert_0565_to_x888 acc2, reg3, reg2, reg1 - vzip ®1&.8b, ®3&.8b - vzip ®2&.8b, ®4&.8b - vzip ®3&.8b, ®4&.8b - vzip ®1&.8b, ®2&.8b - umull &acc1&.8h, ®1&.8b, v28.8b - umlal &acc1&.8h, ®2&.8b, v29.8b - umull &acc2&.8h, ®3&.8b, v28.8b - umlal &acc2&.8h, ®4&.8b, v29.8b + ld1 {\()\acc2\().s}[0], [TMP1], STRIDE + ld1 {\()\acc2\().s}[2], [TMP2], STRIDE + ld1 {\()\acc2\().s}[1], [TMP1] + ld1 {\()\acc2\().s}[3], [TMP2] + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 + vzip \()\reg1\().8b, \()\reg3\().8b + vzip \()\reg2\().8b, \()\reg4\().8b + vzip \()\reg3\().8b, \()\reg4\().8b + vzip \()\reg1\().8b, \()\reg2\().8b + umull \()\acc1\().8h, \()\reg1\().8b, v28.8b + umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b + umull \()\acc2\().8h, \()\reg3\().8b, v28.8b + umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b .endm .macro bilinear_load_and_vertical_interpolate_four_0565 \ - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi asr WTMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #1 asr WTMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #1 - ld1 {&xacc2&.s}[0], [TMP1], STRIDE - ld1 {&xacc2&.s}[2], [TMP2], STRIDE - ld1 {&xacc2&.s}[1], [TMP1] - ld1 {&xacc2&.s}[3], [TMP2] - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 + ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE + ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE + ld1 {\()\xacc2\().s}[1], [TMP1] + ld1 {\()\xacc2\().s}[3], [TMP2] + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 asr WTMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #1 asr WTMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #1 - ld1 {&yacc2&.s}[0], [TMP1], STRIDE - vzip &xreg1&.8b, &xreg3&.8b - ld1 {&yacc2&.s}[2], [TMP2], STRIDE - vzip &xreg2&.8b, &xreg4&.8b - ld1 {&yacc2&.s}[1], [TMP1] - vzip &xreg3&.8b, &xreg4&.8b - ld1 {&yacc2&.s}[3], [TMP2] - vzip &xreg1&.8b, &xreg2&.8b - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 - umull &xacc1&.8h, &xreg1&.8b, v28.8b - vzip &yreg1&.8b, &yreg3&.8b - umlal &xacc1&.8h, &xreg2&.8b, v29.8b - vzip &yreg2&.8b, &yreg4&.8b - umull &xacc2&.8h, &xreg3&.8b, v28.8b - vzip &yreg3&.8b, &yreg4&.8b - umlal &xacc2&.8h, &xreg4&.8b, v29.8b - vzip &yreg1&.8b, &yreg2&.8b - umull &yacc1&.8h, &yreg1&.8b, v28.8b - umlal &yacc1&.8h, &yreg2&.8b, v29.8b - umull &yacc2&.8h, &yreg3&.8b, v28.8b - umlal &yacc2&.8h, &yreg4&.8b, v29.8b + ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE + vzip \()\xreg1\().8b, \()\xreg3\().8b + ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE + vzip \()\xreg2\().8b, \()\xreg4\().8b + ld1 {\()\yacc2\().s}[1], [TMP1] + vzip \()\xreg3\().8b, \()\xreg4\().8b + ld1 {\()\yacc2\().s}[3], [TMP2] + vzip \()\xreg1\().8b, \()\xreg2\().8b + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 + umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b + vzip \()\yreg1\().8b, \()\yreg3\().8b + umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b + vzip \()\yreg2\().8b, \()\yreg4\().8b + umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b + vzip \()\yreg3\().8b, \()\yreg4\().8b + umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b + vzip \()\yreg1\().8b, \()\yreg2\().8b + umull \()\yacc1\().8h, 
\()\yreg1\().8b, v28.8b + umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b + umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b + umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b .endm .macro bilinear_store_8888 numpix, tmp1, tmp2 -.if numpix == 4 +.if \numpix == 4 st1 {v0.2s, v1.2s}, [OUT], #16 -.elseif numpix == 2 +.elseif \numpix == 2 st1 {v0.2s}, [OUT], #8 -.elseif numpix == 1 +.elseif \numpix == 1 st1 {v0.s}[0], [OUT], #4 .else - .error bilinear_store_8888 numpix is unsupported + .error bilinear_store_8888 \numpix is unsupported .endif .endm .macro bilinear_store_0565 numpix, tmp1, tmp2 vuzp v0.8b, v1.8b vuzp v2.8b, v3.8b vuzp v1.8b, v3.8b vuzp v0.8b, v2.8b - convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 -.if numpix == 4 + convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2 +.if \numpix == 4 st1 {v1.4h}, [OUT], #8 -.elseif numpix == 2 +.elseif \numpix == 2 st1 {v1.s}[0], [OUT], #4 -.elseif numpix == 1 +.elseif \numpix == 1 st1 {v1.h}[0], [OUT], #2 .else - .error bilinear_store_0565 numpix is unsupported + .error bilinear_store_0565 \numpix is unsupported .endif .endm /* * Macros for loading mask pixels into register 'mask'. * dup must be done in somewhere else. */ .macro bilinear_load_mask_x numpix, mask .endm .macro bilinear_load_mask_8 numpix, mask -.if numpix == 4 - ld1 {&mask&.s}[0], [MASK], #4 -.elseif numpix == 2 - ld1 {&mask&.h}[0], [MASK], #2 -.elseif numpix == 1 - ld1 {&mask&.b}[0], [MASK], #1 +.if \numpix == 4 + ld1 {\()\mask\().s}[0], [MASK], #4 +.elseif \numpix == 2 + ld1 {\()\mask\().h}[0], [MASK], #2 +.elseif \numpix == 1 + ld1 {\()\mask\().b}[0], [MASK], #1 .else - .error bilinear_load_mask_8 numpix is unsupported + .error bilinear_load_mask_8 \numpix is unsupported .endif - prfm PREFETCH_MODE, [MASK, #prefetch_offset] + prfum PREFETCH_MODE, [MASK, #(prefetch_offset)] .endm .macro bilinear_load_mask mask_fmt, numpix, mask - bilinear_load_mask_&mask_fmt numpix, mask + bilinear_load_mask_\mask_fmt \numpix, \mask .endm /* * Macros for loading destination pixels into register 'dst0' and 'dst1'. * Interleave should be done somewhere else. */ .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01 .endm .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01 .endm .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 -.if numpix == 4 - ld1 {&dst0&.2s, &dst1&.2s}, [OUT] -.elseif numpix == 2 - ld1 {&dst0&.2s}, [OUT] -.elseif numpix == 1 - ld1 {&dst0&.s}[0], [OUT] +.if \numpix == 4 + ld1 {\()\dst0\().2s, \()\dst1\().2s}, [OUT] +.elseif \numpix == 2 + ld1 {\()\dst0\().2s}, [OUT] +.elseif \numpix == 1 + ld1 {\()\dst0\().s}[0], [OUT] .else - .error bilinear_load_dst_8888 numpix is unsupported + .error bilinear_load_dst_8888 \numpix is unsupported .endif - mov &dst01&.d[0], &dst0&.d[0] - mov &dst01&.d[1], &dst1&.d[0] + mov \()\dst01\().d[0], \()\dst0\().d[0] + mov \()\dst01\().d[1], \()\dst1\().d[0] prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)] .endm .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01 - bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01 + bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 .endm /* * Macros for duplicating partially loaded mask to fill entire register. 
* We will apply mask to interleaved source pixels, that is * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3) * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3) * So, we need to duplicate loaded mask into whole register. @@ -293,84 +293,85 @@ * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) * We can do some optimizations for this including last pixel cases. */ .macro bilinear_duplicate_mask_x numpix, mask .endm .macro bilinear_duplicate_mask_8 numpix, mask -.if numpix == 4 - dup &mask&.2s, &mask&.s[0] -.elseif numpix == 2 - dup &mask&.4h, &mask&.h[0] -.elseif numpix == 1 - dup &mask&.8b, &mask&.b[0] +.if \numpix == 4 + dup \()\mask\().2s, \()\mask\().s[0] +.elseif \numpix == 2 + dup \()\mask\().4h, \()\mask\().h[0] +.elseif \numpix == 1 + dup \()\mask\().8b, \()\mask\().b[0] .else - .error bilinear_duplicate_mask_8 is unsupported + .error bilinear_duplicate_\mask_8 is unsupported .endif .endm .macro bilinear_duplicate_mask mask_fmt, numpix, mask - bilinear_duplicate_mask_&mask_fmt numpix, mask + bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask .endm /* * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form. * Interleave should be done when maks is enabled or operator is 'over'. */ .macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01 - vuzp &src0&.8b, &src1&.8b - vuzp &dst0&.8b, &dst1&.8b - vuzp &src0&.8b, &src1&.8b - vuzp &dst0&.8b, &dst1&.8b - mov &src01&.d[1], &src1&.d[0] - mov &src01&.d[0], &src0&.d[0] - mov &dst01&.d[1], &dst1&.d[0] - mov &dst01&.d[0], &dst0&.d[0] + vuzp \()\src0\().8b, \()\src1\().8b + vuzp \()\dst0\().8b, \()\dst1\().8b + vuzp \()\src0\().8b, \()\src1\().8b + vuzp \()\dst0\().8b, \()\dst1\().8b + mov \()\src01\().d[1], \()\src1\().d[0] + mov \()\src01\().d[0], \()\src0\().d[0] + mov \()\dst01\().d[1], \()\dst1\().d[0] + mov \()\dst01\().d[0], \()\dst0\().d[0] .endm .macro bilinear_interleave_src_dst_x_src \ numpix, src0, src1, src01, dst0, dst1, dst01 .endm .macro bilinear_interleave_src_dst_x_over \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 .endm .macro bilinear_interleave_src_dst_x_add \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01 + + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 .endm .macro bilinear_interleave_src_dst_8_src \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 .endm .macro bilinear_interleave_src_dst_8_over \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 .endm .macro bilinear_interleave_src_dst_8_add \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 .endm .macro bilinear_interleave_src_dst \ mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave_src_dst_&mask_fmt&_&op \ - numpix, src0, src1, src01, dst0, dst1, dst01 + bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \ + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01 .endm /* * Macros for applying masks to src pixels. 
(see combine_mask_u() function) * src, dst should be in interleaved form. * mask register should be in form (m0, m1, m2, m3). */ @@ -378,191 +379,191 @@ numpix, src0, src1, src01, mask, \ tmp01, tmp23, tmp45, tmp67 .endm .macro bilinear_apply_mask_to_src_8 \ numpix, src0, src1, src01, mask, \ tmp01, tmp23, tmp45, tmp67 - umull &tmp01&.8h, &src0&.8b, &mask&.8b - umull &tmp23&.8h, &src1&.8b, &mask&.8b + umull \()\tmp01\().8h, \()\src0\().8b, \()\mask\().8b + umull \()\tmp23\().8h, \()\src1\().8b, \()\mask\().8b /* bubbles */ - urshr &tmp45&.8h, &tmp01&.8h, #8 - urshr &tmp67&.8h, &tmp23&.8h, #8 + urshr \()\tmp45\().8h, \()\tmp01\().8h, #8 + urshr \()\tmp67\().8h, \()\tmp23\().8h, #8 /* bubbles */ - raddhn &src0&.8b, &tmp45&.8h, &tmp01&.8h - raddhn &src1&.8b, &tmp67&.8h, &tmp23&.8h - mov &src01&.d[0], &src0&.d[0] - mov &src01&.d[1], &src1&.d[0] + raddhn \()\src0\().8b, \()\tmp45\().8h, \()\tmp01\().8h + raddhn \()\src1\().8b, \()\tmp67\().8h, \()\tmp23\().8h + mov \()\src01\().d[0], \()\src0\().d[0] + mov \()\src01\().d[1], \()\src1\().d[0] .endm .macro bilinear_apply_mask_to_src \ mask_fmt, numpix, src0, src1, src01, mask, \ tmp01, tmp23, tmp45, tmp67 - bilinear_apply_mask_to_src_&mask_fmt \ - numpix, src0, src1, src01, mask, \ - tmp01, tmp23, tmp45, tmp67 + bilinear_apply_mask_to_src_\()\mask_fmt \ + \numpix, \src0, \src1, \src01, \mask, \ + \tmp01, \tmp23, \tmp45, \tmp67 .endm /* * Macros for combining src and destination pixels. * Interleave or not is depending on operator 'op'. */ .macro bilinear_combine_src \ numpix, src0, src1, src01, dst0, dst1, dst01, \ tmp01, tmp23, tmp45, tmp67, tmp8 .endm .macro bilinear_combine_over \ numpix, src0, src1, src01, dst0, dst1, dst01, \ tmp01, tmp23, tmp45, tmp67, tmp8 - dup &tmp8&.2s, &src1&.s[1] + dup \()\tmp8\().2s, \()\src1\().s[1] /* bubbles */ - mvn &tmp8&.8b, &tmp8&.8b + mvn \()\tmp8\().8b, \()\tmp8\().8b /* bubbles */ - umull &tmp01&.8h, &dst0&.8b, &tmp8&.8b + umull \()\tmp01\().8h, \()\dst0\().8b, \()\tmp8\().8b /* bubbles */ - umull &tmp23&.8h, &dst1&.8b, &tmp8&.8b + umull \()\tmp23\().8h, \()\dst1\().8b, \()\tmp8\().8b /* bubbles */ - urshr &tmp45&.8h, &tmp01&.8h, #8 - urshr &tmp67&.8h, &tmp23&.8h, #8 + urshr \()\tmp45\().8h, \()\tmp01\().8h, #8 + urshr \()\tmp67\().8h, \()\tmp23\().8h, #8 /* bubbles */ - raddhn &dst0&.8b, &tmp45&.8h, &tmp01&.8h - raddhn &dst1&.8b, &tmp67&.8h, &tmp23&.8h - mov &dst01&.d[0], &dst0&.d[0] - mov &dst01&.d[1], &dst1&.d[0] + raddhn \()\dst0\().8b, \()\tmp45\().8h, \()\tmp01\().8h + raddhn \()\dst1\().8b, \()\tmp67\().8h, \()\tmp23\().8h + mov \()\dst01\().d[0], \()\dst0\().d[0] + mov \()\dst01\().d[1], \()\dst1\().d[0] /* bubbles */ - uqadd &src0&.8b, &dst0&.8b, &src0&.8b - uqadd &src1&.8b, &dst1&.8b, &src1&.8b - mov &src01&.d[0], &src0&.d[0] - mov &src01&.d[1], &src1&.d[0] + uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b + uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b + mov \()\src01\().d[0], \()\src0\().d[0] + mov \()\src01\().d[1], \()\src1\().d[0] .endm .macro bilinear_combine_add \ numpix, src0, src1, src01, dst0, dst1, dst01, \ tmp01, tmp23, tmp45, tmp67, tmp8 - uqadd &src0&.8b, &dst0&.8b, &src0&.8b - uqadd &src1&.8b, &dst1&.8b, &src1&.8b - mov &src01&.d[0], &src0&.d[0] - mov &src01&.d[1], &src1&.d[0] + uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b + uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b + mov \()\src01\().d[0], \()\src0\().d[0] + mov \()\src01\().d[1], \()\src1\().d[0] .endm .macro bilinear_combine \ op, numpix, src0, src1, src01, dst0, dst1, dst01, \ tmp01, tmp23, tmp45, 
tmp67, tmp8 - bilinear_combine_&op \ - numpix, src0, src1, src01, dst0, dst1, dst01, \ - tmp01, tmp23, tmp45, tmp67, tmp8 + bilinear_combine_\()\op \ + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \ + \tmp01, \tmp23, \tmp45, \tmp67, \tmp8 .endm /* * Macros for final deinterleaving of destination pixels if needed. */ .macro bilinear_deinterleave numpix, dst0, dst1, dst01 - vuzp &dst0&.8b, &dst1&.8b + vuzp \()\dst0\().8b, \()\dst1\().8b /* bubbles */ - vuzp &dst0&.8b, &dst1&.8b - mov &dst01&.d[0], &dst0&.d[0] - mov &dst01&.d[1], &dst1&.d[0] + vuzp \()\dst0\().8b, \()\dst1\().8b + mov \()\dst01\().d[0], \()\dst0\().d[0] + mov \()\dst01\().d[1], \()\dst1\().d[0] .endm .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 .endm .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01 - bilinear_deinterleave numpix, dst0, dst1, dst01 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 - bilinear_deinterleave numpix, dst0, dst1, dst01 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 - bilinear_deinterleave numpix, dst0, dst1, dst01 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01 - bilinear_deinterleave numpix, dst0, dst1, dst01 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01 - bilinear_deinterleave numpix, dst0, dst1, dst01 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01 - bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01 + bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op - bilinear_load_&src_fmt v0, v1, v2 - bilinear_load_mask mask_fmt, 1, v4 - bilinear_load_dst dst_fmt, op, 1, v18, v19, v9 + bilinear_load_\()\src_fmt v0, v1, v2 + bilinear_load_mask \mask_fmt, 1, v4 + bilinear_load_dst \dst_fmt, \op, 1, v18, v19, v9 umull v2.8h, v0.8b, v28.8b umlal v2.8h, v1.8b, v29.8b /* 5 cycles bubble */ ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS umlsl v0.4s, v2.4h, v15.h[0] umlal2 v0.4s, v2.8h, v15.h[0] /* 5 cycles bubble */ - bilinear_duplicate_mask mask_fmt, 1, v4 + bilinear_duplicate_mask \mask_fmt, 1, v4 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) /* 3 cycles bubble */ xtn v0.8b, v0.8h /* 1 cycle bubble */ bilinear_interleave_src_dst \ - mask_fmt, op, 1, v0, v1, v0, v18, v19, v9 + \mask_fmt, \op, 1, v0, v1, v0, v18, v19, v9 bilinear_apply_mask_to_src \ - mask_fmt, 1, v0, v1, v0, v4, \ + \mask_fmt, 1, v0, v1, v0, v4, \ v3, v8, v10, v11 bilinear_combine \ - op, 1, v0, v1, v0, v18, v19, v9, \ + \op, 1, v0, v1, v0, v18, v19, v9, \ v3, v8, v10, v11, v5 - bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0 - bilinear_store_&dst_fmt 1, v17, v18 + bilinear_deinterleave_dst \mask_fmt, \op, 1, v0, v1, v0 + bilinear_store_\()\dst_fmt 1, v17, v18 .endm .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op - bilinear_load_and_vertical_interpolate_two_&src_fmt \ + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ v1, v11, v18, v19, v20, v21, v22, v23 - bilinear_load_mask mask_fmt, 2, v4 - bilinear_load_dst dst_fmt, op, 2, v18, v19, v9 + bilinear_load_mask \mask_fmt, 2, v4 + bilinear_load_dst \dst_fmt, \op, 2, v18, v19, v9 ushll v0.4s, v1.4h, 
#BILINEAR_INTERPOLATION_BITS umlsl v0.4s, v1.4h, v15.h[0] umlal2 v0.4s, v1.8h, v15.h[0] ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS umlsl v10.4s, v11.4h, v15.h[4] umlal2 v10.4s, v11.8h, v15.h[4] shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - bilinear_duplicate_mask mask_fmt, 2, v4 + bilinear_duplicate_mask \mask_fmt, 2, v4 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) add v12.8h, v12.8h, v13.8h xtn v0.8b, v0.8h bilinear_interleave_src_dst \ - mask_fmt, op, 2, v0, v1, v0, v18, v19, v9 + \mask_fmt, \op, 2, v0, v1, v0, v18, v19, v9 bilinear_apply_mask_to_src \ - mask_fmt, 2, v0, v1, v0, v4, \ + \mask_fmt, 2, v0, v1, v0, v4, \ v3, v8, v10, v11 bilinear_combine \ - op, 2, v0, v1, v0, v18, v19, v9, \ + \op, 2, v0, v1, v0, v18, v19, v9, \ v3, v8, v10, v11, v5 - bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0 - bilinear_store_&dst_fmt 2, v16, v17 + bilinear_deinterleave_dst \mask_fmt, \op, 2, v0, v1, v0 + bilinear_store_\()\dst_fmt 2, v16, v17 .endm .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op - bilinear_load_and_vertical_interpolate_four_&src_fmt \ - v1, v11, v4, v5, v6, v7, v22, v23 \ + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ + v1, v11, v4, v5, v6, v7, v22, v23, \ v3, v9, v16, v17, v20, v21, v18, v19 prfm PREFETCH_MODE, [TMP1, PF_OFFS] sub TMP1, TMP1, STRIDE prfm PREFETCH_MODE, [TMP1, PF_OFFS] ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS umlsl v0.4s, v1.4h, v15.h[0] umlal2 v0.4s, v1.8h, v15.h[0] ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS @@ -575,33 +576,33 @@ ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS umlsl v8.4s, v9.4h, v15.h[4] umlal2 v8.4s, v9.8h, v15.h[4] add v12.8h, v12.8h, v13.8h shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - bilinear_load_mask mask_fmt, 4, v4 - bilinear_duplicate_mask mask_fmt, 4, v4 + bilinear_load_mask \mask_fmt, 4, v4 + bilinear_duplicate_mask \mask_fmt, 4, v4 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) xtn v0.8b, v0.8h xtn v1.8b, v2.8h add v12.8h, v12.8h, v13.8h - bilinear_load_dst dst_fmt, op, 4, v2, v3, v21 + bilinear_load_dst \dst_fmt, \op, 4, v2, v3, v21 bilinear_interleave_src_dst \ - mask_fmt, op, 4, v0, v1, v0, v2, v3, v11 + \mask_fmt, \op, 4, v0, v1, v0, v2, v3, v11 bilinear_apply_mask_to_src \ - mask_fmt, 4, v0, v1, v0, v4, \ + \mask_fmt, 4, v0, v1, v0, v4, \ v6, v8, v9, v10 bilinear_combine \ - op, 4, v0, v1, v0, v2, v3, v1, \ + \op, 4, v0, v1, v0, v2, v3, v1, \ v6, v8, v9, v10, v23 - bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0 - bilinear_store_&dst_fmt 4, v6, v7 + bilinear_deinterleave_dst \mask_fmt, \op, 4, v0, v1, v0 + bilinear_store_\()\dst_fmt 4, v6, v7 .endm .set BILINEAR_FLAG_USE_MASK, 1 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 /* * Main template macro for generating NEON optimized bilinear scanline functions. 
* @@ -631,24 +632,24 @@ bilinear_process_four_pixels, \ bilinear_process_pixblock_head, \ bilinear_process_pixblock_tail, \ bilinear_process_pixblock_tail_head, \ pixblock_size, \ prefetch_distance, \ flags -pixman_asm_function fname -.if pixblock_size == 8 -.elseif pixblock_size == 4 +pixman_asm_function \fname +.if \pixblock_size == 8 +.elseif \pixblock_size == 4 .else .error unsupported pixblock size .endif -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0 OUT .req x0 TOP .req x1 BOTTOM .req x2 WT .req x3 WWT .req w3 WB .req x4 WWB .req w4 X .req w5 @@ -694,32 +695,32 @@ pixman_asm_function fname PF_OFFS .req x12 TMP3 .req x13 WTMP3 .req w13 TMP4 .req x14 WTMP4 .req w14 STRIDE .req x15 DUMMY .req x30 - .set prefetch_offset, prefetch_distance + .set prefetch_offset, \prefetch_distance stp x29, x30, [sp, -16]! mov x29, sp sub x29, x29, 64 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 stp x10, x11, [x29, -80] stp x12, x13, [x29, -96] stp x14, x15, [x29, -112] str x8, [x29, -120] ldr w8, [x29, 16] sub sp, sp, 120 .endif - mov WTMP1, #prefetch_distance + mov WTMP1, #\prefetch_distance umull PF_OFFS, WTMP1, UX sub STRIDE, BOTTOM, TOP .unreq BOTTOM cmp WIDTH, #0 ble 300f @@ -730,73 +731,73 @@ pixman_asm_function fname mov v25.d[0], v12.d[1] mov v26.d[0], v13.d[0] add v25.4h, v25.4h, v26.4h mov v12.d[1], v25.d[0] /* ensure good destination alignment */ cmp WIDTH, #1 blt 100f - tst OUT, #(1 << dst_bpp_shift) + tst OUT, #(1 << \dst_bpp_shift) beq 100f ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) add v12.8h, v12.8h, v13.8h - bilinear_process_last_pixel + \bilinear_process_last_pixel sub WIDTH, WIDTH, #1 100: add v13.8h, v13.8h, v13.8h ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) add v12.8h, v12.8h, v13.8h cmp WIDTH, #2 blt 100f - tst OUT, #(1 << (dst_bpp_shift + 1)) + tst OUT, #(1 << (\dst_bpp_shift + 1)) beq 100f - bilinear_process_two_pixels + \bilinear_process_two_pixels sub WIDTH, WIDTH, #2 100: -.if pixblock_size == 8 +.if \pixblock_size == 8 cmp WIDTH, #4 blt 100f - tst OUT, #(1 << (dst_bpp_shift + 2)) + tst OUT, #(1 << (\dst_bpp_shift + 2)) beq 100f - bilinear_process_four_pixels + \bilinear_process_four_pixels sub WIDTH, WIDTH, #4 100: .endif - subs WIDTH, WIDTH, #pixblock_size + subs WIDTH, WIDTH, #\pixblock_size blt 100f - asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) - bilinear_process_pixblock_head - subs WIDTH, WIDTH, #pixblock_size + asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) + \bilinear_process_pixblock_head + subs WIDTH, WIDTH, #\pixblock_size blt 500f 0: - bilinear_process_pixblock_tail_head - subs WIDTH, WIDTH, #pixblock_size + \bilinear_process_pixblock_tail_head + subs WIDTH, WIDTH, #\pixblock_size bge 0b 500: - bilinear_process_pixblock_tail + \bilinear_process_pixblock_tail 100: -.if pixblock_size == 8 +.if \pixblock_size == 8 tst WIDTH, #4 beq 200f - bilinear_process_four_pixels + \bilinear_process_four_pixels 200: .endif /* handle the remaining trailing pixels */ tst WIDTH, #2 beq 200f - bilinear_process_two_pixels + \bilinear_process_two_pixels 200: tst WIDTH, #1 beq 300f - bilinear_process_last_pixel + \bilinear_process_last_pixel 300: -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0 sub x29, x29, 64 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 ldp x10, x11, [x29, -80] ldp x12, x13, [x29, -96] ldp x14, x15, [x29, -112] mov sp, x29 ldp x29, x30, [sp], 16 @@ -824,21 +825,21 
@@ 300: .unreq WIDTH .unreq TMP1 .unreq WTMP1 .unreq TMP2 .unreq PF_OFFS .unreq TMP3 .unreq TMP4 .unreq STRIDE -.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0 .unreq MASK .endif -.endfunc +pixman_end_asm_function .endm /* src_8888_8_8888 */ .macro bilinear_src_8888_8_8888_process_last_pixel bilinear_interpolate_last_pixel 8888, 8, 8888, src .endm diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S --- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S +++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S @@ -262,64 +262,64 @@ uqadd v18.8b, v0.8b, v22.8b uqadd v19.8b, v1.8b, v23.8b shrn v6.8b, v4.8h, #8 fetch_src_pixblock shrn v7.8b, v4.8h, #3 sli v4.8h, v4.8h, #5 ushll v14.8h, v17.8b, #7 sli v14.8h, v14.8h, #1 - PF add PF_X, PF_X, #8 + PF add, PF_X, PF_X, #8 ushll v8.8h, v19.8b, #7 sli v8.8h, v8.8h, #1 - PF tst PF_CTL, #0xF + PF tst, PF_CTL, #0xF sri v6.8b, v6.8b, #5 - PF beq 10f - PF add PF_X, PF_X, #8 + PF beq, 10f + PF add, PF_X, PF_X, #8 10: mvn v3.8b, v3.8b - PF beq 10f - PF sub PF_CTL, PF_CTL, #1 + PF beq, 10f + PF sub, PF_CTL, PF_CTL, #1 10: sri v7.8b, v7.8b, #6 shrn v30.8b, v4.8h, #2 umull v10.8h, v3.8b, v6.8b - PF lsl DUMMY, PF_X, #src_bpp_shift - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + PF lsl, DUMMY, PF_X, #src_bpp_shift + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] umull v11.8h, v3.8b, v7.8b umull v12.8h, v3.8b, v30.8b - PF lsl DUMMY, PF_X, #dst_bpp_shift - PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + PF lsl, DUMMY, PF_X, #dst_bpp_shift + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] sri v14.8h, v8.8h, #5 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W ushll v9.8h, v18.8b, #7 sli v9.8h, v9.8h, #1 urshr v17.8h, v10.8h, #8 - PF ble 10f - PF sub PF_X, PF_X, ORIG_W + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W 10: urshr v19.8h, v11.8h, #8 urshr v18.8h, v12.8h, #8 - PF ble 10f - PF subs PF_CTL, PF_CTL, #0x10 + PF ble, 10f + PF subs, PF_CTL, PF_CTL, #0x10 10: sri v14.8h, v9.8h, #11 mov v28.d[0], v14.d[0] mov v29.d[0], v14.d[1] - PF ble 10f - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb DUMMY, [PF_SRC, DUMMY] - PF add PF_SRC, PF_SRC, #1 + PF ble, 10f + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC, DUMMY] + PF add, PF_SRC, PF_SRC, #1 10: raddhn v20.8b, v10.8h, v17.8h raddhn v23.8b, v11.8h, v19.8h - PF ble 10f - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb DUMMY, [PF_DST, DUMMY] - PF add PF_DST, PF_SRC, #1 + PF ble, 10f + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST, DUMMY] + PF add, PF_DST, PF_SRC, #1 10: raddhn v22.8b, v12.8h, v18.8h st1 {v14.8h}, [DST_W], #16 .endm #else /* If we did not care much about the performance, we would just use this... 
*/ @@ -469,42 +469,42 @@ generate_composite_function \ sri v14.8h, v8.8h, #5 sri v14.8h, v9.8h, #11 mov v28.d[0], v14.d[0] mov v29.d[0], v14.d[1] .endm .macro pixman_composite_src_8888_0565_process_pixblock_tail_head sri v14.8h, v8.8h, #5 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF fetch_src_pixblock - PF beq 10f - PF add PF_X, PF_X, #8 - PF sub PF_CTL, PF_CTL, #1 + PF beq, 10f + PF add, PF_X, PF_X, #8 + PF sub, PF_CTL, PF_CTL, #1 10: sri v14.8h, v9.8h, #11 mov v28.d[0], v14.d[0] mov v29.d[0], v14.d[1] - PF cmp PF_X, ORIG_W - PF lsl DUMMY, PF_X, #src_bpp_shift - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + PF cmp, PF_X, ORIG_W + PF lsl, DUMMY, PF_X, #src_bpp_shift + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] ushll v8.8h, v1.8b, #7 sli v8.8h, v8.8h, #1 st1 {v14.8h}, [DST_W], #16 - PF ble 10f - PF sub PF_X, PF_X, ORIG_W - PF subs PF_CTL, PF_CTL, #0x10 + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W + PF subs, PF_CTL, PF_CTL, #0x10 10: ushll v14.8h, v2.8b, #7 sli v14.8h, v14.8h, #1 - PF ble 10f - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb DUMMY, [PF_SRC, DUMMY] - PF add PF_SRC, PF_SRC, #1 + PF ble, 10f + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC, DUMMY] + PF add, PF_SRC, PF_SRC, #1 10: ushll v9.8h, v0.8b, #7 sli v9.8h, v9.8h, #1 .endm generate_composite_function \ pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ @@ -561,41 +561,41 @@ generate_composite_function \ uqadd v31.8b, v3.8b, v7.8b .endm .macro pixman_composite_add_8_8_process_pixblock_tail .endm .macro pixman_composite_add_8_8_process_pixblock_tail_head fetch_src_pixblock - PF add PF_X, PF_X, #32 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #32 + PF tst, PF_CTL, #0xF ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 - PF beq 10f - PF add PF_X, PF_X, #32 - PF sub PF_CTL, PF_CTL, #1 + PF beq, 10f + PF add, PF_X, PF_X, #32 + PF sub, PF_CTL, PF_CTL, #1 10: st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 - PF cmp PF_X, ORIG_W - PF lsl DUMMY, PF_X, #src_bpp_shift - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] - PF lsl DUMMY, PF_X, #dst_bpp_shift - PF prfm PREFETCH_MODE, [PF_DST, DUMMY] - PF ble 10f - PF sub PF_X, PF_X, ORIG_W - PF subs PF_CTL, PF_CTL, #0x10 + PF cmp, PF_X, ORIG_W + PF lsl, DUMMY, PF_X, #src_bpp_shift + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + PF lsl, DUMMY, PF_X, #dst_bpp_shift + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W + PF subs, PF_CTL, PF_CTL, #0x10 10: uqadd v28.8b, v0.8b, v4.8b - PF ble 10f - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb DUMMY, [PF_SRC, DUMMY] - PF add PF_SRC, PF_SRC, #1 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb DUMMY, [PF_DST, DUMMY] - PF add PF_DST, PF_DST, #1 + PF ble, 10f + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC, DUMMY] + PF add, PF_SRC, PF_SRC, #1 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST, DUMMY] + PF add, PF_DST, PF_DST, #1 10: uqadd v29.8b, v1.8b, v5.8b uqadd v30.8b, v2.8b, v6.8b uqadd v31.8b, v3.8b, v7.8b .endm generate_composite_function \ pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ @@ -607,41 +607,41 @@ generate_composite_function \ pixman_composite_add_8_8_process_pixblock_head, \ pixman_composite_add_8_8_process_pixblock_tail, \ pixman_composite_add_8_8_process_pixblock_tail_head /******************************************************************************/ .macro pixman_composite_add_8888_8888_process_pixblock_tail_head fetch_src_pixblock - PF add 
PF_X, PF_X, #8 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 - PF beq 10f - PF add PF_X, PF_X, #8 - PF sub PF_CTL, PF_CTL, #1 + PF beq, 10f + PF add, PF_X, PF_X, #8 + PF sub, PF_CTL, PF_CTL, #1 10: st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 - PF cmp PF_X, ORIG_W - PF lsl DUMMY, PF_X, #src_bpp_shift - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] - PF lsl DUMMY, PF_X, #dst_bpp_shift - PF prfm PREFETCH_MODE, [PF_DST, DUMMY] - PF ble 10f - PF sub PF_X, PF_X, ORIG_W - PF subs PF_CTL, PF_CTL, #0x10 + PF cmp, PF_X, ORIG_W + PF lsl, DUMMY, PF_X, #src_bpp_shift + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + PF lsl, DUMMY, PF_X, #dst_bpp_shift + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W + PF subs, PF_CTL, PF_CTL, #0x10 10: uqadd v28.8b, v0.8b, v4.8b - PF ble 10f - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb DUMMY, [PF_SRC, DUMMY] - PF add PF_SRC, PF_SRC, #1 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb DUMMY, [PF_DST, DUMMY] - PF add PF_DST, PF_DST, #1 + PF ble, 10f + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC, DUMMY] + PF add, PF_SRC, PF_SRC, #1 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST, DUMMY] + PF add, PF_DST, PF_DST, #1 10: uqadd v29.8b, v1.8b, v5.8b uqadd v30.8b, v2.8b, v6.8b uqadd v31.8b, v3.8b, v7.8b .endm generate_composite_function \ pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ @@ -684,55 +684,55 @@ generate_composite_function_single_scanl raddhn v29.8b, v15.8h, v9.8h raddhn v30.8b, v16.8h, v10.8h raddhn v31.8b, v17.8h, v11.8h .endm .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 urshr v14.8h, v8.8h, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF urshr v15.8h, v9.8h, #8 urshr v16.8h, v10.8h, #8 urshr v17.8h, v11.8h, #8 - PF beq 10f - PF add PF_X, PF_X, #8 - PF sub PF_CTL, PF_CTL, #1 + PF beq, 10f + PF add, PF_X, PF_X, #8 + PF sub, PF_CTL, PF_CTL, #1 10: raddhn v28.8b, v14.8h, v8.8h raddhn v29.8b, v15.8h, v9.8h - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W raddhn v30.8b, v16.8h, v10.8h raddhn v31.8b, v17.8h, v11.8h fetch_src_pixblock - PF lsl DUMMY, PF_X, #src_bpp_shift - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + PF lsl, DUMMY, PF_X, #src_bpp_shift + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] mvn v22.8b, v3.8b - PF lsl DUMMY, PF_X, #dst_bpp_shift - PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + PF lsl, DUMMY, PF_X, #dst_bpp_shift + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 - PF ble 10f - PF sub PF_X, PF_X, ORIG_W + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W 10: umull v8.8h, v22.8b, v4.8b - PF ble 10f - PF subs PF_CTL, PF_CTL, #0x10 + PF ble, 10f + PF subs, PF_CTL, PF_CTL, #0x10 10: umull v9.8h, v22.8b, v5.8b - PF ble 10f - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb DUMMY, [PF_SRC, DUMMY] - PF add PF_SRC, PF_SRC, #1 + PF ble, 10f + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC, DUMMY] + PF add, PF_SRC, PF_SRC, #1 10: umull v10.8h, v22.8b, v6.8b - PF ble 10f - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb DUMMY, [PF_DST, DUMMY] - PF add PF_DST, PF_DST, #1 + PF ble, 10f + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST, DUMMY] + PF add, PF_DST, PF_DST, #1 10: umull v11.8h, v22.8b, v7.8b .endm generate_composite_function_single_scanline \ pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 
32, \ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ @@ -754,59 +754,59 @@ generate_composite_function_single_scanl uqadd v29.8b, v1.8b, v29.8b uqadd v30.8b, v2.8b, v30.8b uqadd v31.8b, v3.8b, v31.8b .endm .macro pixman_composite_over_8888_8888_process_pixblock_tail_head ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 urshr v14.8h, v8.8h, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF urshr v15.8h, v9.8h, #8 urshr v16.8h, v10.8h, #8 urshr v17.8h, v11.8h, #8 - PF beq 10f - PF add PF_X, PF_X, #8 - PF sub PF_CTL, PF_CTL, #1 + PF beq, 10f + PF add, PF_X, PF_X, #8 + PF sub, PF_CTL, PF_CTL, #1 10: raddhn v28.8b, v14.8h, v8.8h raddhn v29.8b, v15.8h, v9.8h - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W raddhn v30.8b, v16.8h, v10.8h raddhn v31.8b, v17.8h, v11.8h uqadd v28.8b, v0.8b, v28.8b uqadd v29.8b, v1.8b, v29.8b uqadd v30.8b, v2.8b, v30.8b uqadd v31.8b, v3.8b, v31.8b fetch_src_pixblock - PF lsl DUMMY, PF_X, #src_bpp_shift - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + PF lsl, DUMMY, PF_X, #src_bpp_shift + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] mvn v22.8b, v3.8b - PF lsl DUMMY, PF_X, #dst_bpp_shift - PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + PF lsl, DUMMY, PF_X, #dst_bpp_shift + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 - PF ble 10f - PF sub PF_X, PF_X, ORIG_W + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W 10: umull v8.8h, v22.8b, v4.8b - PF ble 10f - PF subs PF_CTL, PF_CTL, #0x10 + PF ble, 10f + PF subs, PF_CTL, PF_CTL, #0x10 10: umull v9.8h, v22.8b, v5.8b - PF ble 10f - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb DUMMY, [PF_SRC, DUMMY] - PF add PF_SRC, PF_SRC, #1 + PF ble, 10f + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC, DUMMY] + PF add, PF_SRC, PF_SRC, #1 10: umull v10.8h, v22.8b, v6.8b - PF ble 10f - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb DUMMY, [PF_DST, DUMMY] - PF add PF_DST, PF_DST, #1 + PF ble, 10f + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST, DUMMY] + PF add, PF_DST, PF_DST, #1 10: umull v11.8h, v22.8b, v7.8b .endm generate_composite_function \ pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ @@ -860,40 +860,40 @@ generate_composite_function_single_scanl urshr v16.8h, v10.8h, #8 urshr v17.8h, v11.8h, #8 raddhn v28.8b, v14.8h, v8.8h raddhn v29.8b, v15.8h, v9.8h raddhn v30.8b, v16.8h, v10.8h raddhn v31.8b, v17.8h, v11.8h ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 uqadd v28.8b, v0.8b, v28.8b - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0x0F - PF beq 10f - PF add PF_X, PF_X, #8 - PF sub PF_CTL, PF_CTL, #1 + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0x0F + PF beq, 10f + PF add, PF_X, PF_X, #8 + PF sub, PF_CTL, PF_CTL, #1 10: uqadd v29.8b, v1.8b, v29.8b uqadd v30.8b, v2.8b, v30.8b uqadd v31.8b, v3.8b, v31.8b - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W umull v8.8h, v24.8b, v4.8b - PF lsl DUMMY, PF_X, #dst_bpp_shift - PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + PF lsl, DUMMY, PF_X, #dst_bpp_shift + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] umull v9.8h, v24.8b, v5.8b - PF ble 10f - PF sub PF_X, PF_X, ORIG_W + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W 10: umull v10.8h, v24.8b, v6.8b - PF subs PF_CTL, PF_CTL, #0x10 + PF subs, PF_CTL, PF_CTL, #0x10 umull v11.8h, v24.8b, v7.8b - PF ble 10f - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb DUMMY, [PF_DST, DUMMY] - PF add 
PF_DST, PF_DST, #1 + PF ble, 10f + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST, DUMMY] + PF add, PF_DST, PF_DST, #1 10: st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 .endm .macro pixman_composite_over_n_8888_init mov v3.s[0], w4 dup v0.8b, v3.b[0] dup v1.8b, v3.b[1] @@ -912,52 +912,52 @@ generate_composite_function \ pixman_composite_over_8888_8888_process_pixblock_head, \ pixman_composite_over_8888_8888_process_pixblock_tail, \ pixman_composite_over_n_8888_process_pixblock_tail_head /******************************************************************************/ .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head urshr v14.8h, v8.8h, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF urshr v15.8h, v9.8h, #8 urshr v12.8h, v10.8h, #8 urshr v13.8h, v11.8h, #8 - PF beq 10f - PF add PF_X, PF_X, #8 - PF sub PF_CTL, PF_CTL, #1 + PF beq, 10f + PF add, PF_X, PF_X, #8 + PF sub, PF_CTL, PF_CTL, #1 10: raddhn v28.8b, v14.8h, v8.8h raddhn v29.8b, v15.8h, v9.8h - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W raddhn v30.8b, v12.8h, v10.8h raddhn v31.8b, v13.8h, v11.8h uqadd v28.8b, v0.8b, v28.8b uqadd v29.8b, v1.8b, v29.8b uqadd v30.8b, v2.8b, v30.8b uqadd v31.8b, v3.8b, v31.8b ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32 mvn v22.8b, v3.8b - PF lsl DUMMY, PF_X, #dst_bpp_shift - PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + PF lsl, DUMMY, PF_X, #dst_bpp_shift + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 - PF blt 10f - PF sub PF_X, PF_X, ORIG_W + PF blt, 10f + PF sub, PF_X, PF_X, ORIG_W 10: umull v8.8h, v22.8b, v4.8b - PF blt 10f - PF subs PF_CTL, PF_CTL, #0x10 + PF blt, 10f + PF subs, PF_CTL, PF_CTL, #0x10 10: umull v9.8h, v22.8b, v5.8b umull v10.8h, v22.8b, v6.8b - PF blt 10f - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb DUMMY, [PF_DST, DUMMY] - PF add PF_DST, PF_DST, #1 + PF blt, 10f + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST, DUMMY] + PF add, PF_DST, PF_DST, #1 10: umull v11.8h, v22.8b, v7.8b .endm .macro pixman_composite_over_reverse_n_8888_init mov v7.s[0], w4 dup v4.8b, v7.b[0] dup v5.8b, v7.b[1] @@ -1405,45 +1405,45 @@ generate_composite_function \ rshrn v28.8b, v8.8h, #8 rshrn v29.8b, v9.8h, #8 rshrn v30.8b, v10.8h, #8 rshrn v31.8b, v11.8h, #8 .endm .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head fetch_mask_pixblock - PF add PF_X, PF_X, #8 + PF add, PF_X, PF_X, #8 rshrn v28.8b, v8.8h, #8 - PF tst PF_CTL, #0x0F + PF tst, PF_CTL, #0x0F rshrn v29.8b, v9.8h, #8 - PF beq 10f - PF add PF_X, PF_X, #8 + PF beq, 10f + PF add, PF_X, PF_X, #8 10: rshrn v30.8b, v10.8h, #8 - PF beq 10f - PF sub PF_CTL, PF_CTL, #1 + PF beq, 10f + PF sub, PF_CTL, PF_CTL, #1 10: rshrn v31.8b, v11.8h, #8 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W umull v8.8h, v24.8b, v0.8b - PF lsl DUMMY, PF_X, #mask_bpp_shift - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] + PF lsl, DUMMY, PF_X, #mask_bpp_shift + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] umull v9.8h, v24.8b, v1.8b - PF ble 10f - PF sub PF_X, PF_X, ORIG_W + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W 10: umull v10.8h, v24.8b, v2.8b - PF ble 10f - PF subs PF_CTL, PF_CTL, #0x10 + PF ble, 10f + PF subs, PF_CTL, PF_CTL, #0x10 10: umull v11.8h, v24.8b, v3.8b - PF ble 10f - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift - PF ldrsb DUMMY, [PF_MASK, DUMMY] - PF add PF_MASK, PF_MASK, #1 + PF ble, 10f + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift + PF ldrsb, DUMMY, [PF_MASK, DUMMY] + PF add, PF_MASK, PF_MASK, #1 10: 
st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ursra v8.8h, v8.8h, #8 ursra v9.8h, v9.8h, #8 ursra v10.8h, v10.8h, #8 ursra v11.8h, v11.8h, #8 .endm @@ -1486,45 +1486,45 @@ generate_composite_function \ rshrn v28.8b, v0.8h, #8 rshrn v29.8b, v1.8h, #8 rshrn v30.8b, v2.8h, #8 rshrn v31.8b, v3.8h, #8 .endm .macro pixman_composite_src_n_8_8_process_pixblock_tail_head fetch_mask_pixblock - PF add PF_X, PF_X, #8 + PF add, PF_X, PF_X, #8 rshrn v28.8b, v0.8h, #8 - PF tst PF_CTL, #0x0F + PF tst, PF_CTL, #0x0F rshrn v29.8b, v1.8h, #8 - PF beq 10f - PF add PF_X, PF_X, #8 + PF beq, 10f + PF add, PF_X, PF_X, #8 10: rshrn v30.8b, v2.8h, #8 - PF beq 10f - PF sub PF_CTL, PF_CTL, #1 + PF beq, 10f + PF sub, PF_CTL, PF_CTL, #1 10: rshrn v31.8b, v3.8h, #8 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W umull v0.8h, v24.8b, v16.8b - PF lsl DUMMY, PF_X, mask_bpp_shift - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] + PF lsl, DUMMY, PF_X, mask_bpp_shift + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] umull v1.8h, v25.8b, v16.8b - PF ble 10f - PF sub PF_X, PF_X, ORIG_W + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W 10: umull v2.8h, v26.8b, v16.8b - PF ble 10f - PF subs PF_CTL, PF_CTL, #0x10 + PF ble, 10f + PF subs, PF_CTL, PF_CTL, #0x10 10: umull v3.8h, v27.8b, v16.8b - PF ble 10f - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift - PF ldrsb DUMMY, [PF_MASK, DUMMY] - PF add PF_MASK, PF_MASK, #1 + PF ble, 10f + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift + PF ldrsb, DUMMY, [PF_MASK, DUMMY] + PF add, PF_MASK, PF_MASK, #1 10: st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ursra v0.8h, v0.8h, #8 ursra v1.8h, v1.8h, #8 ursra v2.8h, v2.8h, #8 ursra v3.8h, v3.8h, #8 .endm @@ -1594,54 +1594,54 @@ generate_composite_function \ .endm .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head urshr v16.8h, v12.8h, #8 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 urshr v17.8h, v13.8h, #8 fetch_mask_pixblock urshr v18.8h, v14.8h, #8 - PF add PF_X, PF_X, #8 + PF add, PF_X, PF_X, #8 urshr v19.8h, v15.8h, #8 - PF tst PF_CTL, #0x0F + PF tst, PF_CTL, #0x0F raddhn v28.8b, v16.8h, v12.8h - PF beq 10f - PF add PF_X, PF_X, #8 + PF beq, 10f + PF add, PF_X, PF_X, #8 10: raddhn v29.8b, v17.8h, v13.8h - PF beq 10f - PF sub PF_CTL, PF_CTL, #1 + PF beq, 10f + PF sub, PF_CTL, PF_CTL, #1 10: raddhn v30.8b, v18.8h, v14.8h - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W raddhn v31.8b, v19.8h, v15.8h - PF lsl DUMMY, PF_X, #dst_bpp_shift - PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + PF lsl, DUMMY, PF_X, #dst_bpp_shift + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] umull v16.8h, v24.8b, v8.8b - PF lsl DUMMY, PF_X, #mask_bpp_shift - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] + PF lsl, DUMMY, PF_X, #mask_bpp_shift + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] umull v17.8h, v24.8b, v9.8b - PF ble 10f - PF sub PF_X, PF_X, ORIG_W + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W 10: umull v18.8h, v24.8b, v10.8b - PF ble 10f - PF subs PF_CTL, PF_CTL, #0x10 + PF ble, 10f + PF subs, PF_CTL, PF_CTL, #0x10 10: umull v19.8h, v24.8b, v11.8b - PF ble 10f - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb DUMMY, [PF_DST, DUMMY] - PF add PF_DST, PF_DST, #1 + PF ble, 10f + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST, DUMMY] + PF add, PF_DST, PF_DST, #1 10: uqadd v28.8b, v0.8b, v28.8b - PF ble 10f - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift - PF ldrsb DUMMY, [PF_MASK, DUMMY] - PF add PF_MASK, PF_MASK, #1 + PF ble, 10f + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift + PF ldrsb, DUMMY, [PF_MASK, DUMMY] + PF add, PF_MASK, PF_MASK, #1 10: uqadd v29.8b, v1.8b, v29.8b uqadd 
v30.8b, v2.8b, v30.8b uqadd v31.8b, v3.8b, v31.8b urshr v12.8h, v16.8h, #8 urshr v13.8h, v17.8h, #8 urshr v14.8h, v18.8h, #8 urshr v15.8h, v19.8h, #8 @@ -2407,17 +2407,17 @@ generate_composite_function \ generate_composite_function_single_scanline \ pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ default_init_need_all_regs, \ default_cleanup_need_all_regs, \ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ - pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ + pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \ 28, /* dst_w_basereg */ \ 4, /* dst_r_basereg */ \ 0, /* src_basereg */ \ 12 /* mask_basereg */ /******************************************************************************/ .macro pixman_composite_over_8888_n_8888_process_pixblock_head @@ -2482,31 +2482,31 @@ generate_composite_function \ pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ 5, /* prefetch distance */ \ default_init_need_all_regs, \ default_cleanup_need_all_regs, \ pixman_composite_over_8888_n_8888_process_pixblock_head, \ pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \ 28, /* dst_w_basereg */ \ 4, /* dst_r_basereg */ \ 0, /* src_basereg */ \ 12 /* mask_basereg */ generate_composite_function_single_scanline \ pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ default_init_need_all_regs, \ default_cleanup_need_all_regs, \ pixman_composite_over_8888_n_8888_process_pixblock_head, \ pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \ 28, /* dst_w_basereg */ \ 4, /* dst_r_basereg */ \ 0, /* src_basereg */ \ 12 /* mask_basereg */ /******************************************************************************/ /* TODO: expand macros and do better instructions scheduling */ @@ -2524,17 +2524,17 @@ generate_composite_function \ pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ 5, /* prefetch distance */ \ default_init_need_all_regs, \ default_cleanup_need_all_regs, \ pixman_composite_over_8888_n_8888_process_pixblock_head, \ pixman_composite_over_8888_n_8888_process_pixblock_tail, \ - pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ + pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \ 28, /* dst_w_basereg */ \ 4, /* dst_r_basereg */ \ 0, /* src_basereg */ \ 15 /* mask_basereg */ /******************************************************************************/ .macro pixman_composite_src_0888_0888_process_pixblock_head @@ -2675,38 +2675,38 @@ generate_composite_function \ urshr v11.8h, v8.8h, #8 mov v30.8b, v31.8b mov v31.8b, v3.8b mov v3.8b, v31.8b urshr v12.8h, v9.8h, #8 urshr v13.8h, v10.8h, #8 fetch_src_pixblock raddhn v30.8b, v11.8h, v8.8h - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - PF beq 10f - PF add PF_X, PF_X, #8 - PF 
sub PF_CTL, PF_CTL, #1 + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF + PF beq, 10f + PF add, PF_X, PF_X, #8 + PF sub, PF_CTL, PF_CTL, #1 10: raddhn v29.8b, v12.8h, v9.8h raddhn v28.8b, v13.8h, v10.8h umull v8.8h, v3.8b, v0.8b umull v9.8h, v3.8b, v1.8b umull v10.8h, v3.8b, v2.8b st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 - PF cmp PF_X, ORIG_W - PF lsl DUMMY, PF_X, src_bpp_shift - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] - PF ble 10f - PF sub PF_X, PF_X, ORIG_W - PF subs PF_CTL, PF_CTL, #0x10 - PF ble 10f - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb DUMMY, [PF_SRC, DUMMY] - PF add PF_SRC, PF_SRC, #1 + PF cmp, PF_X, ORIG_W + PF lsl, DUMMY, PF_X, src_bpp_shift + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W + PF subs, PF_CTL, PF_CTL, #0x10 + PF ble, 10f + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC, DUMMY] + PF add, PF_SRC, PF_SRC, #1 10: .endm generate_composite_function \ pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ 10, /* prefetch distance */ \ @@ -2744,38 +2744,38 @@ generate_composite_function \ urshr v11.8h, v8.8h, #8 mov v30.8b, v31.8b mov v31.8b, v3.8b mov v3.8b, v30.8b urshr v12.8h, v9.8h, #8 urshr v13.8h, v10.8h, #8 fetch_src_pixblock raddhn v28.8b, v11.8h, v8.8h - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - PF beq 10f - PF add PF_X, PF_X, #8 - PF sub PF_CTL, PF_CTL, #1 + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF + PF beq, 10f + PF add, PF_X, PF_X, #8 + PF sub, PF_CTL, PF_CTL, #1 10: raddhn v29.8b, v12.8h, v9.8h raddhn v30.8b, v13.8h, v10.8h umull v8.8h, v3.8b, v0.8b umull v9.8h, v3.8b, v1.8b umull v10.8h, v3.8b, v2.8b st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 - PF cmp PF_X, ORIG_W - PF lsl DUMMY, PF_X, src_bpp_shift - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] - PF ble 10f - PF sub PF_X, PF_X, ORIG_W - PF subs PF_CTL, PF_CTL, #0x10 - PF ble 10f - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb DUMMY, [PF_SRC, DUMMY] - PF add PF_SRC, PF_SRC, #1 + PF cmp, PF_X, ORIG_W + PF lsl, DUMMY, PF_X, src_bpp_shift + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + PF ble, 10f + PF sub, PF_X, PF_X, ORIG_W + PF subs, PF_CTL, PF_CTL, #0x10 + PF ble, 10f + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC, DUMMY] + PF add, PF_SRC, PF_SRC, #1 10: .endm generate_composite_function \ pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ 10, /* prefetch distance */ \ @@ -3126,197 +3126,197 @@ generate_composite_function_nearest_scan * format conversion, and interpolation as separate macros which can be used * as the basic building blocks for constructing bilinear scanline functions. 
*/ .macro bilinear_load_8888 reg1, reg2, tmp asr TMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #2 - ld1 {®1&.2s}, [TMP1], STRIDE - ld1 {®2&.2s}, [TMP1] + ld1 {\()\reg1\().2s}, [TMP1], STRIDE + ld1 {\()\reg2\().2s}, [TMP1] .endm .macro bilinear_load_0565 reg1, reg2, tmp asr TMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #1 - ld1 {®2&.s}[0], [TMP1], STRIDE - ld1 {®2&.s}[1], [TMP1] - convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp + ld1 {\()\reg2\().s}[0], [TMP1], STRIDE + ld1 {\()\reg2\().s}[1], [TMP1] + convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp .endm .macro bilinear_load_and_vertical_interpolate_two_8888 \ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 - bilinear_load_8888 reg1, reg2, tmp1 - umull &acc1&.8h, ®1&.8b, v28.8b - umlal &acc1&.8h, ®2&.8b, v29.8b - bilinear_load_8888 reg3, reg4, tmp2 - umull &acc2&.8h, ®3&.8b, v28.8b - umlal &acc2&.8h, ®4&.8b, v29.8b + bilinear_load_8888 \reg1, \reg2, \tmp1 + umull \()\acc1\().8h, \()\reg1\().8b, v28.8b + umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b + bilinear_load_8888 \reg3, \reg4, \tmp2 + umull \()\acc2\().8h, \()\reg3\().8b, v28.8b + umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b .endm .macro bilinear_load_and_vertical_interpolate_four_8888 \ - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi bilinear_load_and_vertical_interpolate_two_8888 \ - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi + \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi bilinear_load_and_vertical_interpolate_two_8888 \ - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi .endm .macro vzip reg1, reg2 umov TMP4, v31.d[0] - zip1 v31.8b, reg1, reg2 - zip2 reg2, reg1, reg2 - mov reg1, v31.8b + zip1 v31.8b, \reg1, \reg2 + zip2 \reg2, \reg1, \reg2 + mov \reg1, v31.8b mov v31.d[0], TMP4 .endm .macro vuzp reg1, reg2 umov TMP4, v31.d[0] - uzp1 v31.8b, reg1, reg2 - uzp2 reg2, reg1, reg2 - mov reg1, v31.8b + uzp1 v31.8b, \reg1, \reg2 + uzp2 \reg2, \reg1, \reg2 + mov \reg1, v31.8b mov v31.d[0], TMP4 .endm .macro bilinear_load_and_vertical_interpolate_two_0565 \ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi asr TMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #1 asr TMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #1 - ld1 {&acc2&.s}[0], [TMP1], STRIDE - ld1 {&acc2&.s}[2], [TMP2], STRIDE - ld1 {&acc2&.s}[1], [TMP1] - ld1 {&acc2&.s}[3], [TMP2] - convert_0565_to_x888 acc2, reg3, reg2, reg1 - vzip ®1&.8b, ®3&.8b - vzip ®2&.8b, ®4&.8b - vzip ®3&.8b, ®4&.8b - vzip ®1&.8b, ®2&.8b - umull &acc1&.8h, ®1&.8b, v28.8b - umlal &acc1&.8h, ®2&.8b, v29.8b - umull &acc2&.8h, ®3&.8b, v28.8b - umlal &acc2&.8h, ®4&.8b, v29.8b + ld1 {\()\acc2\().s}[0], [TMP1], STRIDE + ld1 {\()\acc2\().s}[2], [TMP2], STRIDE + ld1 {\()\acc2\().s}[1], [TMP1] + ld1 {\()\acc2\().s}[3], [TMP2] + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 + vzip \()\reg1\().8b, \()\reg3\().8b + vzip \()\reg2\().8b, \()\reg4\().8b + vzip \()\reg3\().8b, \()\reg4\().8b + vzip \()\reg1\().8b, \()\reg2\().8b + umull \()\acc1\().8h, \()\reg1\().8b, v28.8b + umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b + umull \()\acc2\().8h, \()\reg3\().8b, v28.8b + umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b .endm .macro bilinear_load_and_vertical_interpolate_four_0565 \ - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, 
xacc2hi, \ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi asr TMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #1 asr TMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #1 - ld1 {&xacc2&.s}[0], [TMP1], STRIDE - ld1 {&xacc2&.s}[2], [TMP2], STRIDE - ld1 {&xacc2&.s}[1], [TMP1] - ld1 {&xacc2&.s}[3], [TMP2] - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 + ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE + ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE + ld1 {\()\xacc2\().s}[1], [TMP1] + ld1 {\()\xacc2\().s}[3], [TMP2] + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 asr TMP1, X, #16 add X, X, UX add TMP1, TOP, TMP1, lsl #1 asr TMP2, X, #16 add X, X, UX add TMP2, TOP, TMP2, lsl #1 - ld1 {&yacc2&.s}[0], [TMP1], STRIDE - vzip &xreg1&.8b, &xreg3&.8b - ld1 {&yacc2&.s}[2], [TMP2], STRIDE - vzip &xreg2&.8b, &xreg4&.8b - ld1 {&yacc2&.s}[1], [TMP1] - vzip &xreg3&.8b, &xreg4&.8b - ld1 {&yacc2&.s}[3], [TMP2] - vzip &xreg1&.8b, &xreg2&.8b - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 - umull &xacc1&.8h, &xreg1&.8b, v28.8b - vzip &yreg1&.8b, &yreg3&.8b - umlal &xacc1&.8h, &xreg2&.8b, v29.8b - vzip &yreg2&.8b, &yreg4&.8b - umull &xacc2&.8h, &xreg3&.8b, v28.8b - vzip &yreg3&.8b, &yreg4&.8b - umlal &xacc2&.8h, &xreg4&.8b, v29.8b - vzip &yreg1&.8b, &yreg2&.8b - umull &yacc1&.8h, &yreg1&.8b, v28.8b - umlal &yacc1&.8h, &yreg2&.8b, v29.8b - umull &yacc2&.8h, &yreg3&.8b, v28.8b - umlal &yacc2&.8h, &yreg4&.8b, v29.8b + ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE + vzip \()\xreg1\().8b, \()\xreg3\().8b + ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE + vzip \()\xreg2\().8b, \()\xreg4\().8b + ld1 {\()\yacc2\().s}[1], [TMP1] + vzip \()\xreg3\().8b, \()\xreg4\().8b + ld1 {\()\yacc2\().s}[3], [TMP2] + vzip \()\xreg1\().8b, \()\xreg2\().8b + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 + umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b + vzip \()\yreg1\().8b, \()\yreg3\().8b + umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b + vzip \()\yreg2\().8b, \()\yreg4\().8b + umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b + vzip \()\yreg3\().8b, \()\yreg4\().8b + umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b + vzip \()\yreg1\().8b, \()\yreg2\().8b + umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b + umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b + umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b + umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b .endm .macro bilinear_store_8888 numpix, tmp1, tmp2 -.if numpix == 4 +.if \numpix == 4 st1 {v0.2s, v1.2s}, [OUT], #16 -.elseif numpix == 2 +.elseif \numpix == 2 st1 {v0.2s}, [OUT], #8 -.elseif numpix == 1 +.elseif \numpix == 1 st1 {v0.s}[0], [OUT], #4 .else - .error bilinear_store_8888 numpix is unsupported + .error bilinear_store_8888 \numpix is unsupported .endif .endm .macro bilinear_store_0565 numpix, tmp1, tmp2 vuzp v0.8b, v1.8b vuzp v2.8b, v3.8b vuzp v1.8b, v3.8b vuzp v0.8b, v2.8b - convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 -.if numpix == 4 + convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2 +.if \numpix == 4 st1 {v1.4h}, [OUT], #8 -.elseif numpix == 2 +.elseif \numpix == 2 st1 {v1.s}[0], [OUT], #4 -.elseif numpix == 1 +.elseif \numpix == 1 st1 {v1.h}[0], [OUT], #2 .else - .error bilinear_store_0565 numpix is unsupported + .error bilinear_store_0565 \numpix is unsupported .endif .endm .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt - bilinear_load_&src_fmt v0, v1, v2 + bilinear_load_\()\src_fmt v0, v1, v2 umull v2.8h, v0.8b, v28.8b umlal v2.8h, v1.8b, v29.8b /* 5 cycles bubble */ ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS umlsl v0.4s, v2.4h, v15.h[0] umlal2 v0.4s, 
v2.8h, v15.h[0] /* 5 cycles bubble */ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) /* 3 cycles bubble */ xtn v0.8b, v0.8h /* 1 cycle bubble */ - bilinear_store_&dst_fmt 1, v3, v4 + bilinear_store_\()\dst_fmt 1, v3, v4 .endm .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt - bilinear_load_and_vertical_interpolate_two_&src_fmt \ + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ v1, v11, v2, v3, v20, v21, v22, v23 ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS umlsl v0.4s, v1.4h, v15.h[0] umlal2 v0.4s, v1.8h, v15.h[0] ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS umlsl v10.4s, v11.4h, v15.h[4] umlal2 v10.4s, v11.8h, v15.h[4] shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) add v12.8h, v12.8h, v13.8h xtn v0.8b, v0.8h - bilinear_store_&dst_fmt 2, v3, v4 + bilinear_store_\()\dst_fmt 2, v3, v4 .endm .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt - bilinear_load_and_vertical_interpolate_four_&src_fmt \ - v1, v11, v14, v20, v16, v17, v22, v23 \ + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ + v1, v11, v14, v20, v16, v17, v22, v23, \ v3, v9, v24, v25, v26, v27, v18, v19 prfm PREFETCH_MODE, [TMP1, PF_OFFS] sub TMP1, TMP1, STRIDE ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS umlsl v0.4s, v1.4h, v15.h[0] umlal2 v0.4s, v1.8h, v15.h[0] ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS umlsl v10.4s, v11.4h, v15.h[4] @@ -3333,64 +3333,64 @@ generate_composite_function_nearest_scan shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) xtn v0.8b, v0.8h xtn v1.8b, v2.8h add v12.8h, v12.8h, v13.8h - bilinear_store_&dst_fmt 4, v3, v4 + bilinear_store_\()\dst_fmt 4, v3, v4 .endm .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head .else - bilinear_interpolate_four_pixels src_fmt, dst_fmt + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt .endif .endm .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail .endif .endm .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head .else - bilinear_interpolate_four_pixels src_fmt, dst_fmt + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt .endif .endm .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt + 
bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head .else - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt .endif .endm .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail .else - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt .endif .endm .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head .else - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt .endif .endm .set BILINEAR_FLAG_UNROLL_4, 0 .set BILINEAR_FLAG_UNROLL_8, 1 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 /* @@ -3405,17 +3405,17 @@ generate_composite_function_nearest_scan * prefetch_distance - prefetch in the source image by that many * pixels ahead */ .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ src_bpp_shift, dst_bpp_shift, \ prefetch_distance, flags -pixman_asm_function fname +pixman_asm_function \fname OUT .req x0 TOP .req x1 BOTTOM .req x2 WT .req x3 WB .req x4 X .req x5 UX .req x6 WIDTH .req x7 @@ -3437,17 +3437,17 @@ pixman_asm_function fname sub sp, sp, 112 /* push all registers */ sub x29, x29, 64 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 stp x8, x9, [x29, -80] stp x10, x11, [x29, -96] stp x12, x13, [x29, -112] - mov PF_OFFS, #prefetch_distance + mov PF_OFFS, #\prefetch_distance mul PF_OFFS, PF_OFFS, UX subs STRIDE, BOTTOM, TOP .unreq BOTTOM cmp WIDTH, #0 ble 300f @@ -3458,85 +3458,85 @@ pixman_asm_function fname mov v25.d[0], v12.d[1] mov v26.d[0], v13.d[0] add v25.4h, v25.4h, v26.4h mov v12.d[1], v25.d[0] /* ensure good destination alignment */ cmp WIDTH, #1 blt 100f - tst OUT, #(1 << dst_bpp_shift) + tst OUT, #(1 << \dst_bpp_shift) beq 100f ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) add v12.8h, v12.8h, v13.8h - bilinear_interpolate_last_pixel src_fmt, dst_fmt + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt sub WIDTH, WIDTH, #1 100: add v13.8h, v13.8h, v13.8h ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) add v12.8h, v12.8h, v13.8h cmp WIDTH, #2 blt 100f - tst OUT, #(1 << (dst_bpp_shift + 1)) + tst OUT, #(1 << (\dst_bpp_shift + 1)) beq 100f - bilinear_interpolate_two_pixels src_fmt, dst_fmt + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt sub WIDTH, WIDTH, #2 100: -.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 +.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0 /*********** 8 pixels per iteration *****************/ cmp WIDTH, #4 blt 100f - tst OUT, #(1 << (dst_bpp_shift + 2)) + tst OUT, #(1 << (\dst_bpp_shift + 2)) beq 100f - bilinear_interpolate_four_pixels src_fmt, dst_fmt + 
bilinear_interpolate_four_pixels \src_fmt, \dst_fmt sub WIDTH, WIDTH, #4 100: subs WIDTH, WIDTH, #8 blt 100f - asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) - bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt + asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) + bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt subs WIDTH, WIDTH, #8 blt 500f 1000: - bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt + bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt subs WIDTH, WIDTH, #8 bge 1000b 500: - bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt + bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt 100: tst WIDTH, #4 beq 200f - bilinear_interpolate_four_pixels src_fmt, dst_fmt + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt 200: .else /*********** 4 pixels per iteration *****************/ subs WIDTH, WIDTH, #4 blt 100f - asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt + asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt subs WIDTH, WIDTH, #4 blt 500f 1000: - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt subs WIDTH, WIDTH, #4 bge 1000b 500: - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt 100: /****************************************************/ .endif /* handle the remaining trailing pixels */ tst WIDTH, #2 beq 200f - bilinear_interpolate_two_pixels src_fmt, dst_fmt + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt 200: tst WIDTH, #1 beq 300f - bilinear_interpolate_last_pixel src_fmt, dst_fmt + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt 300: sub x29, x29, 64 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 ldp x8, x9, [x29, -80] ldp x10, x11, [x29, -96] ldp x12, x13, [x29, -104] mov sp, x29 @@ -3551,17 +3551,17 @@ 300: .unreq UX .unreq WIDTH .unreq TMP1 .unreq TMP2 .unreq PF_OFFS .unreq TMP3 .unreq TMP4 .unreq STRIDE -.endfunc +pixman_end_asm_function .endm /*****************************************************************************/ .set have_bilinear_interpolate_four_pixels_8888_8888, 1 .macro bilinear_interpolate_four_pixels_8888_8888_head diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h --- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h +++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h @@ -75,340 +75,340 @@ #define PREFETCH_MODE pldl1keep /* * Definitions of supplementary pixld/pixst macros (for partial load/store of * pixel data). 
*/ .macro pixldst1 op, elem_size, reg1, mem_operand, abits - op {v®1&.&elem_size}, [&mem_operand&], #8 + \op {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8 .endm .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits - op {v®1&.&elem_size, v®2&.&elem_size}, [&mem_operand&], #16 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size}, [\()\mem_operand\()], #16 .endm .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits - op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size, v®4&.&elem_size}, [&mem_operand&], #32 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size, v\()\reg4\().\()\elem_size}, [\()\mem_operand\()], #32 .endm .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes - op {v®1&.&elem_size}[idx], [&mem_operand&], #&bytes& + \op {v\()\reg1\().\()\elem_size}[\idx], [\()\mem_operand\()], #\()\bytes\() .endm .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand - op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}, [&mem_operand&], #24 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}, [\()\mem_operand\()], #24 .endm .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand - op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}[idx], [&mem_operand&], #3 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}[\idx], [\()\mem_operand\()], #3 .endm .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits -.if numbytes == 32 - .if elem_size==32 - pixldst4 op, 2s, %(basereg+4), %(basereg+5), \ - %(basereg+6), %(basereg+7), mem_operand, abits - .elseif elem_size==16 - pixldst4 op, 4h, %(basereg+4), %(basereg+5), \ - %(basereg+6), %(basereg+7), mem_operand, abits +.if \numbytes == 32 + .if \elem_size==32 + pixldst4 \op, 2s, %(\basereg+4), %(\basereg+5), \ + %(\basereg+6), %(\basereg+7), \mem_operand, \abits + .elseif \elem_size==16 + pixldst4 \op, 4h, %(\basereg+4), %(\basereg+5), \ + %(\basereg+6), %(\basereg+7), \mem_operand, \abits .else - pixldst4 op, 8b, %(basereg+4), %(basereg+5), \ - %(basereg+6), %(basereg+7), mem_operand, abits + pixldst4 \op, 8b, %(\basereg+4), %(\basereg+5), \ + %(\basereg+6), %(\basereg+7), \mem_operand, \abits .endif -.elseif numbytes == 16 - .if elem_size==32 - pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits - .elseif elem_size==16 - pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits +.elseif \numbytes == 16 + .if \elem_size==32 + pixldst2 \op, 2s, %(\basereg+2), %(\basereg+3), \mem_operand, \abits + .elseif \elem_size==16 + pixldst2 \op, 4h, %(\basereg+2), %(\basereg+3), \mem_operand, \abits .else - pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits + pixldst2 \op, 8b, %(\basereg+2), %(\basereg+3), \mem_operand, \abits .endif -.elseif numbytes == 8 - .if elem_size==32 - pixldst1 op, 2s, %(basereg+1), mem_operand, abits - .elseif elem_size==16 - pixldst1 op, 4h, %(basereg+1), mem_operand, abits +.elseif \numbytes == 8 + .if \elem_size==32 + pixldst1 \op, 2s, %(\basereg+1), \mem_operand, \abits + .elseif \elem_size==16 + pixldst1 \op, 4h, %(\basereg+1), \mem_operand, \abits .else - pixldst1 op, 8b, %(basereg+1), mem_operand, abits + pixldst1 \op, 8b, %(\basereg+1), \mem_operand, \abits .endif -.elseif numbytes == 4 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) - pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4 - .elseif elem_size == 16 - pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2 - pixldst0 op, h, 
%(basereg+0), 3, mem_operand, abits, 2 +.elseif \numbytes == 4 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32) + pixldst0 \op, s, %(\basereg+0), 1, \mem_operand, \abits, 4 + .elseif \elem_size == 16 + pixldst0 \op, h, %(\basereg+0), 2, \mem_operand, \abits, 2 + pixldst0 \op, h, %(\basereg+0), 3, \mem_operand, \abits, 2 .else - pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1 - pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1 - pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1 - pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1 + pixldst0 \op, b, %(\basereg+0), 4, \mem_operand, \abits, 1 + pixldst0 \op, b, %(\basereg+0), 5, \mem_operand, \abits, 1 + pixldst0 \op, b, %(\basereg+0), 6, \mem_operand, \abits, 1 + pixldst0 \op, b, %(\basereg+0), 7, \mem_operand, \abits, 1 .endif -.elseif numbytes == 2 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) - pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2 +.elseif \numbytes == 2 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16) + pixldst0 \op, h, %(\basereg+0), 1, \mem_operand, \abits, 2 .else - pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1 - pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1 + pixldst0 \op, b, %(\basereg+0), 2, \mem_operand, \abits, 1 + pixldst0 \op, b, %(\basereg+0), 3, \mem_operand, \abits, 1 .endif -.elseif numbytes == 1 - pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1 +.elseif \numbytes == 1 + pixldst0 \op, b, %(\basereg+0), 1, \mem_operand, \abits, 1 .else - .error "unsupported size: numbytes" + .error "unsupported size: \numbytes" .endif .endm .macro pixld numpix, bpp, basereg, mem_operand, abits=0 -.if bpp > 0 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) - pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \ - %(basereg+6), %(basereg+7), mem_operand, abits -.elseif (bpp == 24) && (numpix == 8) - pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand -.elseif (bpp == 24) && (numpix == 4) - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand -.elseif (bpp == 24) && (numpix == 2) - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand -.elseif (bpp == 24) && (numpix == 1) - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.if \bpp > 0 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 ld4, 8b, %(\basereg+4), %(\basereg+5), \ + %(\basereg+6), %(\basereg+7), \mem_operand, \abits +.elseif (\bpp == 24) && (\numpix == 8) + pixldst3 ld3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand +.elseif (\bpp == 24) && (\numpix == 4) + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand +.elseif (\bpp == 24) && (\numpix == 2) + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand +.elseif (\bpp == 24) && (\numpix == 1) + pixldst30 ld3, b, %(\basereg+0), 
%(\basereg+1), %(\basereg+2), 1, \mem_operand .else - pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits + pixldst %(\numpix * \bpp / 8), ld1, %(\bpp), \basereg, \mem_operand, \abits .endif .endif .endm .macro pixst numpix, bpp, basereg, mem_operand, abits=0 -.if bpp > 0 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) - pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \ - %(basereg+6), %(basereg+7), mem_operand, abits -.elseif (bpp == 24) && (numpix == 8) - pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand -.elseif (bpp == 24) && (numpix == 4) - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand -.elseif (bpp == 24) && (numpix == 2) - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand -.elseif (bpp == 24) && (numpix == 1) - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand -.elseif numpix * bpp == 32 && abits == 32 - pixldst 4, st1, 32, basereg, mem_operand, abits -.elseif numpix * bpp == 16 && abits == 16 - pixldst 2, st1, 16, basereg, mem_operand, abits +.if \bpp > 0 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 st4, 8b, %(\basereg+4), %(\basereg+5), \ + %(\basereg+6), %(\basereg+7), \mem_operand, \abits +.elseif (\bpp == 24) && (\numpix == 8) + pixldst3 st3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand +.elseif (\bpp == 24) && (\numpix == 4) + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand +.elseif (\bpp == 24) && (\numpix == 2) + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand +.elseif (\bpp == 24) && (\numpix == 1) + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand +.elseif \numpix * \bpp == 32 && \abits == 32 + pixldst 4, st1, 32, \basereg, \mem_operand, \abits +.elseif \numpix * \bpp == 16 && \abits == 16 + pixldst 2, st1, 16, \basereg, \mem_operand, \abits .else - pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits + pixldst %(\numpix * \bpp / 8), st1, %(\bpp), \basereg, \mem_operand, \abits .endif .endif .endm .macro pixld_a numpix, bpp, basereg, mem_operand -.if (bpp * numpix) <= 128 - pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.if (\bpp * \numpix) <= 128 + pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) .else - pixld numpix, bpp, basereg, mem_operand, 128 + pixld \numpix, \bpp, \basereg, \mem_operand, 128 .endif .endm .macro pixst_a numpix, bpp, basereg, mem_operand -.if (bpp * numpix) <= 128 - pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.if (\bpp * \numpix) <= 128 + pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) .else - pixst numpix, bpp, basereg, mem_operand, 128 + pixst \numpix, \bpp, \basereg, \mem_operand, 128 .endif .endm /* * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register * 
aliases to be defined) */ .macro pixld1_s elem_size, reg1, mem_operand -.if elem_size == 16 +.if \elem_size == 16 asr TMP1, VX, #16 adds VX, VX, UNIT_X bmi 55f 5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b 55: - add TMP1, mem_operand, TMP1, lsl #1 + add TMP1, \mem_operand, TMP1, lsl #1 asr TMP2, VX, #16 adds VX, VX, UNIT_X bmi 55f 5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b 55: - add TMP2, mem_operand, TMP2, lsl #1 - ld1 {v®1&.h}[0], [TMP1] + add TMP2, \mem_operand, TMP2, lsl #1 + ld1 {v\()\reg1\().h}[0], [TMP1] asr TMP1, VX, #16 adds VX, VX, UNIT_X bmi 55f 5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b 55: - add TMP1, mem_operand, TMP1, lsl #1 - ld1 {v®1&.h}[1], [TMP2] + add TMP1, \mem_operand, TMP1, lsl #1 + ld1 {v\()\reg1\().h}[1], [TMP2] asr TMP2, VX, #16 adds VX, VX, UNIT_X bmi 55f 5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b 55: - add TMP2, mem_operand, TMP2, lsl #1 - ld1 {v®1&.h}[2], [TMP1] - ld1 {v®1&.h}[3], [TMP2] -.elseif elem_size == 32 + add TMP2, \mem_operand, TMP2, lsl #1 + ld1 {v\()\reg1\().h}[2], [TMP1] + ld1 {v\()\reg1\().h}[3], [TMP2] +.elseif \elem_size == 32 asr TMP1, VX, #16 adds VX, VX, UNIT_X bmi 55f 5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b 55: - add TMP1, mem_operand, TMP1, lsl #2 + add TMP1, \mem_operand, TMP1, lsl #2 asr TMP2, VX, #16 adds VX, VX, UNIT_X bmi 55f 5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b 55: - add TMP2, mem_operand, TMP2, lsl #2 - ld1 {v®1&.s}[0], [TMP1] - ld1 {v®1&.s}[1], [TMP2] + add TMP2, \mem_operand, TMP2, lsl #2 + ld1 {v\()\reg1\().s}[0], [TMP1] + ld1 {v\()\reg1\().s}[1], [TMP2] .else .error "unsupported" .endif .endm .macro pixld2_s elem_size, reg1, reg2, mem_operand -.if 0 /* elem_size == 32 */ +.if 0 /* \elem_size == 32 */ mov TMP1, VX, asr #16 add VX, VX, UNIT_X, asl #1 - add TMP1, mem_operand, TMP1, asl #2 + add TMP1, \mem_operand, TMP1, asl #2 mov TMP2, VX, asr #16 sub VX, VX, UNIT_X - add TMP2, mem_operand, TMP2, asl #2 - ld1 {v®1&.s}[0], [TMP1] + add TMP2, \mem_operand, TMP2, asl #2 + ld1 {v\()\reg1\().s}[0], [TMP1] mov TMP1, VX, asr #16 add VX, VX, UNIT_X, asl #1 - add TMP1, mem_operand, TMP1, asl #2 - ld1 {v®2&.s}[0], [TMP2, :32] + add TMP1, \mem_operand, TMP1, asl #2 + ld1 {v\()\reg2\().s}[0], [TMP2, :32] mov TMP2, VX, asr #16 add VX, VX, UNIT_X - add TMP2, mem_operand, TMP2, asl #2 - ld1 {v®1&.s}[1], [TMP1] - ld1 {v®2&.s}[1], [TMP2] + add TMP2, \mem_operand, TMP2, asl #2 + ld1 {v\()\reg1\().s}[1], [TMP1] + ld1 {v\()\reg2\().s}[1], [TMP2] .else - pixld1_s elem_size, reg1, mem_operand - pixld1_s elem_size, reg2, mem_operand + pixld1_s \elem_size, \reg1, \mem_operand + pixld1_s \elem_size, \reg2, \mem_operand .endif .endm .macro pixld0_s elem_size, reg1, idx, mem_operand -.if elem_size == 16 +.if \elem_size == 16 asr TMP1, VX, #16 adds VX, VX, UNIT_X bmi 55f 5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b 55: - add TMP1, mem_operand, TMP1, lsl #1 - ld1 {v®1&.h}[idx], [TMP1] -.elseif elem_size == 32 + add TMP1, \mem_operand, TMP1, lsl #1 + ld1 {v\()\reg1\().h}[\idx], [TMP1] +.elseif \elem_size == 32 asr DUMMY, VX, #16 mov TMP1, DUMMY adds VX, VX, UNIT_X bmi 55f 5: subs VX, VX, SRC_WIDTH_FIXED bpl 5b 55: - add TMP1, mem_operand, TMP1, lsl #2 - ld1 {v®1&.s}[idx], [TMP1] + add TMP1, \mem_operand, TMP1, lsl #2 + ld1 {v\()\reg1\().s}[\idx], [TMP1] .endif .endm .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand -.if numbytes == 32 - pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand - pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand - pixdeinterleave elem_size, %(basereg+4) -.elseif numbytes == 16 - pixld2_s elem_size, %(basereg+2), 
%(basereg+3), mem_operand -.elseif numbytes == 8 - pixld1_s elem_size, %(basereg+1), mem_operand -.elseif numbytes == 4 - .if elem_size == 32 - pixld0_s elem_size, %(basereg+0), 1, mem_operand - .elseif elem_size == 16 - pixld0_s elem_size, %(basereg+0), 2, mem_operand - pixld0_s elem_size, %(basereg+0), 3, mem_operand +.if \numbytes == 32 + pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand + pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand + pixdeinterleave \elem_size, %(\basereg+4) +.elseif \numbytes == 16 + pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand +.elseif \numbytes == 8 + pixld1_s \elem_size, %(\basereg+1), \mem_operand +.elseif \numbytes == 4 + .if \elem_size == 32 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand + .elseif \elem_size == 16 + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand .else - pixld0_s elem_size, %(basereg+0), 4, mem_operand - pixld0_s elem_size, %(basereg+0), 5, mem_operand - pixld0_s elem_size, %(basereg+0), 6, mem_operand - pixld0_s elem_size, %(basereg+0), 7, mem_operand + pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand + pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand + pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand + pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand .endif -.elseif numbytes == 2 - .if elem_size == 16 - pixld0_s elem_size, %(basereg+0), 1, mem_operand +.elseif \numbytes == 2 + .if \elem_size == 16 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand .else - pixld0_s elem_size, %(basereg+0), 2, mem_operand - pixld0_s elem_size, %(basereg+0), 3, mem_operand + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand .endif -.elseif numbytes == 1 - pixld0_s elem_size, %(basereg+0), 1, mem_operand +.elseif \numbytes == 1 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand .else - .error "unsupported size: numbytes" + .error "unsupported size: \numbytes" .endif .endm .macro pixld_s numpix, bpp, basereg, mem_operand -.if bpp > 0 - pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand +.if \bpp > 0 + pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand .endif .endm .macro vuzp8 reg1, reg2 umov DUMMY, v16.d[0] - uzp1 v16.8b, v®1&.8b, v®2&.8b - uzp2 v®2&.8b, v®1&.8b, v®2&.8b - mov v®1&.8b, v16.8b + uzp1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b + uzp2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b + mov v\()\reg1\().8b, v16.8b mov v16.d[0], DUMMY .endm .macro vzip8 reg1, reg2 umov DUMMY, v16.d[0] - zip1 v16.8b, v®1&.8b, v®2&.8b - zip2 v®2&.8b, v®1&.8b, v®2&.8b - mov v®1&.8b, v16.8b + zip1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b + zip2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b + mov v\()\reg1\().8b, v16.8b mov v16.d[0], DUMMY .endm /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ .macro pixdeinterleave bpp, basereg -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) - vuzp8 %(basereg+0), %(basereg+1) - vuzp8 %(basereg+2), %(basereg+3) - vuzp8 %(basereg+1), %(basereg+3) - vuzp8 %(basereg+0), %(basereg+2) +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vuzp8 %(\basereg+0), %(\basereg+1) + vuzp8 %(\basereg+2), %(\basereg+3) + vuzp8 %(\basereg+1), %(\basereg+3) + vuzp8 %(\basereg+0), %(\basereg+2) .endif .endm /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ .macro pixinterleave bpp, basereg -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) - vzip8 %(basereg+0), 
%(basereg+2) - vzip8 %(basereg+1), %(basereg+3) - vzip8 %(basereg+2), %(basereg+3) - vzip8 %(basereg+0), %(basereg+1) +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vzip8 %(\basereg+0), %(\basereg+2) + vzip8 %(\basereg+1), %(\basereg+3) + vzip8 %(\basereg+2), %(\basereg+3) + vzip8 %(\basereg+0), %(\basereg+1) .endif .endm /* * This is a macro for implementing cache preload. The main idea is that * cache preload logic is mostly independent from the rest of pixels * processing code. It starts at the top left pixel and moves forward * across pixels and can jump across scanlines. Prefetch distance is @@ -432,62 +432,62 @@ 55: * for almost zero cost! * * (*) The overhead of the prefetcher is visible when running some trivial * pixels processing like simple copy. Anyway, having prefetch is a must * when working with the graphics data. */ .macro PF a, x:vararg .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) - a x + \a \x .endif .endm .macro cache_preload std_increment, boost_increment .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) -.if std_increment != 0 - PF add PF_X, PF_X, #std_increment +.if \std_increment != 0 + PF add, PF_X, PF_X, #\std_increment .endif - PF tst PF_CTL, #0xF - PF beq 71f - PF add PF_X, PF_X, #boost_increment - PF sub PF_CTL, PF_CTL, #1 + PF tst, PF_CTL, #0xF + PF beq, 71f + PF add, PF_X, PF_X, #\boost_increment + PF sub, PF_CTL, PF_CTL, #1 71: - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W .if src_bpp_shift >= 0 - PF lsl DUMMY, PF_X, #src_bpp_shift - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] + PF lsl, DUMMY, PF_X, #src_bpp_shift + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] .endif .if dst_r_bpp != 0 - PF lsl DUMMY, PF_X, #dst_bpp_shift - PF prfm PREFETCH_MODE, [PF_DST, DUMMY] + PF lsl, DUMMY, PF_X, #dst_bpp_shift + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] .endif .if mask_bpp_shift >= 0 - PF lsl DUMMY, PF_X, #mask_bpp_shift - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] + PF lsl, DUMMY, PF_X, #mask_bpp_shift + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] .endif - PF ble 71f - PF sub PF_X, PF_X, ORIG_W - PF subs PF_CTL, PF_CTL, #0x10 + PF ble, 71f + PF sub, PF_X, PF_X, ORIG_W + PF subs, PF_CTL, PF_CTL, #0x10 71: - PF ble 72f + PF ble, 72f .if src_bpp_shift >= 0 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift - PF ldrsb DUMMY, [PF_SRC, DUMMY] - PF add PF_SRC, PF_SRC, #1 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb, DUMMY, [PF_SRC, DUMMY] + PF add, PF_SRC, PF_SRC, #1 .endif .if dst_r_bpp != 0 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift - PF ldrsb DUMMY, [PF_DST, DUMMY] - PF add PF_DST, PF_DST, #1 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb, DUMMY, [PF_DST, DUMMY] + PF add, PF_DST, PF_DST, #1 .endif .if mask_bpp_shift >= 0 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift - PF ldrsb DUMMY, [PF_MASK, DUMMY] - PF add PF_MASK, PF_MASK, #1 + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift + PF ldrsb, DUMMY, [PF_MASK, DUMMY] + PF add, PF_MASK, PF_MASK, #1 .endif 72: .endif .endm .macro cache_preload_simple .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) .if src_bpp > 0 @@ -516,56 +516,56 @@ 72: process_pixblock_tail, \ process_pixblock_tail_head .if dst_w_bpp != 24 tst DST_R, #0xF beq 52f .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 .irp lowbit, 1, 2, 4, 8, 16 -local skip1 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) -.if lowbit < 16 /* we don't need more than 16-byte alignment */ - tst DST_R, #lowbit + +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if \lowbit < 16 /* we don't 
need more than 16-byte alignment */ + tst DST_R, #\lowbit beq 51f .endif - pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC - pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK + pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC + pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK .if dst_r_bpp > 0 - pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R + pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R .else - add DST_R, DST_R, #lowbit + add DST_R, DST_R, #\lowbit .endif - PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) - sub W, W, #(lowbit * 8 / dst_w_bpp) + PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp) + sub W, W, #(\lowbit * 8 / dst_w_bpp) 51: .endif .endr .endif pixdeinterleave src_bpp, src_basereg pixdeinterleave mask_bpp, mask_basereg pixdeinterleave dst_r_bpp, dst_r_basereg - process_pixblock_head + \process_pixblock_head cache_preload 0, pixblock_size cache_preload_simple - process_pixblock_tail + \process_pixblock_tail pixinterleave dst_w_bpp, dst_w_basereg .irp lowbit, 1, 2, 4, 8, 16 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) -.if lowbit < 16 /* we don't need more than 16-byte alignment */ - tst DST_W, #lowbit +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if \lowbit < 16 /* we don't need more than 16-byte alignment */ + tst DST_W, #\lowbit beq 51f .endif .if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0 - sub W, W, #(lowbit * 8 / dst_w_bpp) + sub W, W, #(\lowbit * 8 / dst_w_bpp) .endif - pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W + pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W 51: .endif .endr .endif 52: .endm /* @@ -587,52 +587,52 @@ 52: dst_aligned_flag, \ process_pixblock_head, \ process_pixblock_tail, \ process_pixblock_tail_head tst W, #(pixblock_size - 1) beq 52f .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 .irp chunk_size, 16, 8, 4, 2, 1 -.if pixblock_size > chunk_size - tst W, #chunk_size +.if pixblock_size > \chunk_size + tst W, #\chunk_size beq 51f - pixld_src chunk_size, src_bpp, src_basereg, SRC - pixld chunk_size, mask_bpp, mask_basereg, MASK -.if dst_aligned_flag != 0 - pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R + pixld_src \chunk_size, src_bpp, src_basereg, SRC + pixld \chunk_size, mask_bpp, mask_basereg, MASK +.if \dst_aligned_flag != 0 + pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R .else - pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R + pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R .endif -.if cache_preload_flag != 0 - PF add PF_X, PF_X, #chunk_size +.if \cache_preload_flag != 0 + PF add, PF_X, PF_X, #\chunk_size .endif 51: .endif .endr .endif pixdeinterleave src_bpp, src_basereg pixdeinterleave mask_bpp, mask_basereg pixdeinterleave dst_r_bpp, dst_r_basereg - process_pixblock_head -.if cache_preload_flag != 0 + \process_pixblock_head +.if \cache_preload_flag != 0 cache_preload 0, pixblock_size cache_preload_simple .endif - process_pixblock_tail + \process_pixblock_tail pixinterleave dst_w_bpp, dst_w_basereg .irp chunk_size, 16, 8, 4, 2, 1 -.if pixblock_size > chunk_size - tst W, #chunk_size +.if pixblock_size > \chunk_size + tst W, #\chunk_size beq 51f -.if dst_aligned_flag != 0 - pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.if \dst_aligned_flag != 0 + pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W .else - pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W + pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W .endif 51: 
.endif .endr 52: .endm /* @@ -655,17 +655,17 @@ 52: .if (src_bpp != 24) && (src_bpp != 0) sub SRC, SRC, W, lsl #src_bpp_shift .endif .if (mask_bpp != 24) && (mask_bpp != 0) sub MASK, MASK, W, lsl #mask_bpp_shift .endif subs H, H, #1 mov DST_R, DST_W - bge start_of_loop_label + bge \start_of_loop_label .endm /* * Registers are allocated in the following way by default: * v0, v1, v2, v3 - reserved for loading source pixel data * v4, v5, v6, v7 - reserved for loading destination pixel data * v24, v25, v26, v27 - reserved for loading mask pixel data * v28, v29, v30, v31 - final destination pixel data for writeback to memory @@ -682,17 +682,17 @@ 52: process_pixblock_head, \ process_pixblock_tail, \ process_pixblock_tail_head, \ dst_w_basereg_ = 28, \ dst_r_basereg_ = 4, \ src_basereg_ = 0, \ mask_basereg_ = 24 - pixman_asm_function fname + pixman_asm_function \fname stp x29, x30, [sp, -16]! mov x29, sp sub sp, sp, 232 /* push all registers */ sub x29, x29, 64 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 stp x8, x9, [x29, -80] stp x10, x11, [x29, -96] @@ -707,38 +707,38 @@ 52: str x28, [x29, -232] /* * Select prefetch type for this function. If prefetch distance is * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch * has to be used instead of ADVANCED. */ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT -.if prefetch_distance == 0 +.if \prefetch_distance == 0 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ - ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) + ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24)) .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE .endif /* * Make some macro arguments globally visible and accessible * from other macros */ - .set src_bpp, src_bpp_ - .set mask_bpp, mask_bpp_ - .set dst_w_bpp, dst_w_bpp_ - .set pixblock_size, pixblock_size_ - .set dst_w_basereg, dst_w_basereg_ - .set dst_r_basereg, dst_r_basereg_ - .set src_basereg, src_basereg_ - .set mask_basereg, mask_basereg_ + .set src_bpp, \src_bpp_ + .set mask_bpp, \mask_bpp_ + .set dst_w_bpp, \dst_w_bpp_ + .set pixblock_size, \pixblock_size_ + .set dst_w_basereg, \dst_w_basereg_ + .set dst_r_basereg, \dst_r_basereg_ + .set src_basereg, \src_basereg_ + .set mask_basereg, \mask_basereg_ .macro pixld_src x:vararg - pixld x + pixld \x .endm .macro fetch_src_pixblock pixld_src pixblock_size, src_bpp, \ (src_basereg - pixblock_size * src_bpp / 64), SRC .endm /* * Assign symbolic names to registers */ @@ -805,32 +805,32 @@ 52: .elseif dst_w_bpp == 16 .set dst_bpp_shift, 1 .elseif dst_w_bpp == 8 .set dst_bpp_shift, 0 .else .error "requested dst bpp (dst_w_bpp) is not supported" .endif -.if (((flags) & FLAG_DST_READWRITE) != 0) +.if (((\flags) & FLAG_DST_READWRITE) != 0) .set dst_r_bpp, dst_w_bpp .else .set dst_r_bpp, 0 .endif -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) .set DEINTERLEAVE_32BPP_ENABLED, 1 .else .set DEINTERLEAVE_32BPP_ENABLED, 0 .endif -.if prefetch_distance < 0 || prefetch_distance > 15 - .error "invalid prefetch distance (prefetch_distance)" +.if \prefetch_distance < 0 || \prefetch_distance > 15 + .error "invalid prefetch distance (\prefetch_distance)" .endif - PF mov PF_X, #0 + PF mov, PF_X, #0 mov DST_R, DST_W .if src_bpp == 24 sub SRC_STRIDE, SRC_STRIDE, W sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 .endif .if mask_bpp == 24 sub MASK_STRIDE, MASK_STRIDE, W @@ -839,71 +839,71 @@ 52: .if dst_w_bpp == 24 sub 
DST_STRIDE, DST_STRIDE, W sub DST_STRIDE, DST_STRIDE, W, lsl #1 .endif /* * Setup advanced prefetcher initial state */ - PF mov PF_SRC, SRC - PF mov PF_DST, DST_R - PF mov PF_MASK, MASK - /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ - PF lsl DUMMY, H, #4 - PF mov PF_CTL, DUMMY - PF add PF_CTL, PF_CTL, #(prefetch_distance - 0x10) + PF mov, PF_SRC, SRC + PF mov, PF_DST, DST_R + PF mov, PF_MASK, MASK + /* PF_CTL = \prefetch_distance | ((h - 1) << 4) */ + PF lsl, DUMMY, H, #4 + PF mov, PF_CTL, DUMMY + PF add, PF_CTL, PF_CTL, #(\prefetch_distance - 0x10) - init + \init subs H, H, #1 mov ORIG_W, W blt 9f cmp W, #(pixblock_size * 2) blt 800f /* * This is the start of the pipelined loop, which if optimized for * long scanlines */ 0: - ensure_destination_ptr_alignment process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + ensure_destination_ptr_alignment \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ pixld_a pixblock_size, dst_r_bpp, \ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R fetch_src_pixblock pixld pixblock_size, mask_bpp, \ (mask_basereg - pixblock_size * mask_bpp / 64), MASK - PF add PF_X, PF_X, #pixblock_size - process_pixblock_head + PF add, PF_X, PF_X, #pixblock_size + \process_pixblock_head cache_preload 0, pixblock_size cache_preload_simple subs W, W, #(pixblock_size * 2) blt 200f 100: - process_pixblock_tail_head + \process_pixblock_tail_head cache_preload_simple subs W, W, #pixblock_size bge 100b 200: - process_pixblock_tail + \process_pixblock_tail pixst_a pixblock_size, dst_w_bpp, \ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W /* Process the remaining trailing pixels in the scanline */ process_trailing_pixels 1, 1, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head advance_to_next_scanline 0b - cleanup + \cleanup 1000: /* pop all registers */ sub x29, x29, 64 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 ldp x8, x9, [x29, -80] ldp x10, x11, [x29, -96] ldp x12, x13, [x29, -112] @@ -920,48 +920,48 @@ 1000: ret /* exit */ /* * This is the start of the loop, designed to process images with small width * (less than pixblock_size * 2 pixels). In this case neither pipelining * nor prefetch are used. 
*/ 800: .if src_bpp_shift >= 0 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift - PF prfm PREFETCH_MODE, [SRC, DUMMY] + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift + PF prfm, PREFETCH_MODE, [SRC, DUMMY] .endif .if dst_r_bpp != 0 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift - PF prfm PREFETCH_MODE, [DST_R, DUMMY] + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift + PF prfm, PREFETCH_MODE, [DST_R, DUMMY] .endif .if mask_bpp_shift >= 0 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift - PF prfm PREFETCH_MODE, [MASK, DUMMY] + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift + PF prfm, PREFETCH_MODE, [MASK, DUMMY] .endif /* Process exactly pixblock_size pixels if needed */ tst W, #pixblock_size beq 100f pixld pixblock_size, dst_r_bpp, \ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R fetch_src_pixblock pixld pixblock_size, mask_bpp, \ (mask_basereg - pixblock_size * mask_bpp / 64), MASK - process_pixblock_head - process_pixblock_tail + \process_pixblock_head + \process_pixblock_tail pixst pixblock_size, dst_w_bpp, \ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 100: /* Process the remaining trailing pixels in the scanline */ process_trailing_pixels 0, 0, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head advance_to_next_scanline 800b 9: - cleanup + \cleanup /* pop all registers */ sub x29, x29, 64 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 ldp x8, x9, [x29, -80] ldp x10, x11, [x29, -96] ldp x12, x13, [x29, -112] ldp x14, x15, [x29, -128] @@ -990,17 +990,17 @@ 9: .unreq DST_STRIDE .unreq MASK_STRIDE .unreq PF_CTL .unreq PF_X .unreq PF_SRC .unreq PF_DST .unreq PF_MASK .unreq DUMMY - .endfunc + pixman_end_asm_function .endm /* * A simplified variant of function generation template for a single * scanline processing (for implementing pixman combine functions) */ .macro generate_composite_function_scanline use_nearest_scaling, \ fname, \ @@ -1014,50 +1014,50 @@ 9: process_pixblock_head, \ process_pixblock_tail, \ process_pixblock_tail_head, \ dst_w_basereg_ = 28, \ dst_r_basereg_ = 4, \ src_basereg_ = 0, \ mask_basereg_ = 24 - pixman_asm_function fname + pixman_asm_function \fname .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE /* * Make some macro arguments globally visible and accessible * from other macros */ - .set src_bpp, src_bpp_ - .set mask_bpp, mask_bpp_ - .set dst_w_bpp, dst_w_bpp_ - .set pixblock_size, pixblock_size_ - .set dst_w_basereg, dst_w_basereg_ - .set dst_r_basereg, dst_r_basereg_ - .set src_basereg, src_basereg_ - .set mask_basereg, mask_basereg_ + .set src_bpp, \src_bpp_ + .set mask_bpp, \mask_bpp_ + .set dst_w_bpp, \dst_w_bpp_ + .set pixblock_size, \pixblock_size_ + .set dst_w_basereg, \dst_w_basereg_ + .set dst_r_basereg, \dst_r_basereg_ + .set src_basereg, \src_basereg_ + .set mask_basereg, \mask_basereg_ -.if use_nearest_scaling != 0 +.if \use_nearest_scaling != 0 /* * Assign symbolic names to registers for nearest scaling */ W .req x0 DST_W .req x1 SRC .req x2 VX .req x3 UNIT_X .req x4 SRC_WIDTH_FIXED .req x5 MASK .req x6 TMP1 .req x8 TMP2 .req x9 DST_R .req x10 DUMMY .req x30 .macro pixld_src x:vararg - pixld_s x + pixld_s \x .endm sxtw x0, w0 sxtw x3, w3 sxtw x4, w4 sxtw x5, w5 stp x29, x30, [sp, -16]! 
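(A note on the PF call sites rewritten in the hunks above: the patch turns "PF add PF_X, PF_X, #..." into "PF add, PF_X, PF_X, #..." and the macro body from "a x" into "\a \x", so the mnemonic becomes a real first macro argument and the operand list rides in the :vararg tail. Below is a minimal illustration, not part of the patch, of how the reworked macro expands; the .set and .req lines and the register x15 are stand-ins for the real definitions in pixman-arma64-neon-asm.h. The usual motivation is assemblers that only implement the backslash-escaped, comma-separated form of .macro arguments, such as llvm's integrated assembler, though the patch itself does not state this.)

        /* Illustration only: stand-in definitions for the sketch. */
        .set PREFETCH_TYPE_ADVANCED, 2
        .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_ADVANCED
        PF_X .req x15

        .macro PF a, x:vararg
        .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
        \a \x                   /* re-emit "<mnemonic> <operands>" */
        .endif
        .endm

        /* New-style call: "add" binds to a, "PF_X, PF_X, #8" to the vararg x,
         * so the line expands to "add x15, x15, #8".  The old spelling
         * "PF add PF_X, PF_X, #8" relied on blank-separated macro arguments. */
        PF add, PF_X, PF_X, #8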
@@ -1075,84 +1075,84 @@ 9: W .req x0 /* width (is updated during processing) */ DST_W .req x1 /* destination buffer pointer for writes */ SRC .req x2 /* source buffer pointer */ MASK .req x3 /* mask pointer */ DST_R .req x4 /* destination buffer pointer for reads */ DUMMY .req x30 .macro pixld_src x:vararg - pixld x + pixld \x .endm sxtw x0, w0 stp x29, x30, [sp, -16]! mov x29, sp sub sp, sp, 64 sub x29, x29, 64 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 .endif -.if (((flags) & FLAG_DST_READWRITE) != 0) +.if (((\flags) & FLAG_DST_READWRITE) != 0) .set dst_r_bpp, dst_w_bpp .else .set dst_r_bpp, 0 .endif -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) .set DEINTERLEAVE_32BPP_ENABLED, 1 .else .set DEINTERLEAVE_32BPP_ENABLED, 0 .endif .macro fetch_src_pixblock pixld_src pixblock_size, src_bpp, \ (src_basereg - pixblock_size * src_bpp / 64), SRC .endm - init + \init mov DST_R, DST_W cmp W, #pixblock_size blt 800f - ensure_destination_ptr_alignment process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + ensure_destination_ptr_alignment \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head subs W, W, #pixblock_size blt 700f /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ pixld_a pixblock_size, dst_r_bpp, \ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R fetch_src_pixblock pixld pixblock_size, mask_bpp, \ (mask_basereg - pixblock_size * mask_bpp / 64), MASK - process_pixblock_head + \process_pixblock_head subs W, W, #pixblock_size blt 200f 100: - process_pixblock_tail_head + \process_pixblock_tail_head subs W, W, #pixblock_size bge 100b 200: - process_pixblock_tail + \process_pixblock_tail pixst_a pixblock_size, dst_w_bpp, \ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 700: /* Process the remaining trailing pixels in the scanline (dst aligned) */ process_trailing_pixels 0, 1, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head - cleanup -.if use_nearest_scaling != 0 + \cleanup +.if \use_nearest_scaling != 0 sub x29, x29, 64 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 ldp x8, x9, [x29, -80] ldr x10, [x29, -96] mov sp, x29 ldp x29, x30, [sp], 16 ret /* exit */ @@ -1162,22 +1162,22 @@ 700: ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 mov sp, x29 ldp x29, x30, [sp], 16 ret /* exit */ .endif 800: /* Process the remaining trailing pixels in the scanline (dst unaligned) */ process_trailing_pixels 0, 0, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head - cleanup -.if use_nearest_scaling != 0 + \cleanup +.if \use_nearest_scaling != 0 sub x29, x29, 64 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 ldp x8, x9, [x29, -80] ldr x10, [x29, -88] mov sp, x29 ldp x29, x30, [sp], 16 ret /* exit */ @@ -1208,25 +1208,25 @@ 800: .unreq DST_R .unreq DST_W .unreq W .endif .purgem fetch_src_pixblock .purgem pixld_src - .endfunc + pixman_end_asm_function .endm .macro generate_composite_function_single_scanline x:vararg - generate_composite_function_scanline 0, x + generate_composite_function_scanline 0, \x .endm .macro generate_composite_function_nearest_scanline x:vararg - 
generate_composite_function_scanline 1, x + generate_composite_function_scanline 1, \x .endm /* Default prologue/epilogue, nothing special needs to be done */ .macro default_init .endm .macro default_cleanup @@ -1250,61 +1250,61 @@ 800: * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in) * into a planar a8r8g8b8 format (with a, r, g, b color components * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). * * Warning: the conversion is destructive and the original * value (in) is lost. */ .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b - shrn &out_r&.8b, &in&.8h, #8 - shrn &out_g&.8b, &in&.8h, #3 - sli &in&.8h, &in&.8h, #5 - movi &out_a&.8b, #255 - sri &out_r&.8b, &out_r&.8b, #5 - sri &out_g&.8b, &out_g&.8b, #6 - shrn &out_b&.8b, &in&.8h, #2 + shrn \()\out_r\().8b, \()\in\().8h, #8 + shrn \()\out_g\().8b, \()\in\().8h, #3 + sli \()\in\().8h, \()\in\().8h, #5 + movi \()\out_a\().8b, #255 + sri \()\out_r\().8b, \()\out_r\().8b, #5 + sri \()\out_g\().8b, \()\out_g\().8b, #6 + shrn \()\out_b\().8b, \()\in\().8h, #2 .endm .macro convert_0565_to_x888 in, out_r, out_g, out_b - shrn &out_r&.8b, &in&.8h, #8 - shrn &out_g&.8b, &in&.8h, #3 - sli &in&.8h, &in&.8h, #5 - sri &out_r&.8b, &out_r&.8b, #5 - sri &out_g&.8b, &out_g&.8b, #6 - shrn &out_b&.8b, &in&.8h, #2 + shrn \()\out_r\().8b, \()\in\().8h, #8 + shrn \()\out_g\().8b, \()\in\().8h, #3 + sli \()\in\().8h, \()\in\().8h, #5 + sri \()\out_r\().8b, \()\out_r\().8b, #5 + sri \()\out_g\().8b, \()\out_g\().8b, #6 + shrn \()\out_b\().8b, \()\in\().8h, #2 .endm /* * Conversion from planar a8r8g8b8 format (with a, r, g, b color components * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6 * pixels packed in 128-bit register (out). Requires two temporary 128-bit * registers (tmp1, tmp2) */ .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 - ushll &tmp1&.8h, &in_g&.8b, #7 - shl &tmp1&.8h, &tmp1&.8h, #1 - ushll &out&.8h, &in_r&.8b, #7 - shl &out&.8h, &out&.8h, #1 - ushll &tmp2&.8h, &in_b&.8b, #7 - shl &tmp2&.8h, &tmp2&.8h, #1 - sri &out&.8h, &tmp1&.8h, #5 - sri &out&.8h, &tmp2&.8h, #11 + ushll \()\tmp1\().8h, \()\in_g\().8b, #7 + shl \()\tmp1\().8h, \()\tmp1\().8h, #1 + ushll \()\out\().8h, \()\in_r\().8b, #7 + shl \()\out\().8h, \()\out\().8h, #1 + ushll \()\tmp2\().8h, \()\in_b\().8b, #7 + shl \()\tmp2\().8h, \()\tmp2\().8h, #1 + sri \()\out\().8h, \()\tmp1\().8h, #5 + sri \()\out\().8h, \()\tmp2\().8h, #11 .endm /* * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels * returned in (out0, out1) registers pair. Requires one temporary * 64-bit register (tmp). 
'out1' and 'in' may overlap, the original * value from 'in' is lost */ .macro convert_four_0565_to_x888_packed in, out0, out1, tmp - shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */ - shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */ - sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */ - sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */ - sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */ - ushr &out1&.4h, &in&.4h, #8 /* R is in place */ - sri &out0&.4h, &tmp&.4h, #8 /* G & B is in place */ - zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */ - zip2 &out1&.4h, &out0&.4h, &out1&.4h - mov &out0&.d[0], &tmp&.d[0] + shl \()\out0\().4h, \()\in\().4h, #5 /* G top 6 bits */ + shl \()\tmp\().4h, \()\in\().4h, #11 /* B top 5 bits */ + sri \()\in\().4h, \()\in\().4h, #5 /* R is ready in top bits */ + sri \()\out0\().4h, \()\out0\().4h, #6 /* G is ready in top bits */ + sri \()\tmp\().4h, \()\tmp\().4h, #5 /* B is ready in top bits */ + ushr \()\out1\().4h, \()\in\().4h, #8 /* R is in place */ + sri \()\out0\().4h, \()\tmp\().4h, #8 /* G & B is in place */ + zip1 \()\tmp\().4h, \()\out0\().4h, \()\out1\().4h /* everything is in place */ + zip2 \()\out1\().4h, \()\out0\().4h, \()\out1\().4h + mov \()\out0\().d[0], \()\tmp\().d[0] .endm
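(The conversion applied throughout this diff is mechanical: the old macro bodies referenced their arguments bare and pasted them into register names with &name&, while the new bodies use the standard GNU as escapes, \name for an argument and \() as an empty separator so an argument can sit flush against surrounding text. A minimal sketch of the new form follows, assuming nothing beyond plain .macro support; the load_two macro, its operands and the register x1 are made up for illustration and do not appear in the patch.)

        /* Illustration only: in the body below, \() terminates an argument
         * name, so "v", the reg1 argument and ".2s" concatenate into a single
         * operand such as v0.2s. */
        .macro load_two reg1, reg2, ptr
        ld1 {v\()\reg1\().2s}, [\ptr], #8
        ld1 {v\()\reg2\().2s}, [\ptr]
        .endm

        /* expands to:  ld1 {v0.2s}, [x1], #8
         *              ld1 {v1.2s}, [x1]      */
        load_two 0, 1, x1

The same escapes appear without the leading "v" in pixman-arma64-neon-asm-bilinear.S (for example {\()\reg1\().2s}) because those macros are passed whole register names such as v0, whereas the pixldst helpers in pixman-arma64-neon-asm.h are passed bare register numbers and build the name themselves.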