author      Daniel Baumann <daniel.baumann@progress-linux.org>    2024-06-12 05:35:37 +0000
committer   Daniel Baumann <daniel.baumann@progress-linux.org>    2024-06-12 05:35:37 +0000
commit      a90a5cba08fdf6c0ceb95101c275108a152a3aed
tree        532507288f3defd7f4dcf1af49698bcb76034855 /gfx/cairo/pixman-arm64-clang.patch
parent      Adding debian version 126.0.1-1.
download    firefox-a90a5cba08fdf6c0ceb95101c275108a152a3aed.tar.xz, firefox-a90a5cba08fdf6c0ceb95101c275108a152a3aed.zip
Merging upstream version 127.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'gfx/cairo/pixman-arm64-clang.patch')
-rw-r--r--   gfx/cairo/pixman-arm64-clang.patch | 3756
1 file changed, 0 insertions(+), 3756 deletions(-)
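
Note: the file deleted in this commit is Mozilla's local copy of upstream pixman merge request 71 (its URL is the first line of the removed patch). The patch made the AArch64 NEON fast paths assemble with clang's integrated assembler as well as GNU as: the .func/.endfunc directives are emitted only when ASM_HAVE_FUNC_DIRECTIVE is defined, an ASM_LEADING_UNDERSCORE wrapper handles targets that prefix symbol names with an underscore, macro arguments are referenced through the standard \arg and \() escapes instead of the &arg& splicing used before, and the PF prefetch macro is now invoked with a comma after the instruction mnemonic so the mnemonic reaches the macro as a distinct argument. Dropping the patch here suggests these fixes are already part of the pixman sources vendored with upstream version 127.0. A minimal sketch of the argument-escaping change follows; the macro name and registers are hypothetical, assuming GNU as macro syntax, and are not copied from the patch.

/* The old spelling, ld1 {&reg1&.2s}, [x0], relied on assembler-specific
 * argument splicing.  In the portable form below, \reg1 substitutes the
 * macro argument and \() is an empty separator so ".2s" is not parsed as
 * part of the argument name; GNU as and clang's integrated assembler both
 * accept it. */
.macro load_two_d reg1, reg2
        ld1     {\()\reg1\().2s}, [x0], #8
        ld1     {\()\reg2\().2s}, [x0]
.endm

/* load_two_d v0, v1 expands to:
 *         ld1     {v0.2s}, [x0], #8
 *         ld1     {v1.2s}, [x0]
 */
load_two_d v0, v1
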
diff --git a/gfx/cairo/pixman-arm64-clang.patch b/gfx/cairo/pixman-arm64-clang.patch deleted file mode 100644 index f059734531..0000000000 --- a/gfx/cairo/pixman-arm64-clang.patch +++ /dev/null @@ -1,3756 +0,0 @@ -https://gitlab.freedesktop.org/pixman/pixman/-/merge_requests/71 - -diff --git a/gfx/cairo/libpixman/src/pixman-arm-asm.h b/gfx/cairo/libpixman/src/pixman-arm-asm.h ---- a/gfx/cairo/libpixman/src/pixman-arm-asm.h -+++ b/gfx/cairo/libpixman/src/pixman-arm-asm.h -@@ -21,17 +21,33 @@ - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - * Author: Jeff Muizelaar (jeff@infidigm.net) - * - */ - - /* Supplementary macro for setting function attributes */ --.macro pixman_asm_function fname -- .func fname -- .global fname -+.macro pixman_asm_function_impl fname -+#ifdef ASM_HAVE_FUNC_DIRECTIVE -+ .func \fname -+#endif -+ .global \fname - #ifdef __ELF__ -- .hidden fname -- .type fname, %function -+ .hidden \fname -+ .type \fname, %function - #endif --fname: -+\fname: - .endm -+ -+.macro pixman_asm_function fname -+#ifdef ASM_LEADING_UNDERSCORE -+ pixman_asm_function_impl _\fname -+#else -+ pixman_asm_function_impl \fname -+#endif -+.endm -+ -+.macro pixman_end_asm_function -+#ifdef ASM_HAVE_FUNC_DIRECTIVE -+ .endfunc -+#endif -+.endm -diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S ---- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S -+++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S -@@ -72,219 +72,219 @@ - * format conversion, and interpolation as separate macros which can be used - * as the basic building blocks for constructing bilinear scanline functions. - */ - - .macro bilinear_load_8888 reg1, reg2, tmp - asr WTMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #2 -- ld1 {®1&.2s}, [TMP1], STRIDE -- ld1 {®2&.2s}, [TMP1] -+ ld1 {\()\reg1\().2s}, [TMP1], STRIDE -+ ld1 {\()\reg2\().2s}, [TMP1] - .endm - - .macro bilinear_load_0565 reg1, reg2, tmp - asr WTMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #1 -- ld1 {®2&.s}[0], [TMP1], STRIDE -- ld1 {®2&.s}[1], [TMP1] -- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp -+ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE -+ ld1 {\()\reg2\().s}[1], [TMP1] -+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp - .endm - - .macro bilinear_load_and_vertical_interpolate_two_8888 \ - acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 - -- bilinear_load_8888 reg1, reg2, tmp1 -- umull &acc1&.8h, ®1&.8b, v28.8b -- umlal &acc1&.8h, ®2&.8b, v29.8b -- bilinear_load_8888 reg3, reg4, tmp2 -- umull &acc2&.8h, ®3&.8b, v28.8b -- umlal &acc2&.8h, ®4&.8b, v29.8b -+ bilinear_load_8888 \reg1, \reg2, \tmp1 -+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b -+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b -+ bilinear_load_8888 \reg3, \reg4, \tmp2 -+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b -+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b - .endm - - .macro bilinear_load_and_vertical_interpolate_four_8888 \ -- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ -+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi - - bilinear_load_and_vertical_interpolate_two_8888 \ -- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi -+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, xacc2hi - bilinear_load_and_vertical_interpolate_two_8888 \ -- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi -+ \yacc1, \yacc2, \yreg1, \yreg2, 
\yreg3, \yreg4, \yacc2lo, \yacc2hi - .endm - - .macro vzip reg1, reg2 -- zip1 v24.8b, reg1, reg2 -- zip2 reg2, reg1, reg2 -- mov reg1, v24.8b -+ zip1 v24.8b, \reg1, \reg2 -+ zip2 \reg2, \reg1, \reg2 -+ mov \reg1, v24.8b - .endm - - .macro vuzp reg1, reg2 -- uzp1 v24.8b, reg1, reg2 -- uzp2 reg2, reg1, reg2 -- mov reg1, v24.8b -+ uzp1 v24.8b, \reg1, \reg2 -+ uzp2 \reg2, \reg1, \reg2 -+ mov \reg1, v24.8b - .endm - - .macro bilinear_load_and_vertical_interpolate_two_0565 \ - acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi - asr WTMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #1 - asr WTMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #1 -- ld1 {&acc2&.s}[0], [TMP1], STRIDE -- ld1 {&acc2&.s}[2], [TMP2], STRIDE -- ld1 {&acc2&.s}[1], [TMP1] -- ld1 {&acc2&.s}[3], [TMP2] -- convert_0565_to_x888 acc2, reg3, reg2, reg1 -- vzip ®1&.8b, ®3&.8b -- vzip ®2&.8b, ®4&.8b -- vzip ®3&.8b, ®4&.8b -- vzip ®1&.8b, ®2&.8b -- umull &acc1&.8h, ®1&.8b, v28.8b -- umlal &acc1&.8h, ®2&.8b, v29.8b -- umull &acc2&.8h, ®3&.8b, v28.8b -- umlal &acc2&.8h, ®4&.8b, v29.8b -+ ld1 {\()\acc2\().s}[0], [TMP1], STRIDE -+ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE -+ ld1 {\()\acc2\().s}[1], [TMP1] -+ ld1 {\()\acc2\().s}[3], [TMP2] -+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 -+ vzip \()\reg1\().8b, \()\reg3\().8b -+ vzip \()\reg2\().8b, \()\reg4\().8b -+ vzip \()\reg3\().8b, \()\reg4\().8b -+ vzip \()\reg1\().8b, \()\reg2\().8b -+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b -+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b -+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b -+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b - .endm - - .macro bilinear_load_and_vertical_interpolate_four_0565 \ -- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ -+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi - - asr WTMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #1 - asr WTMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #1 -- ld1 {&xacc2&.s}[0], [TMP1], STRIDE -- ld1 {&xacc2&.s}[2], [TMP2], STRIDE -- ld1 {&xacc2&.s}[1], [TMP1] -- ld1 {&xacc2&.s}[3], [TMP2] -- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 -+ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE -+ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE -+ ld1 {\()\xacc2\().s}[1], [TMP1] -+ ld1 {\()\xacc2\().s}[3], [TMP2] -+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 - asr WTMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #1 - asr WTMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #1 -- ld1 {&yacc2&.s}[0], [TMP1], STRIDE -- vzip &xreg1&.8b, &xreg3&.8b -- ld1 {&yacc2&.s}[2], [TMP2], STRIDE -- vzip &xreg2&.8b, &xreg4&.8b -- ld1 {&yacc2&.s}[1], [TMP1] -- vzip &xreg3&.8b, &xreg4&.8b -- ld1 {&yacc2&.s}[3], [TMP2] -- vzip &xreg1&.8b, &xreg2&.8b -- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 -- umull &xacc1&.8h, &xreg1&.8b, v28.8b -- vzip &yreg1&.8b, &yreg3&.8b -- umlal &xacc1&.8h, &xreg2&.8b, v29.8b -- vzip &yreg2&.8b, &yreg4&.8b -- umull &xacc2&.8h, &xreg3&.8b, v28.8b -- vzip &yreg3&.8b, &yreg4&.8b -- umlal &xacc2&.8h, &xreg4&.8b, v29.8b -- vzip &yreg1&.8b, &yreg2&.8b -- umull &yacc1&.8h, &yreg1&.8b, v28.8b -- umlal &yacc1&.8h, &yreg2&.8b, v29.8b -- umull &yacc2&.8h, &yreg3&.8b, v28.8b -- umlal &yacc2&.8h, &yreg4&.8b, v29.8b -+ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE -+ vzip \()\xreg1\().8b, \()\xreg3\().8b -+ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE -+ vzip \()\xreg2\().8b, \()\xreg4\().8b -+ ld1 {\()\yacc2\().s}[1], [TMP1] -+ vzip \()\xreg3\().8b, \()\xreg4\().8b -+ ld1 
{\()\yacc2\().s}[3], [TMP2] -+ vzip \()\xreg1\().8b, \()\xreg2\().8b -+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 -+ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b -+ vzip \()\yreg1\().8b, \()\yreg3\().8b -+ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b -+ vzip \()\yreg2\().8b, \()\yreg4\().8b -+ umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b -+ vzip \()\yreg3\().8b, \()\yreg4\().8b -+ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b -+ vzip \()\yreg1\().8b, \()\yreg2\().8b -+ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b -+ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b -+ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b -+ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b - .endm - - .macro bilinear_store_8888 numpix, tmp1, tmp2 --.if numpix == 4 -+.if \numpix == 4 - st1 {v0.2s, v1.2s}, [OUT], #16 --.elseif numpix == 2 -+.elseif \numpix == 2 - st1 {v0.2s}, [OUT], #8 --.elseif numpix == 1 -+.elseif \numpix == 1 - st1 {v0.s}[0], [OUT], #4 - .else -- .error bilinear_store_8888 numpix is unsupported -+ .error bilinear_store_8888 \numpix is unsupported - .endif - .endm - - .macro bilinear_store_0565 numpix, tmp1, tmp2 - vuzp v0.8b, v1.8b - vuzp v2.8b, v3.8b - vuzp v1.8b, v3.8b - vuzp v0.8b, v2.8b -- convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 --.if numpix == 4 -+ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2 -+.if \numpix == 4 - st1 {v1.4h}, [OUT], #8 --.elseif numpix == 2 -+.elseif \numpix == 2 - st1 {v1.s}[0], [OUT], #4 --.elseif numpix == 1 -+.elseif \numpix == 1 - st1 {v1.h}[0], [OUT], #2 - .else -- .error bilinear_store_0565 numpix is unsupported -+ .error bilinear_store_0565 \numpix is unsupported - .endif - .endm - - - /* - * Macros for loading mask pixels into register 'mask'. - * dup must be done in somewhere else. - */ - .macro bilinear_load_mask_x numpix, mask - .endm - - .macro bilinear_load_mask_8 numpix, mask --.if numpix == 4 -- ld1 {&mask&.s}[0], [MASK], #4 --.elseif numpix == 2 -- ld1 {&mask&.h}[0], [MASK], #2 --.elseif numpix == 1 -- ld1 {&mask&.b}[0], [MASK], #1 -+.if \numpix == 4 -+ ld1 {\()\mask\().s}[0], [MASK], #4 -+.elseif \numpix == 2 -+ ld1 {\()\mask\().h}[0], [MASK], #2 -+.elseif \numpix == 1 -+ ld1 {\()\mask\().b}[0], [MASK], #1 - .else -- .error bilinear_load_mask_8 numpix is unsupported -+ .error bilinear_load_mask_8 \numpix is unsupported - .endif -- prfm PREFETCH_MODE, [MASK, #prefetch_offset] -+ prfum PREFETCH_MODE, [MASK, #(prefetch_offset)] - .endm - - .macro bilinear_load_mask mask_fmt, numpix, mask -- bilinear_load_mask_&mask_fmt numpix, mask -+ bilinear_load_mask_\mask_fmt \numpix, \mask - .endm - - - /* - * Macros for loading destination pixels into register 'dst0' and 'dst1'. - * Interleave should be done somewhere else. 
- */ - .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01 - .endm - - .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01 - .endm - - .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 --.if numpix == 4 -- ld1 {&dst0&.2s, &dst1&.2s}, [OUT] --.elseif numpix == 2 -- ld1 {&dst0&.2s}, [OUT] --.elseif numpix == 1 -- ld1 {&dst0&.s}[0], [OUT] -+.if \numpix == 4 -+ ld1 {\()\dst0\().2s, \()\dst1\().2s}, [OUT] -+.elseif \numpix == 2 -+ ld1 {\()\dst0\().2s}, [OUT] -+.elseif \numpix == 1 -+ ld1 {\()\dst0\().s}[0], [OUT] - .else -- .error bilinear_load_dst_8888 numpix is unsupported -+ .error bilinear_load_dst_8888 \numpix is unsupported - .endif -- mov &dst01&.d[0], &dst0&.d[0] -- mov &dst01&.d[1], &dst1&.d[0] -+ mov \()\dst01\().d[0], \()\dst0\().d[0] -+ mov \()\dst01\().d[1], \()\dst1\().d[0] - prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)] - .endm - - .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 -- bilinear_load_dst_8888 numpix, dst0, dst1, dst01 -+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01 -- bilinear_load_dst_8888 numpix, dst0, dst1, dst01 -+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01 -- bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01 -+ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 - .endm - - /* - * Macros for duplicating partially loaded mask to fill entire register. - * We will apply mask to interleaved source pixels, that is - * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3) - * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3) - * So, we need to duplicate loaded mask into whole register. -@@ -293,84 +293,85 @@ - * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) - * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) - * We can do some optimizations for this including last pixel cases. - */ - .macro bilinear_duplicate_mask_x numpix, mask - .endm - - .macro bilinear_duplicate_mask_8 numpix, mask --.if numpix == 4 -- dup &mask&.2s, &mask&.s[0] --.elseif numpix == 2 -- dup &mask&.4h, &mask&.h[0] --.elseif numpix == 1 -- dup &mask&.8b, &mask&.b[0] -+.if \numpix == 4 -+ dup \()\mask\().2s, \()\mask\().s[0] -+.elseif \numpix == 2 -+ dup \()\mask\().4h, \()\mask\().h[0] -+.elseif \numpix == 1 -+ dup \()\mask\().8b, \()\mask\().b[0] - .else -- .error bilinear_duplicate_mask_8 is unsupported -+ .error bilinear_duplicate_\mask_8 is unsupported - .endif - .endm - - .macro bilinear_duplicate_mask mask_fmt, numpix, mask -- bilinear_duplicate_mask_&mask_fmt numpix, mask -+ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask - .endm - - /* - * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form. - * Interleave should be done when maks is enabled or operator is 'over'. 
- */ - .macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01 -- vuzp &src0&.8b, &src1&.8b -- vuzp &dst0&.8b, &dst1&.8b -- vuzp &src0&.8b, &src1&.8b -- vuzp &dst0&.8b, &dst1&.8b -- mov &src01&.d[1], &src1&.d[0] -- mov &src01&.d[0], &src0&.d[0] -- mov &dst01&.d[1], &dst1&.d[0] -- mov &dst01&.d[0], &dst0&.d[0] -+ vuzp \()\src0\().8b, \()\src1\().8b -+ vuzp \()\dst0\().8b, \()\dst1\().8b -+ vuzp \()\src0\().8b, \()\src1\().8b -+ vuzp \()\dst0\().8b, \()\dst1\().8b -+ mov \()\src01\().d[1], \()\src1\().d[0] -+ mov \()\src01\().d[0], \()\src0\().d[0] -+ mov \()\dst01\().d[1], \()\dst1\().d[0] -+ mov \()\dst01\().d[0], \()\dst0\().d[0] - .endm - - .macro bilinear_interleave_src_dst_x_src \ - numpix, src0, src1, src01, dst0, dst1, dst01 - .endm - - .macro bilinear_interleave_src_dst_x_over \ - numpix, src0, src1, src01, dst0, dst1, dst01 - -- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 -+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_interleave_src_dst_x_add \ - numpix, src0, src1, src01, dst0, dst1, dst01 -- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 -+ -+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_interleave_src_dst_8_src \ - numpix, src0, src1, src01, dst0, dst1, dst01 - -- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 -+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_interleave_src_dst_8_over \ - numpix, src0, src1, src01, dst0, dst1, dst01 - -- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 -+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_interleave_src_dst_8_add \ - numpix, src0, src1, src01, dst0, dst1, dst01 - -- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 -+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_interleave_src_dst \ - mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01 - -- bilinear_interleave_src_dst_&mask_fmt&_&op \ -- numpix, src0, src1, src01, dst0, dst1, dst01 -+ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \ -+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01 - .endm - - - /* - * Macros for applying masks to src pixels. (see combine_mask_u() function) - * src, dst should be in interleaved form. - * mask register should be in form (m0, m1, m2, m3). 
- */ -@@ -378,191 +379,191 @@ - numpix, src0, src1, src01, mask, \ - tmp01, tmp23, tmp45, tmp67 - .endm - - .macro bilinear_apply_mask_to_src_8 \ - numpix, src0, src1, src01, mask, \ - tmp01, tmp23, tmp45, tmp67 - -- umull &tmp01&.8h, &src0&.8b, &mask&.8b -- umull &tmp23&.8h, &src1&.8b, &mask&.8b -+ umull \()\tmp01\().8h, \()\src0\().8b, \()\mask\().8b -+ umull \()\tmp23\().8h, \()\src1\().8b, \()\mask\().8b - /* bubbles */ -- urshr &tmp45&.8h, &tmp01&.8h, #8 -- urshr &tmp67&.8h, &tmp23&.8h, #8 -+ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8 -+ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8 - /* bubbles */ -- raddhn &src0&.8b, &tmp45&.8h, &tmp01&.8h -- raddhn &src1&.8b, &tmp67&.8h, &tmp23&.8h -- mov &src01&.d[0], &src0&.d[0] -- mov &src01&.d[1], &src1&.d[0] -+ raddhn \()\src0\().8b, \()\tmp45\().8h, \()\tmp01\().8h -+ raddhn \()\src1\().8b, \()\tmp67\().8h, \()\tmp23\().8h -+ mov \()\src01\().d[0], \()\src0\().d[0] -+ mov \()\src01\().d[1], \()\src1\().d[0] - .endm - - .macro bilinear_apply_mask_to_src \ - mask_fmt, numpix, src0, src1, src01, mask, \ - tmp01, tmp23, tmp45, tmp67 - -- bilinear_apply_mask_to_src_&mask_fmt \ -- numpix, src0, src1, src01, mask, \ -- tmp01, tmp23, tmp45, tmp67 -+ bilinear_apply_mask_to_src_\()\mask_fmt \ -+ \numpix, \src0, \src1, \src01, \mask, \ -+ \tmp01, \tmp23, \tmp45, \tmp67 - .endm - - - /* - * Macros for combining src and destination pixels. - * Interleave or not is depending on operator 'op'. - */ - .macro bilinear_combine_src \ - numpix, src0, src1, src01, dst0, dst1, dst01, \ - tmp01, tmp23, tmp45, tmp67, tmp8 - .endm - - .macro bilinear_combine_over \ - numpix, src0, src1, src01, dst0, dst1, dst01, \ - tmp01, tmp23, tmp45, tmp67, tmp8 - -- dup &tmp8&.2s, &src1&.s[1] -+ dup \()\tmp8\().2s, \()\src1\().s[1] - /* bubbles */ -- mvn &tmp8&.8b, &tmp8&.8b -+ mvn \()\tmp8\().8b, \()\tmp8\().8b - /* bubbles */ -- umull &tmp01&.8h, &dst0&.8b, &tmp8&.8b -+ umull \()\tmp01\().8h, \()\dst0\().8b, \()\tmp8\().8b - /* bubbles */ -- umull &tmp23&.8h, &dst1&.8b, &tmp8&.8b -+ umull \()\tmp23\().8h, \()\dst1\().8b, \()\tmp8\().8b - /* bubbles */ -- urshr &tmp45&.8h, &tmp01&.8h, #8 -- urshr &tmp67&.8h, &tmp23&.8h, #8 -+ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8 -+ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8 - /* bubbles */ -- raddhn &dst0&.8b, &tmp45&.8h, &tmp01&.8h -- raddhn &dst1&.8b, &tmp67&.8h, &tmp23&.8h -- mov &dst01&.d[0], &dst0&.d[0] -- mov &dst01&.d[1], &dst1&.d[0] -+ raddhn \()\dst0\().8b, \()\tmp45\().8h, \()\tmp01\().8h -+ raddhn \()\dst1\().8b, \()\tmp67\().8h, \()\tmp23\().8h -+ mov \()\dst01\().d[0], \()\dst0\().d[0] -+ mov \()\dst01\().d[1], \()\dst1\().d[0] - /* bubbles */ -- uqadd &src0&.8b, &dst0&.8b, &src0&.8b -- uqadd &src1&.8b, &dst1&.8b, &src1&.8b -- mov &src01&.d[0], &src0&.d[0] -- mov &src01&.d[1], &src1&.d[0] -+ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b -+ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b -+ mov \()\src01\().d[0], \()\src0\().d[0] -+ mov \()\src01\().d[1], \()\src1\().d[0] - .endm - - .macro bilinear_combine_add \ - numpix, src0, src1, src01, dst0, dst1, dst01, \ - tmp01, tmp23, tmp45, tmp67, tmp8 - -- uqadd &src0&.8b, &dst0&.8b, &src0&.8b -- uqadd &src1&.8b, &dst1&.8b, &src1&.8b -- mov &src01&.d[0], &src0&.d[0] -- mov &src01&.d[1], &src1&.d[0] -+ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b -+ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b -+ mov \()\src01\().d[0], \()\src0\().d[0] -+ mov \()\src01\().d[1], \()\src1\().d[0] - .endm - - .macro bilinear_combine \ - op, numpix, src0, src1, src01, dst0, dst1, 
dst01, \ - tmp01, tmp23, tmp45, tmp67, tmp8 - -- bilinear_combine_&op \ -- numpix, src0, src1, src01, dst0, dst1, dst01, \ -- tmp01, tmp23, tmp45, tmp67, tmp8 -+ bilinear_combine_\()\op \ -+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \ -+ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8 - .endm - - /* - * Macros for final deinterleaving of destination pixels if needed. - */ - .macro bilinear_deinterleave numpix, dst0, dst1, dst01 -- vuzp &dst0&.8b, &dst1&.8b -+ vuzp \()\dst0\().8b, \()\dst1\().8b - /* bubbles */ -- vuzp &dst0&.8b, &dst1&.8b -- mov &dst01&.d[0], &dst0&.d[0] -- mov &dst01&.d[1], &dst1&.d[0] -+ vuzp \()\dst0\().8b, \()\dst1\().8b -+ mov \()\dst01\().d[0], \()\dst0\().d[0] -+ mov \()\dst01\().d[1], \()\dst1\().d[0] - .endm - - .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 - .endm - - .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01 -- bilinear_deinterleave numpix, dst0, dst1, dst01 -+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 -- bilinear_deinterleave numpix, dst0, dst1, dst01 -+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 -- bilinear_deinterleave numpix, dst0, dst1, dst01 -+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01 -- bilinear_deinterleave numpix, dst0, dst1, dst01 -+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01 -- bilinear_deinterleave numpix, dst0, dst1, dst01 -+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01 -- bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01 -+ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 - .endm - - - .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op -- bilinear_load_&src_fmt v0, v1, v2 -- bilinear_load_mask mask_fmt, 1, v4 -- bilinear_load_dst dst_fmt, op, 1, v18, v19, v9 -+ bilinear_load_\()\src_fmt v0, v1, v2 -+ bilinear_load_mask \mask_fmt, 1, v4 -+ bilinear_load_dst \dst_fmt, \op, 1, v18, v19, v9 - umull v2.8h, v0.8b, v28.8b - umlal v2.8h, v1.8b, v29.8b - /* 5 cycles bubble */ - ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS - umlsl v0.4s, v2.4h, v15.h[0] - umlal2 v0.4s, v2.8h, v15.h[0] - /* 5 cycles bubble */ -- bilinear_duplicate_mask mask_fmt, 1, v4 -+ bilinear_duplicate_mask \mask_fmt, 1, v4 - shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - /* 3 cycles bubble */ - xtn v0.8b, v0.8h - /* 1 cycle bubble */ - bilinear_interleave_src_dst \ -- mask_fmt, op, 1, v0, v1, v0, v18, v19, v9 -+ \mask_fmt, \op, 1, v0, v1, v0, v18, v19, v9 - bilinear_apply_mask_to_src \ -- mask_fmt, 1, v0, v1, v0, v4, \ -+ \mask_fmt, 1, v0, v1, v0, v4, \ - v3, v8, v10, v11 - bilinear_combine \ -- op, 1, v0, v1, v0, v18, v19, v9, \ -+ \op, 1, v0, v1, v0, v18, v19, v9, \ - v3, v8, v10, v11, v5 -- bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0 -- bilinear_store_&dst_fmt 1, v17, v18 -+ bilinear_deinterleave_dst \mask_fmt, \op, 1, v0, v1, v0 -+ bilinear_store_\()\dst_fmt 1, v17, v18 - .endm - - .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op -- bilinear_load_and_vertical_interpolate_two_&src_fmt \ -+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ - v1, v11, v18, v19, v20, v21, v22, v23 -- 
bilinear_load_mask mask_fmt, 2, v4 -- bilinear_load_dst dst_fmt, op, 2, v18, v19, v9 -+ bilinear_load_mask \mask_fmt, 2, v4 -+ bilinear_load_dst \dst_fmt, \op, 2, v18, v19, v9 - ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS - umlsl v0.4s, v1.4h, v15.h[0] - umlal2 v0.4s, v1.8h, v15.h[0] - ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS - umlsl v10.4s, v11.4h, v15.h[4] - umlal2 v10.4s, v11.8h, v15.h[4] - shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) -- bilinear_duplicate_mask mask_fmt, 2, v4 -+ bilinear_duplicate_mask \mask_fmt, 2, v4 - ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) - add v12.8h, v12.8h, v13.8h - xtn v0.8b, v0.8h - bilinear_interleave_src_dst \ -- mask_fmt, op, 2, v0, v1, v0, v18, v19, v9 -+ \mask_fmt, \op, 2, v0, v1, v0, v18, v19, v9 - bilinear_apply_mask_to_src \ -- mask_fmt, 2, v0, v1, v0, v4, \ -+ \mask_fmt, 2, v0, v1, v0, v4, \ - v3, v8, v10, v11 - bilinear_combine \ -- op, 2, v0, v1, v0, v18, v19, v9, \ -+ \op, 2, v0, v1, v0, v18, v19, v9, \ - v3, v8, v10, v11, v5 -- bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0 -- bilinear_store_&dst_fmt 2, v16, v17 -+ bilinear_deinterleave_dst \mask_fmt, \op, 2, v0, v1, v0 -+ bilinear_store_\()\dst_fmt 2, v16, v17 - .endm - - .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op -- bilinear_load_and_vertical_interpolate_four_&src_fmt \ -- v1, v11, v4, v5, v6, v7, v22, v23 \ -+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ -+ v1, v11, v4, v5, v6, v7, v22, v23, \ - v3, v9, v16, v17, v20, v21, v18, v19 - prfm PREFETCH_MODE, [TMP1, PF_OFFS] - sub TMP1, TMP1, STRIDE - prfm PREFETCH_MODE, [TMP1, PF_OFFS] - ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS - umlsl v0.4s, v1.4h, v15.h[0] - umlal2 v0.4s, v1.8h, v15.h[0] - ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS -@@ -575,33 +576,33 @@ - ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS - umlsl v8.4s, v9.4h, v15.h[4] - umlal2 v8.4s, v9.8h, v15.h[4] - add v12.8h, v12.8h, v13.8h - shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) -- bilinear_load_mask mask_fmt, 4, v4 -- bilinear_duplicate_mask mask_fmt, 4, v4 -+ bilinear_load_mask \mask_fmt, 4, v4 -+ bilinear_duplicate_mask \mask_fmt, 4, v4 - ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) - xtn v0.8b, v0.8h - xtn v1.8b, v2.8h - add v12.8h, v12.8h, v13.8h -- bilinear_load_dst dst_fmt, op, 4, v2, v3, v21 -+ bilinear_load_dst \dst_fmt, \op, 4, v2, v3, v21 - bilinear_interleave_src_dst \ -- mask_fmt, op, 4, v0, v1, v0, v2, v3, v11 -+ \mask_fmt, \op, 4, v0, v1, v0, v2, v3, v11 - bilinear_apply_mask_to_src \ -- mask_fmt, 4, v0, v1, v0, v4, \ -+ \mask_fmt, 4, v0, v1, v0, v4, \ - v6, v8, v9, v10 - bilinear_combine \ -- op, 4, v0, v1, v0, v2, v3, v1, \ -+ \op, 4, v0, v1, v0, v2, v3, v1, \ - v6, v8, v9, v10, v23 -- bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0 -- bilinear_store_&dst_fmt 4, v6, v7 -+ bilinear_deinterleave_dst \mask_fmt, \op, 4, v0, v1, v0 -+ bilinear_store_\()\dst_fmt 4, v6, v7 - .endm - - .set BILINEAR_FLAG_USE_MASK, 1 - .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 - - /* - * Main template macro for generating NEON optimized bilinear scanline functions. 
- * -@@ -631,24 +632,24 @@ - bilinear_process_four_pixels, \ - bilinear_process_pixblock_head, \ - bilinear_process_pixblock_tail, \ - bilinear_process_pixblock_tail_head, \ - pixblock_size, \ - prefetch_distance, \ - flags - --pixman_asm_function fname --.if pixblock_size == 8 --.elseif pixblock_size == 4 -+pixman_asm_function \fname -+.if \pixblock_size == 8 -+.elseif \pixblock_size == 4 - .else - .error unsupported pixblock size - .endif - --.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 -+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0 - OUT .req x0 - TOP .req x1 - BOTTOM .req x2 - WT .req x3 - WWT .req w3 - WB .req x4 - WWB .req w4 - X .req w5 -@@ -694,32 +695,32 @@ pixman_asm_function fname - PF_OFFS .req x12 - TMP3 .req x13 - WTMP3 .req w13 - TMP4 .req x14 - WTMP4 .req w14 - STRIDE .req x15 - DUMMY .req x30 - -- .set prefetch_offset, prefetch_distance -+ .set prefetch_offset, \prefetch_distance - - stp x29, x30, [sp, -16]! - mov x29, sp - sub x29, x29, 64 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 - stp x10, x11, [x29, -80] - stp x12, x13, [x29, -96] - stp x14, x15, [x29, -112] - str x8, [x29, -120] - ldr w8, [x29, 16] - sub sp, sp, 120 - .endif - -- mov WTMP1, #prefetch_distance -+ mov WTMP1, #\prefetch_distance - umull PF_OFFS, WTMP1, UX - - sub STRIDE, BOTTOM, TOP - .unreq BOTTOM - - cmp WIDTH, #0 - ble 300f - -@@ -730,73 +731,73 @@ pixman_asm_function fname - mov v25.d[0], v12.d[1] - mov v26.d[0], v13.d[0] - add v25.4h, v25.4h, v26.4h - mov v12.d[1], v25.d[0] - - /* ensure good destination alignment */ - cmp WIDTH, #1 - blt 100f -- tst OUT, #(1 << dst_bpp_shift) -+ tst OUT, #(1 << \dst_bpp_shift) - beq 100f - ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) - add v12.8h, v12.8h, v13.8h -- bilinear_process_last_pixel -+ \bilinear_process_last_pixel - sub WIDTH, WIDTH, #1 - 100: - add v13.8h, v13.8h, v13.8h - ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) - add v12.8h, v12.8h, v13.8h - - cmp WIDTH, #2 - blt 100f -- tst OUT, #(1 << (dst_bpp_shift + 1)) -+ tst OUT, #(1 << (\dst_bpp_shift + 1)) - beq 100f -- bilinear_process_two_pixels -+ \bilinear_process_two_pixels - sub WIDTH, WIDTH, #2 - 100: --.if pixblock_size == 8 -+.if \pixblock_size == 8 - cmp WIDTH, #4 - blt 100f -- tst OUT, #(1 << (dst_bpp_shift + 2)) -+ tst OUT, #(1 << (\dst_bpp_shift + 2)) - beq 100f -- bilinear_process_four_pixels -+ \bilinear_process_four_pixels - sub WIDTH, WIDTH, #4 - 100: - .endif -- subs WIDTH, WIDTH, #pixblock_size -+ subs WIDTH, WIDTH, #\pixblock_size - blt 100f -- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) -- bilinear_process_pixblock_head -- subs WIDTH, WIDTH, #pixblock_size -+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) -+ \bilinear_process_pixblock_head -+ subs WIDTH, WIDTH, #\pixblock_size - blt 500f - 0: -- bilinear_process_pixblock_tail_head -- subs WIDTH, WIDTH, #pixblock_size -+ \bilinear_process_pixblock_tail_head -+ subs WIDTH, WIDTH, #\pixblock_size - bge 0b - 500: -- bilinear_process_pixblock_tail -+ \bilinear_process_pixblock_tail - 100: --.if pixblock_size == 8 -+.if \pixblock_size == 8 - tst WIDTH, #4 - beq 200f -- bilinear_process_four_pixels -+ \bilinear_process_four_pixels - 200: - .endif - /* handle the remaining trailing pixels */ - tst WIDTH, #2 - beq 200f -- bilinear_process_two_pixels -+ \bilinear_process_two_pixels - 200: - tst WIDTH, #1 - beq 300f -- bilinear_process_last_pixel -+ \bilinear_process_last_pixel - 300: - --.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 -+.if ((\flags) & 
BILINEAR_FLAG_USE_MASK) == 0 - sub x29, x29, 64 - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 - ldp x10, x11, [x29, -80] - ldp x12, x13, [x29, -96] - ldp x14, x15, [x29, -112] - mov sp, x29 - ldp x29, x30, [sp], 16 -@@ -824,21 +825,21 @@ 300: - .unreq WIDTH - .unreq TMP1 - .unreq WTMP1 - .unreq TMP2 - .unreq PF_OFFS - .unreq TMP3 - .unreq TMP4 - .unreq STRIDE --.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0 -+.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0 - .unreq MASK - .endif - --.endfunc -+pixman_end_asm_function - - .endm - - /* src_8888_8_8888 */ - .macro bilinear_src_8888_8_8888_process_last_pixel - bilinear_interpolate_last_pixel 8888, 8, 8888, src - .endm - -diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S ---- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S -+++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S -@@ -262,64 +262,64 @@ - uqadd v18.8b, v0.8b, v22.8b - uqadd v19.8b, v1.8b, v23.8b - shrn v6.8b, v4.8h, #8 - fetch_src_pixblock - shrn v7.8b, v4.8h, #3 - sli v4.8h, v4.8h, #5 - ushll v14.8h, v17.8b, #7 - sli v14.8h, v14.8h, #1 -- PF add PF_X, PF_X, #8 -+ PF add, PF_X, PF_X, #8 - ushll v8.8h, v19.8b, #7 - sli v8.8h, v8.8h, #1 -- PF tst PF_CTL, #0xF -+ PF tst, PF_CTL, #0xF - sri v6.8b, v6.8b, #5 -- PF beq 10f -- PF add PF_X, PF_X, #8 -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 - 10: - mvn v3.8b, v3.8b -- PF beq 10f -- PF sub PF_CTL, PF_CTL, #1 -+ PF beq, 10f -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - sri v7.8b, v7.8b, #6 - shrn v30.8b, v4.8h, #2 - umull v10.8h, v3.8b, v6.8b -- PF lsl DUMMY, PF_X, #src_bpp_shift -- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] -+ PF lsl, DUMMY, PF_X, #src_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] - umull v11.8h, v3.8b, v7.8b - umull v12.8h, v3.8b, v30.8b -- PF lsl DUMMY, PF_X, #dst_bpp_shift -- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] -+ PF lsl, DUMMY, PF_X, #dst_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] - sri v14.8h, v8.8h, #5 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - ushll v9.8h, v18.8b, #7 - sli v9.8h, v9.8h, #1 - urshr v17.8h, v10.8h, #8 -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W - 10: - urshr v19.8h, v11.8h, #8 - urshr v18.8h, v12.8h, #8 -- PF ble 10f -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF ble, 10f -+ PF subs, PF_CTL, PF_CTL, #0x10 - 10: - sri v14.8h, v9.8h, #11 - mov v28.d[0], v14.d[0] - mov v29.d[0], v14.d[1] -- PF ble 10f -- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift -- PF ldrsb DUMMY, [PF_SRC, DUMMY] -- PF add PF_SRC, PF_SRC, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift -+ PF ldrsb, DUMMY, [PF_SRC, DUMMY] -+ PF add, PF_SRC, PF_SRC, #1 - 10: - raddhn v20.8b, v10.8h, v17.8h - raddhn v23.8b, v11.8h, v19.8h -- PF ble 10f -- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift -- PF ldrsb DUMMY, [PF_DST, DUMMY] -- PF add PF_DST, PF_SRC, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift -+ PF ldrsb, DUMMY, [PF_DST, DUMMY] -+ PF add, PF_DST, PF_SRC, #1 - 10: - raddhn v22.8b, v12.8h, v18.8h - st1 {v14.8h}, [DST_W], #16 - .endm - - #else - - /* If we did not care much about the performance, we would just use this... 
*/ -@@ -469,42 +469,42 @@ generate_composite_function \ - sri v14.8h, v8.8h, #5 - sri v14.8h, v9.8h, #11 - mov v28.d[0], v14.d[0] - mov v29.d[0], v14.d[1] - .endm - - .macro pixman_composite_src_8888_0565_process_pixblock_tail_head - sri v14.8h, v8.8h, #5 -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF - fetch_src_pixblock -- PF beq 10f -- PF add PF_X, PF_X, #8 -- PF sub PF_CTL, PF_CTL, #1 -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - sri v14.8h, v9.8h, #11 - mov v28.d[0], v14.d[0] - mov v29.d[0], v14.d[1] -- PF cmp PF_X, ORIG_W -- PF lsl DUMMY, PF_X, #src_bpp_shift -- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] -+ PF cmp, PF_X, ORIG_W -+ PF lsl, DUMMY, PF_X, #src_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] - ushll v8.8h, v1.8b, #7 - sli v8.8h, v8.8h, #1 - st1 {v14.8h}, [DST_W], #16 -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W -+ PF subs, PF_CTL, PF_CTL, #0x10 - 10: - ushll v14.8h, v2.8b, #7 - sli v14.8h, v14.8h, #1 -- PF ble 10f -- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift -- PF ldrsb DUMMY, [PF_SRC, DUMMY] -- PF add PF_SRC, PF_SRC, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift -+ PF ldrsb, DUMMY, [PF_SRC, DUMMY] -+ PF add, PF_SRC, PF_SRC, #1 - 10: - ushll v9.8h, v0.8b, #7 - sli v9.8h, v9.8h, #1 - .endm - - generate_composite_function \ - pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ -@@ -561,41 +561,41 @@ generate_composite_function \ - uqadd v31.8b, v3.8b, v7.8b - .endm - - .macro pixman_composite_add_8_8_process_pixblock_tail - .endm - - .macro pixman_composite_add_8_8_process_pixblock_tail_head - fetch_src_pixblock -- PF add PF_X, PF_X, #32 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #32 -+ PF tst, PF_CTL, #0xF - ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 -- PF beq 10f -- PF add PF_X, PF_X, #32 -- PF sub PF_CTL, PF_CTL, #1 -+ PF beq, 10f -+ PF add, PF_X, PF_X, #32 -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 -- PF cmp PF_X, ORIG_W -- PF lsl DUMMY, PF_X, #src_bpp_shift -- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] -- PF lsl DUMMY, PF_X, #dst_bpp_shift -- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF cmp, PF_X, ORIG_W -+ PF lsl, DUMMY, PF_X, #src_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] -+ PF lsl, DUMMY, PF_X, #dst_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W -+ PF subs, PF_CTL, PF_CTL, #0x10 - 10: - uqadd v28.8b, v0.8b, v4.8b -- PF ble 10f -- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift -- PF ldrsb DUMMY, [PF_SRC, DUMMY] -- PF add PF_SRC, PF_SRC, #1 -- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift -- PF ldrsb DUMMY, [PF_DST, DUMMY] -- PF add PF_DST, PF_DST, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift -+ PF ldrsb, DUMMY, [PF_SRC, DUMMY] -+ PF add, PF_SRC, PF_SRC, #1 -+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift -+ PF ldrsb, DUMMY, [PF_DST, DUMMY] -+ PF add, PF_DST, PF_DST, #1 - 10: - uqadd v29.8b, v1.8b, v5.8b - uqadd v30.8b, v2.8b, v6.8b - uqadd v31.8b, v3.8b, v7.8b - .endm - - generate_composite_function \ - pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ -@@ -607,41 +607,41 @@ generate_composite_function \ - pixman_composite_add_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_process_pixblock_tail, \ - pixman_composite_add_8_8_process_pixblock_tail_head 
- - /******************************************************************************/ - - .macro pixman_composite_add_8888_8888_process_pixblock_tail_head - fetch_src_pixblock -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF - ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 -- PF beq 10f -- PF add PF_X, PF_X, #8 -- PF sub PF_CTL, PF_CTL, #1 -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 -- PF cmp PF_X, ORIG_W -- PF lsl DUMMY, PF_X, #src_bpp_shift -- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] -- PF lsl DUMMY, PF_X, #dst_bpp_shift -- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF cmp, PF_X, ORIG_W -+ PF lsl, DUMMY, PF_X, #src_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] -+ PF lsl, DUMMY, PF_X, #dst_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W -+ PF subs, PF_CTL, PF_CTL, #0x10 - 10: - uqadd v28.8b, v0.8b, v4.8b -- PF ble 10f -- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift -- PF ldrsb DUMMY, [PF_SRC, DUMMY] -- PF add PF_SRC, PF_SRC, #1 -- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift -- PF ldrsb DUMMY, [PF_DST, DUMMY] -- PF add PF_DST, PF_DST, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift -+ PF ldrsb, DUMMY, [PF_SRC, DUMMY] -+ PF add, PF_SRC, PF_SRC, #1 -+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift -+ PF ldrsb, DUMMY, [PF_DST, DUMMY] -+ PF add, PF_DST, PF_DST, #1 - 10: - uqadd v29.8b, v1.8b, v5.8b - uqadd v30.8b, v2.8b, v6.8b - uqadd v31.8b, v3.8b, v7.8b - .endm - - generate_composite_function \ - pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ -@@ -684,55 +684,55 @@ generate_composite_function_single_scanl - raddhn v29.8b, v15.8h, v9.8h - raddhn v30.8b, v16.8h, v10.8h - raddhn v31.8b, v17.8h, v11.8h - .endm - - .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head - ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 - urshr v14.8h, v8.8h, #8 -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF - urshr v15.8h, v9.8h, #8 - urshr v16.8h, v10.8h, #8 - urshr v17.8h, v11.8h, #8 -- PF beq 10f -- PF add PF_X, PF_X, #8 -- PF sub PF_CTL, PF_CTL, #1 -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - raddhn v28.8b, v14.8h, v8.8h - raddhn v29.8b, v15.8h, v9.8h -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - raddhn v30.8b, v16.8h, v10.8h - raddhn v31.8b, v17.8h, v11.8h - fetch_src_pixblock -- PF lsl DUMMY, PF_X, #src_bpp_shift -- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] -+ PF lsl, DUMMY, PF_X, #src_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] - mvn v22.8b, v3.8b -- PF lsl DUMMY, PF_X, #dst_bpp_shift -- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] -+ PF lsl, DUMMY, PF_X, #dst_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] - st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W - 10: - umull v8.8h, v22.8b, v4.8b -- PF ble 10f -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF ble, 10f -+ PF subs, PF_CTL, PF_CTL, #0x10 - 10: - umull v9.8h, v22.8b, v5.8b -- PF ble 10f -- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift -- PF ldrsb DUMMY, [PF_SRC, DUMMY] -- PF add PF_SRC, PF_SRC, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift -+ PF ldrsb, DUMMY, [PF_SRC, DUMMY] -+ PF add, PF_SRC, PF_SRC, #1 - 10: - umull v10.8h, v22.8b, v6.8b -- PF ble 10f -- PF lsl DUMMY, DST_STRIDE, 
#dst_bpp_shift -- PF ldrsb DUMMY, [PF_DST, DUMMY] -- PF add PF_DST, PF_DST, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift -+ PF ldrsb, DUMMY, [PF_DST, DUMMY] -+ PF add, PF_DST, PF_DST, #1 - 10: - umull v11.8h, v22.8b, v7.8b - .endm - - generate_composite_function_single_scanline \ - pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ -@@ -754,59 +754,59 @@ generate_composite_function_single_scanl - uqadd v29.8b, v1.8b, v29.8b - uqadd v30.8b, v2.8b, v30.8b - uqadd v31.8b, v3.8b, v31.8b - .endm - - .macro pixman_composite_over_8888_8888_process_pixblock_tail_head - ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 - urshr v14.8h, v8.8h, #8 -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF - urshr v15.8h, v9.8h, #8 - urshr v16.8h, v10.8h, #8 - urshr v17.8h, v11.8h, #8 -- PF beq 10f -- PF add PF_X, PF_X, #8 -- PF sub PF_CTL, PF_CTL, #1 -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - raddhn v28.8b, v14.8h, v8.8h - raddhn v29.8b, v15.8h, v9.8h -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - raddhn v30.8b, v16.8h, v10.8h - raddhn v31.8b, v17.8h, v11.8h - uqadd v28.8b, v0.8b, v28.8b - uqadd v29.8b, v1.8b, v29.8b - uqadd v30.8b, v2.8b, v30.8b - uqadd v31.8b, v3.8b, v31.8b - fetch_src_pixblock -- PF lsl DUMMY, PF_X, #src_bpp_shift -- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] -+ PF lsl, DUMMY, PF_X, #src_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] - mvn v22.8b, v3.8b -- PF lsl DUMMY, PF_X, #dst_bpp_shift -- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] -+ PF lsl, DUMMY, PF_X, #dst_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] - st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W - 10: - umull v8.8h, v22.8b, v4.8b -- PF ble 10f -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF ble, 10f -+ PF subs, PF_CTL, PF_CTL, #0x10 - 10: - umull v9.8h, v22.8b, v5.8b -- PF ble 10f -- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift -- PF ldrsb DUMMY, [PF_SRC, DUMMY] -- PF add PF_SRC, PF_SRC, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift -+ PF ldrsb, DUMMY, [PF_SRC, DUMMY] -+ PF add, PF_SRC, PF_SRC, #1 - 10: - umull v10.8h, v22.8b, v6.8b -- PF ble 10f -- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift -- PF ldrsb DUMMY, [PF_DST, DUMMY] -- PF add PF_DST, PF_DST, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift -+ PF ldrsb, DUMMY, [PF_DST, DUMMY] -+ PF add, PF_DST, PF_DST, #1 - 10: - umull v11.8h, v22.8b, v7.8b - .endm - - generate_composite_function \ - pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ -@@ -860,40 +860,40 @@ generate_composite_function_single_scanl - urshr v16.8h, v10.8h, #8 - urshr v17.8h, v11.8h, #8 - raddhn v28.8b, v14.8h, v8.8h - raddhn v29.8b, v15.8h, v9.8h - raddhn v30.8b, v16.8h, v10.8h - raddhn v31.8b, v17.8h, v11.8h - ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 - uqadd v28.8b, v0.8b, v28.8b -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0x0F -- PF beq 10f -- PF add PF_X, PF_X, #8 -- PF sub PF_CTL, PF_CTL, #1 -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0x0F -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - uqadd v29.8b, v1.8b, v29.8b - uqadd v30.8b, v2.8b, v30.8b - uqadd v31.8b, v3.8b, v31.8b -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - umull 
v8.8h, v24.8b, v4.8b -- PF lsl DUMMY, PF_X, #dst_bpp_shift -- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] -+ PF lsl, DUMMY, PF_X, #dst_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] - umull v9.8h, v24.8b, v5.8b -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W - 10: - umull v10.8h, v24.8b, v6.8b -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF subs, PF_CTL, PF_CTL, #0x10 - umull v11.8h, v24.8b, v7.8b -- PF ble 10f -- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift -- PF ldrsb DUMMY, [PF_DST, DUMMY] -- PF add PF_DST, PF_DST, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift -+ PF ldrsb, DUMMY, [PF_DST, DUMMY] -+ PF add, PF_DST, PF_DST, #1 - 10: - st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 - .endm - - .macro pixman_composite_over_n_8888_init - mov v3.s[0], w4 - dup v0.8b, v3.b[0] - dup v1.8b, v3.b[1] -@@ -912,52 +912,52 @@ generate_composite_function \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_n_8888_process_pixblock_tail_head - - /******************************************************************************/ - - .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head - urshr v14.8h, v8.8h, #8 -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF - urshr v15.8h, v9.8h, #8 - urshr v12.8h, v10.8h, #8 - urshr v13.8h, v11.8h, #8 -- PF beq 10f -- PF add PF_X, PF_X, #8 -- PF sub PF_CTL, PF_CTL, #1 -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - raddhn v28.8b, v14.8h, v8.8h - raddhn v29.8b, v15.8h, v9.8h -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - raddhn v30.8b, v12.8h, v10.8h - raddhn v31.8b, v13.8h, v11.8h - uqadd v28.8b, v0.8b, v28.8b - uqadd v29.8b, v1.8b, v29.8b - uqadd v30.8b, v2.8b, v30.8b - uqadd v31.8b, v3.8b, v31.8b - ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32 - mvn v22.8b, v3.8b -- PF lsl DUMMY, PF_X, #dst_bpp_shift -- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] -+ PF lsl, DUMMY, PF_X, #dst_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] - st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 -- PF blt 10f -- PF sub PF_X, PF_X, ORIG_W -+ PF blt, 10f -+ PF sub, PF_X, PF_X, ORIG_W - 10: - umull v8.8h, v22.8b, v4.8b -- PF blt 10f -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF blt, 10f -+ PF subs, PF_CTL, PF_CTL, #0x10 - 10: - umull v9.8h, v22.8b, v5.8b - umull v10.8h, v22.8b, v6.8b -- PF blt 10f -- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift -- PF ldrsb DUMMY, [PF_DST, DUMMY] -- PF add PF_DST, PF_DST, #1 -+ PF blt, 10f -+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift -+ PF ldrsb, DUMMY, [PF_DST, DUMMY] -+ PF add, PF_DST, PF_DST, #1 - 10: - umull v11.8h, v22.8b, v7.8b - .endm - - .macro pixman_composite_over_reverse_n_8888_init - mov v7.s[0], w4 - dup v4.8b, v7.b[0] - dup v5.8b, v7.b[1] -@@ -1405,45 +1405,45 @@ generate_composite_function \ - rshrn v28.8b, v8.8h, #8 - rshrn v29.8b, v9.8h, #8 - rshrn v30.8b, v10.8h, #8 - rshrn v31.8b, v11.8h, #8 - .endm - - .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head - fetch_mask_pixblock -- PF add PF_X, PF_X, #8 -+ PF add, PF_X, PF_X, #8 - rshrn v28.8b, v8.8h, #8 -- PF tst PF_CTL, #0x0F -+ PF tst, PF_CTL, #0x0F - rshrn v29.8b, v9.8h, #8 -- PF beq 10f -- PF add PF_X, PF_X, #8 -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 - 10: - rshrn v30.8b, v10.8h, #8 -- PF beq 10f -- PF sub PF_CTL, PF_CTL, #1 -+ PF beq, 10f -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - rshrn v31.8b, v11.8h, #8 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - 
umull v8.8h, v24.8b, v0.8b -- PF lsl DUMMY, PF_X, #mask_bpp_shift -- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] -+ PF lsl, DUMMY, PF_X, #mask_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] - umull v9.8h, v24.8b, v1.8b -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W - 10: - umull v10.8h, v24.8b, v2.8b -- PF ble 10f -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF ble, 10f -+ PF subs, PF_CTL, PF_CTL, #0x10 - 10: - umull v11.8h, v24.8b, v3.8b -- PF ble 10f -- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift -- PF ldrsb DUMMY, [PF_MASK, DUMMY] -- PF add PF_MASK, PF_MASK, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift -+ PF ldrsb, DUMMY, [PF_MASK, DUMMY] -+ PF add, PF_MASK, PF_MASK, #1 - 10: - st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 - ursra v8.8h, v8.8h, #8 - ursra v9.8h, v9.8h, #8 - ursra v10.8h, v10.8h, #8 - ursra v11.8h, v11.8h, #8 - .endm - -@@ -1486,45 +1486,45 @@ generate_composite_function \ - rshrn v28.8b, v0.8h, #8 - rshrn v29.8b, v1.8h, #8 - rshrn v30.8b, v2.8h, #8 - rshrn v31.8b, v3.8h, #8 - .endm - - .macro pixman_composite_src_n_8_8_process_pixblock_tail_head - fetch_mask_pixblock -- PF add PF_X, PF_X, #8 -+ PF add, PF_X, PF_X, #8 - rshrn v28.8b, v0.8h, #8 -- PF tst PF_CTL, #0x0F -+ PF tst, PF_CTL, #0x0F - rshrn v29.8b, v1.8h, #8 -- PF beq 10f -- PF add PF_X, PF_X, #8 -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 - 10: - rshrn v30.8b, v2.8h, #8 -- PF beq 10f -- PF sub PF_CTL, PF_CTL, #1 -+ PF beq, 10f -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - rshrn v31.8b, v3.8h, #8 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - umull v0.8h, v24.8b, v16.8b -- PF lsl DUMMY, PF_X, mask_bpp_shift -- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] -+ PF lsl, DUMMY, PF_X, mask_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] - umull v1.8h, v25.8b, v16.8b -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W - 10: - umull v2.8h, v26.8b, v16.8b -- PF ble 10f -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF ble, 10f -+ PF subs, PF_CTL, PF_CTL, #0x10 - 10: - umull v3.8h, v27.8b, v16.8b -- PF ble 10f -- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift -- PF ldrsb DUMMY, [PF_MASK, DUMMY] -- PF add PF_MASK, PF_MASK, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift -+ PF ldrsb, DUMMY, [PF_MASK, DUMMY] -+ PF add, PF_MASK, PF_MASK, #1 - 10: - st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 - ursra v0.8h, v0.8h, #8 - ursra v1.8h, v1.8h, #8 - ursra v2.8h, v2.8h, #8 - ursra v3.8h, v3.8h, #8 - .endm - -@@ -1594,54 +1594,54 @@ generate_composite_function \ - .endm - - .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head - urshr v16.8h, v12.8h, #8 - ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 - urshr v17.8h, v13.8h, #8 - fetch_mask_pixblock - urshr v18.8h, v14.8h, #8 -- PF add PF_X, PF_X, #8 -+ PF add, PF_X, PF_X, #8 - urshr v19.8h, v15.8h, #8 -- PF tst PF_CTL, #0x0F -+ PF tst, PF_CTL, #0x0F - raddhn v28.8b, v16.8h, v12.8h -- PF beq 10f -- PF add PF_X, PF_X, #8 -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 - 10: - raddhn v29.8b, v17.8h, v13.8h -- PF beq 10f -- PF sub PF_CTL, PF_CTL, #1 -+ PF beq, 10f -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - raddhn v30.8b, v18.8h, v14.8h -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - raddhn v31.8b, v19.8h, v15.8h -- PF lsl DUMMY, PF_X, #dst_bpp_shift -- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] -+ PF lsl, DUMMY, PF_X, #dst_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] - umull v16.8h, v24.8b, v8.8b -- PF lsl DUMMY, PF_X, #mask_bpp_shift -- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] -+ PF lsl, 
DUMMY, PF_X, #mask_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] - umull v17.8h, v24.8b, v9.8b -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W - 10: - umull v18.8h, v24.8b, v10.8b -- PF ble 10f -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF ble, 10f -+ PF subs, PF_CTL, PF_CTL, #0x10 - 10: - umull v19.8h, v24.8b, v11.8b -- PF ble 10f -- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift -- PF ldrsb DUMMY, [PF_DST, DUMMY] -- PF add PF_DST, PF_DST, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift -+ PF ldrsb, DUMMY, [PF_DST, DUMMY] -+ PF add, PF_DST, PF_DST, #1 - 10: - uqadd v28.8b, v0.8b, v28.8b -- PF ble 10f -- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift -- PF ldrsb DUMMY, [PF_MASK, DUMMY] -- PF add PF_MASK, PF_MASK, #1 -+ PF ble, 10f -+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift -+ PF ldrsb, DUMMY, [PF_MASK, DUMMY] -+ PF add, PF_MASK, PF_MASK, #1 - 10: - uqadd v29.8b, v1.8b, v29.8b - uqadd v30.8b, v2.8b, v30.8b - uqadd v31.8b, v3.8b, v31.8b - urshr v12.8h, v16.8h, #8 - urshr v13.8h, v17.8h, #8 - urshr v14.8h, v18.8h, #8 - urshr v15.8h, v19.8h, #8 -@@ -2407,17 +2407,17 @@ generate_composite_function \ - generate_composite_function_single_scanline \ - pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ - pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ -- pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ -+ pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - - /******************************************************************************/ - - .macro pixman_composite_over_8888_n_8888_process_pixblock_head -@@ -2482,31 +2482,31 @@ generate_composite_function \ - pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ -- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ -+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - - generate_composite_function_single_scanline \ - pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ -- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ -+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 12 /* mask_basereg */ - - /******************************************************************************/ - - /* TODO: expand macros and do better instructions scheduling */ -@@ -2524,17 +2524,17 @@ generate_composite_function \ - 
pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init_need_all_regs, \ - default_cleanup_need_all_regs, \ - pixman_composite_over_8888_n_8888_process_pixblock_head, \ - pixman_composite_over_8888_n_8888_process_pixblock_tail, \ -- pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ -+ pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \ - 28, /* dst_w_basereg */ \ - 4, /* dst_r_basereg */ \ - 0, /* src_basereg */ \ - 15 /* mask_basereg */ - - /******************************************************************************/ - - .macro pixman_composite_src_0888_0888_process_pixblock_head -@@ -2675,38 +2675,38 @@ generate_composite_function \ - urshr v11.8h, v8.8h, #8 - mov v30.8b, v31.8b - mov v31.8b, v3.8b - mov v3.8b, v31.8b - urshr v12.8h, v9.8h, #8 - urshr v13.8h, v10.8h, #8 - fetch_src_pixblock - raddhn v30.8b, v11.8h, v8.8h -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -- PF beq 10f -- PF add PF_X, PF_X, #8 -- PF sub PF_CTL, PF_CTL, #1 -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - raddhn v29.8b, v12.8h, v9.8h - raddhn v28.8b, v13.8h, v10.8h - umull v8.8h, v3.8b, v0.8b - umull v9.8h, v3.8b, v1.8b - umull v10.8h, v3.8b, v2.8b - st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 -- PF cmp PF_X, ORIG_W -- PF lsl DUMMY, PF_X, src_bpp_shift -- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -- PF subs PF_CTL, PF_CTL, #0x10 -- PF ble 10f -- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift -- PF ldrsb DUMMY, [PF_SRC, DUMMY] -- PF add PF_SRC, PF_SRC, #1 -+ PF cmp, PF_X, ORIG_W -+ PF lsl, DUMMY, PF_X, src_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W -+ PF subs, PF_CTL, PF_CTL, #0x10 -+ PF ble, 10f -+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift -+ PF ldrsb, DUMMY, [PF_SRC, DUMMY] -+ PF add, PF_SRC, PF_SRC, #1 - 10: - .endm - - generate_composite_function \ - pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ -@@ -2744,38 +2744,38 @@ generate_composite_function \ - urshr v11.8h, v8.8h, #8 - mov v30.8b, v31.8b - mov v31.8b, v3.8b - mov v3.8b, v30.8b - urshr v12.8h, v9.8h, #8 - urshr v13.8h, v10.8h, #8 - fetch_src_pixblock - raddhn v28.8b, v11.8h, v8.8h -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -- PF beq 10f -- PF add PF_X, PF_X, #8 -- PF sub PF_CTL, PF_CTL, #1 -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF -+ PF beq, 10f -+ PF add, PF_X, PF_X, #8 -+ PF sub, PF_CTL, PF_CTL, #1 - 10: - raddhn v29.8b, v12.8h, v9.8h - raddhn v30.8b, v13.8h, v10.8h - umull v8.8h, v3.8b, v0.8b - umull v9.8h, v3.8b, v1.8b - umull v10.8h, v3.8b, v2.8b - st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 -- PF cmp PF_X, ORIG_W -- PF lsl DUMMY, PF_X, src_bpp_shift -- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] -- PF ble 10f -- PF sub PF_X, PF_X, ORIG_W -- PF subs PF_CTL, PF_CTL, #0x10 -- PF ble 10f -- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift -- PF ldrsb DUMMY, [PF_SRC, DUMMY] -- PF add PF_SRC, PF_SRC, #1 -+ PF cmp, PF_X, ORIG_W -+ PF lsl, DUMMY, PF_X, src_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] -+ PF ble, 10f -+ PF sub, PF_X, PF_X, ORIG_W -+ PF subs, PF_CTL, PF_CTL, #0x10 -+ PF ble, 10f -+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift 
-+ PF ldrsb, DUMMY, [PF_SRC, DUMMY] -+ PF add, PF_SRC, PF_SRC, #1 - 10: - .endm - - generate_composite_function \ - pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ -@@ -3126,197 +3126,197 @@ generate_composite_function_nearest_scan - * format conversion, and interpolation as separate macros which can be used - * as the basic building blocks for constructing bilinear scanline functions. - */ - - .macro bilinear_load_8888 reg1, reg2, tmp - asr TMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #2 -- ld1 {®1&.2s}, [TMP1], STRIDE -- ld1 {®2&.2s}, [TMP1] -+ ld1 {\()\reg1\().2s}, [TMP1], STRIDE -+ ld1 {\()\reg2\().2s}, [TMP1] - .endm - - .macro bilinear_load_0565 reg1, reg2, tmp - asr TMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #1 -- ld1 {®2&.s}[0], [TMP1], STRIDE -- ld1 {®2&.s}[1], [TMP1] -- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp -+ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE -+ ld1 {\()\reg2\().s}[1], [TMP1] -+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp - .endm - - .macro bilinear_load_and_vertical_interpolate_two_8888 \ - acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 - -- bilinear_load_8888 reg1, reg2, tmp1 -- umull &acc1&.8h, ®1&.8b, v28.8b -- umlal &acc1&.8h, ®2&.8b, v29.8b -- bilinear_load_8888 reg3, reg4, tmp2 -- umull &acc2&.8h, ®3&.8b, v28.8b -- umlal &acc2&.8h, ®4&.8b, v29.8b -+ bilinear_load_8888 \reg1, \reg2, \tmp1 -+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b -+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b -+ bilinear_load_8888 \reg3, \reg4, \tmp2 -+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b -+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b - .endm - - .macro bilinear_load_and_vertical_interpolate_four_8888 \ -- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ -+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi - - bilinear_load_and_vertical_interpolate_two_8888 \ -- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi -+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi - bilinear_load_and_vertical_interpolate_two_8888 \ -- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi -+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi - .endm - - .macro vzip reg1, reg2 - umov TMP4, v31.d[0] -- zip1 v31.8b, reg1, reg2 -- zip2 reg2, reg1, reg2 -- mov reg1, v31.8b -+ zip1 v31.8b, \reg1, \reg2 -+ zip2 \reg2, \reg1, \reg2 -+ mov \reg1, v31.8b - mov v31.d[0], TMP4 - .endm - - .macro vuzp reg1, reg2 - umov TMP4, v31.d[0] -- uzp1 v31.8b, reg1, reg2 -- uzp2 reg2, reg1, reg2 -- mov reg1, v31.8b -+ uzp1 v31.8b, \reg1, \reg2 -+ uzp2 \reg2, \reg1, \reg2 -+ mov \reg1, v31.8b - mov v31.d[0], TMP4 - .endm - - .macro bilinear_load_and_vertical_interpolate_two_0565 \ - acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi - asr TMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #1 - asr TMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #1 -- ld1 {&acc2&.s}[0], [TMP1], STRIDE -- ld1 {&acc2&.s}[2], [TMP2], STRIDE -- ld1 {&acc2&.s}[1], [TMP1] -- ld1 {&acc2&.s}[3], [TMP2] -- convert_0565_to_x888 acc2, reg3, reg2, reg1 -- vzip ®1&.8b, ®3&.8b -- vzip ®2&.8b, ®4&.8b -- vzip ®3&.8b, ®4&.8b -- vzip ®1&.8b, ®2&.8b -- umull &acc1&.8h, ®1&.8b, v28.8b -- umlal &acc1&.8h, ®2&.8b, v29.8b -- umull &acc2&.8h, ®3&.8b, v28.8b -- umlal &acc2&.8h, ®4&.8b, v29.8b -+ ld1 {\()\acc2\().s}[0], [TMP1], 
STRIDE -+ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE -+ ld1 {\()\acc2\().s}[1], [TMP1] -+ ld1 {\()\acc2\().s}[3], [TMP2] -+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 -+ vzip \()\reg1\().8b, \()\reg3\().8b -+ vzip \()\reg2\().8b, \()\reg4\().8b -+ vzip \()\reg3\().8b, \()\reg4\().8b -+ vzip \()\reg1\().8b, \()\reg2\().8b -+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b -+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b -+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b -+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b - .endm - - .macro bilinear_load_and_vertical_interpolate_four_0565 \ -- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ -+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi - asr TMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #1 - asr TMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #1 -- ld1 {&xacc2&.s}[0], [TMP1], STRIDE -- ld1 {&xacc2&.s}[2], [TMP2], STRIDE -- ld1 {&xacc2&.s}[1], [TMP1] -- ld1 {&xacc2&.s}[3], [TMP2] -- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 -+ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE -+ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE -+ ld1 {\()\xacc2\().s}[1], [TMP1] -+ ld1 {\()\xacc2\().s}[3], [TMP2] -+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 - asr TMP1, X, #16 - add X, X, UX - add TMP1, TOP, TMP1, lsl #1 - asr TMP2, X, #16 - add X, X, UX - add TMP2, TOP, TMP2, lsl #1 -- ld1 {&yacc2&.s}[0], [TMP1], STRIDE -- vzip &xreg1&.8b, &xreg3&.8b -- ld1 {&yacc2&.s}[2], [TMP2], STRIDE -- vzip &xreg2&.8b, &xreg4&.8b -- ld1 {&yacc2&.s}[1], [TMP1] -- vzip &xreg3&.8b, &xreg4&.8b -- ld1 {&yacc2&.s}[3], [TMP2] -- vzip &xreg1&.8b, &xreg2&.8b -- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 -- umull &xacc1&.8h, &xreg1&.8b, v28.8b -- vzip &yreg1&.8b, &yreg3&.8b -- umlal &xacc1&.8h, &xreg2&.8b, v29.8b -- vzip &yreg2&.8b, &yreg4&.8b -- umull &xacc2&.8h, &xreg3&.8b, v28.8b -- vzip &yreg3&.8b, &yreg4&.8b -- umlal &xacc2&.8h, &xreg4&.8b, v29.8b -- vzip &yreg1&.8b, &yreg2&.8b -- umull &yacc1&.8h, &yreg1&.8b, v28.8b -- umlal &yacc1&.8h, &yreg2&.8b, v29.8b -- umull &yacc2&.8h, &yreg3&.8b, v28.8b -- umlal &yacc2&.8h, &yreg4&.8b, v29.8b -+ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE -+ vzip \()\xreg1\().8b, \()\xreg3\().8b -+ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE -+ vzip \()\xreg2\().8b, \()\xreg4\().8b -+ ld1 {\()\yacc2\().s}[1], [TMP1] -+ vzip \()\xreg3\().8b, \()\xreg4\().8b -+ ld1 {\()\yacc2\().s}[3], [TMP2] -+ vzip \()\xreg1\().8b, \()\xreg2\().8b -+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 -+ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b -+ vzip \()\yreg1\().8b, \()\yreg3\().8b -+ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b -+ vzip \()\yreg2\().8b, \()\yreg4\().8b -+ umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b -+ vzip \()\yreg3\().8b, \()\yreg4\().8b -+ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b -+ vzip \()\yreg1\().8b, \()\yreg2\().8b -+ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b -+ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b -+ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b -+ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b - .endm - - .macro bilinear_store_8888 numpix, tmp1, tmp2 --.if numpix == 4 -+.if \numpix == 4 - st1 {v0.2s, v1.2s}, [OUT], #16 --.elseif numpix == 2 -+.elseif \numpix == 2 - st1 {v0.2s}, [OUT], #8 --.elseif numpix == 1 -+.elseif \numpix == 1 - st1 {v0.s}[0], [OUT], #4 - .else -- .error bilinear_store_8888 numpix is unsupported -+ .error bilinear_store_8888 \numpix is unsupported - .endif - .endm - - .macro bilinear_store_0565 
numpix, tmp1, tmp2 - vuzp v0.8b, v1.8b - vuzp v2.8b, v3.8b - vuzp v1.8b, v3.8b - vuzp v0.8b, v2.8b -- convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 --.if numpix == 4 -+ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2 -+.if \numpix == 4 - st1 {v1.4h}, [OUT], #8 --.elseif numpix == 2 -+.elseif \numpix == 2 - st1 {v1.s}[0], [OUT], #4 --.elseif numpix == 1 -+.elseif \numpix == 1 - st1 {v1.h}[0], [OUT], #2 - .else -- .error bilinear_store_0565 numpix is unsupported -+ .error bilinear_store_0565 \numpix is unsupported - .endif - .endm - - .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt -- bilinear_load_&src_fmt v0, v1, v2 -+ bilinear_load_\()\src_fmt v0, v1, v2 - umull v2.8h, v0.8b, v28.8b - umlal v2.8h, v1.8b, v29.8b - /* 5 cycles bubble */ - ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS - umlsl v0.4s, v2.4h, v15.h[0] - umlal2 v0.4s, v2.8h, v15.h[0] - /* 5 cycles bubble */ - shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - /* 3 cycles bubble */ - xtn v0.8b, v0.8h - /* 1 cycle bubble */ -- bilinear_store_&dst_fmt 1, v3, v4 -+ bilinear_store_\()\dst_fmt 1, v3, v4 - .endm - - .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt -- bilinear_load_and_vertical_interpolate_two_&src_fmt \ -+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ - v1, v11, v2, v3, v20, v21, v22, v23 - ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS - umlsl v0.4s, v1.4h, v15.h[0] - umlal2 v0.4s, v1.8h, v15.h[0] - ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS - umlsl v10.4s, v11.4h, v15.h[4] - umlal2 v10.4s, v11.8h, v15.h[4] - shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) - add v12.8h, v12.8h, v13.8h - xtn v0.8b, v0.8h -- bilinear_store_&dst_fmt 2, v3, v4 -+ bilinear_store_\()\dst_fmt 2, v3, v4 - .endm - - .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt -- bilinear_load_and_vertical_interpolate_four_&src_fmt \ -- v1, v11, v14, v20, v16, v17, v22, v23 \ -+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ -+ v1, v11, v14, v20, v16, v17, v22, v23, \ - v3, v9, v24, v25, v26, v27, v18, v19 - prfm PREFETCH_MODE, [TMP1, PF_OFFS] - sub TMP1, TMP1, STRIDE - ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS - umlsl v0.4s, v1.4h, v15.h[0] - umlal2 v0.4s, v1.8h, v15.h[0] - ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS - umlsl v10.4s, v11.4h, v15.h[4] -@@ -3333,64 +3333,64 @@ generate_composite_function_nearest_scan - shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) - ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) - xtn v0.8b, v0.8h - xtn v1.8b, v2.8h - add v12.8h, v12.8h, v13.8h -- bilinear_store_&dst_fmt 4, v3, v4 -+ bilinear_store_\()\dst_fmt 4, v3, v4 - .endm - - .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt -- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head -+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head - .else -- bilinear_interpolate_four_pixels src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt - .endif - .endm - - .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt -- 
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail -+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail - .endif - .endm - - .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt -- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head -+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head - .else -- bilinear_interpolate_four_pixels src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt - .endif - .endm - - .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt -- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head -+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head - .else -- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt -- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt -+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt - .endif - .endm - - .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt -- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail -+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail - .else -- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt - .endif - .endm - - .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt -- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head -+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head - .else -- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt -- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt -+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt - .endif - .endm - - .set BILINEAR_FLAG_UNROLL_4, 0 - .set BILINEAR_FLAG_UNROLL_8, 1 - .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 - - /* -@@ -3405,17 +3405,17 @@ generate_composite_function_nearest_scan - * prefetch_distance - prefetch in the source image by that many - * pixels ahead - */ - - .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ - src_bpp_shift, dst_bpp_shift, \ - prefetch_distance, flags - --pixman_asm_function fname -+pixman_asm_function \fname - OUT .req x0 - TOP .req x1 - BOTTOM .req x2 - WT .req x3 - WB .req x4 - X .req x5 - UX .req x6 - WIDTH .req x7 -@@ -3437,17 +3437,17 @@ pixman_asm_function fname - sub sp, sp, 112 /* push all registers */ - sub x29, x29, 64 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 - stp x8, x9, [x29, -80] - stp x10, x11, [x29, -96] - stp x12, x13, [x29, -112] - -- mov PF_OFFS, #prefetch_distance -+ mov PF_OFFS, #\prefetch_distance - mul PF_OFFS, PF_OFFS, UX - - subs STRIDE, BOTTOM, TOP - .unreq BOTTOM - - cmp WIDTH, #0 - ble 300f - -@@ -3458,85 +3458,85 @@ pixman_asm_function fname - mov v25.d[0], 
v12.d[1] - mov v26.d[0], v13.d[0] - add v25.4h, v25.4h, v26.4h - mov v12.d[1], v25.d[0] - - /* ensure good destination alignment */ - cmp WIDTH, #1 - blt 100f -- tst OUT, #(1 << dst_bpp_shift) -+ tst OUT, #(1 << \dst_bpp_shift) - beq 100f - ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) - add v12.8h, v12.8h, v13.8h -- bilinear_interpolate_last_pixel src_fmt, dst_fmt -+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt - sub WIDTH, WIDTH, #1 - 100: - add v13.8h, v13.8h, v13.8h - ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) - add v12.8h, v12.8h, v13.8h - - cmp WIDTH, #2 - blt 100f -- tst OUT, #(1 << (dst_bpp_shift + 1)) -+ tst OUT, #(1 << (\dst_bpp_shift + 1)) - beq 100f -- bilinear_interpolate_two_pixels src_fmt, dst_fmt -+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt - sub WIDTH, WIDTH, #2 - 100: --.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 -+.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0 - /*********** 8 pixels per iteration *****************/ - cmp WIDTH, #4 - blt 100f -- tst OUT, #(1 << (dst_bpp_shift + 2)) -+ tst OUT, #(1 << (\dst_bpp_shift + 2)) - beq 100f -- bilinear_interpolate_four_pixels src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt - sub WIDTH, WIDTH, #4 - 100: - subs WIDTH, WIDTH, #8 - blt 100f -- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) -- bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt -+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) -+ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt - subs WIDTH, WIDTH, #8 - blt 500f - 1000: -- bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt -+ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt - subs WIDTH, WIDTH, #8 - bge 1000b - 500: -- bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt -+ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt - 100: - tst WIDTH, #4 - beq 200f -- bilinear_interpolate_four_pixels src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt - 200: - .else - /*********** 4 pixels per iteration *****************/ - subs WIDTH, WIDTH, #4 - blt 100f -- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) -- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt -+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) -+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt - subs WIDTH, WIDTH, #4 - blt 500f - 1000: -- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt - subs WIDTH, WIDTH, #4 - bge 1000b - 500: -- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt - 100: - /****************************************************/ - .endif - /* handle the remaining trailing pixels */ - tst WIDTH, #2 - beq 200f -- bilinear_interpolate_two_pixels src_fmt, dst_fmt -+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt - 200: - tst WIDTH, #1 - beq 300f -- bilinear_interpolate_last_pixel src_fmt, dst_fmt -+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt - 300: - sub x29, x29, 64 - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 - ldp x8, x9, [x29, -80] - ldp x10, x11, [x29, -96] - ldp x12, x13, [x29, -104] - mov sp, x29 -@@ -3551,17 +3551,17 @@ 300: - .unreq UX - .unreq WIDTH - .unreq TMP1 - .unreq TMP2 - .unreq PF_OFFS - .unreq TMP3 - .unreq TMP4 - .unreq STRIDE --.endfunc -+pixman_end_asm_function - - .endm - - /*****************************************************************************/ - - .set 
have_bilinear_interpolate_four_pixels_8888_8888, 1 - - .macro bilinear_interpolate_four_pixels_8888_8888_head -diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h ---- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h -+++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h -@@ -75,340 +75,340 @@ - #define PREFETCH_MODE pldl1keep - - /* - * Definitions of supplementary pixld/pixst macros (for partial load/store of - * pixel data). - */ - - .macro pixldst1 op, elem_size, reg1, mem_operand, abits -- op {v®1&.&elem_size}, [&mem_operand&], #8 -+ \op {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8 - .endm - - .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits -- op {v®1&.&elem_size, v®2&.&elem_size}, [&mem_operand&], #16 -+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size}, [\()\mem_operand\()], #16 - .endm - - .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits -- op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size, v®4&.&elem_size}, [&mem_operand&], #32 -+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size, v\()\reg4\().\()\elem_size}, [\()\mem_operand\()], #32 - .endm - - .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes -- op {v®1&.&elem_size}[idx], [&mem_operand&], #&bytes& -+ \op {v\()\reg1\().\()\elem_size}[\idx], [\()\mem_operand\()], #\()\bytes\() - .endm - - .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand -- op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}, [&mem_operand&], #24 -+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}, [\()\mem_operand\()], #24 - .endm - - .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand -- op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}[idx], [&mem_operand&], #3 -+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}[\idx], [\()\mem_operand\()], #3 - .endm - - .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits --.if numbytes == 32 -- .if elem_size==32 -- pixldst4 op, 2s, %(basereg+4), %(basereg+5), \ -- %(basereg+6), %(basereg+7), mem_operand, abits -- .elseif elem_size==16 -- pixldst4 op, 4h, %(basereg+4), %(basereg+5), \ -- %(basereg+6), %(basereg+7), mem_operand, abits -+.if \numbytes == 32 -+ .if \elem_size==32 -+ pixldst4 \op, 2s, %(\basereg+4), %(\basereg+5), \ -+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits -+ .elseif \elem_size==16 -+ pixldst4 \op, 4h, %(\basereg+4), %(\basereg+5), \ -+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits - .else -- pixldst4 op, 8b, %(basereg+4), %(basereg+5), \ -- %(basereg+6), %(basereg+7), mem_operand, abits -+ pixldst4 \op, 8b, %(\basereg+4), %(\basereg+5), \ -+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits - .endif --.elseif numbytes == 16 -- .if elem_size==32 -- pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits -- .elseif elem_size==16 -- pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits -+.elseif \numbytes == 16 -+ .if \elem_size==32 -+ pixldst2 \op, 2s, %(\basereg+2), %(\basereg+3), \mem_operand, \abits -+ .elseif \elem_size==16 -+ pixldst2 \op, 4h, %(\basereg+2), %(\basereg+3), \mem_operand, \abits - .else -- pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits -+ pixldst2 \op, 8b, %(\basereg+2), %(\basereg+3), \mem_operand, \abits - .endif --.elseif numbytes == 8 -- .if elem_size==32 -- pixldst1 op, 2s, %(basereg+1), mem_operand, abits -- .elseif 
elem_size==16 -- pixldst1 op, 4h, %(basereg+1), mem_operand, abits -+.elseif \numbytes == 8 -+ .if \elem_size==32 -+ pixldst1 \op, 2s, %(\basereg+1), \mem_operand, \abits -+ .elseif \elem_size==16 -+ pixldst1 \op, 4h, %(\basereg+1), \mem_operand, \abits - .else -- pixldst1 op, 8b, %(basereg+1), mem_operand, abits -+ pixldst1 \op, 8b, %(\basereg+1), \mem_operand, \abits - .endif --.elseif numbytes == 4 -- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) -- pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4 -- .elseif elem_size == 16 -- pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2 -- pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2 -+.elseif \numbytes == 4 -+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32) -+ pixldst0 \op, s, %(\basereg+0), 1, \mem_operand, \abits, 4 -+ .elseif \elem_size == 16 -+ pixldst0 \op, h, %(\basereg+0), 2, \mem_operand, \abits, 2 -+ pixldst0 \op, h, %(\basereg+0), 3, \mem_operand, \abits, 2 - .else -- pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1 -- pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1 -- pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1 -- pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1 -+ pixldst0 \op, b, %(\basereg+0), 4, \mem_operand, \abits, 1 -+ pixldst0 \op, b, %(\basereg+0), 5, \mem_operand, \abits, 1 -+ pixldst0 \op, b, %(\basereg+0), 6, \mem_operand, \abits, 1 -+ pixldst0 \op, b, %(\basereg+0), 7, \mem_operand, \abits, 1 - .endif --.elseif numbytes == 2 -- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) -- pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2 -+.elseif \numbytes == 2 -+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16) -+ pixldst0 \op, h, %(\basereg+0), 1, \mem_operand, \abits, 2 - .else -- pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1 -- pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1 -+ pixldst0 \op, b, %(\basereg+0), 2, \mem_operand, \abits, 1 -+ pixldst0 \op, b, %(\basereg+0), 3, \mem_operand, \abits, 1 - .endif --.elseif numbytes == 1 -- pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1 -+.elseif \numbytes == 1 -+ pixldst0 \op, b, %(\basereg+0), 1, \mem_operand, \abits, 1 - .else -- .error "unsupported size: numbytes" -+ .error "unsupported size: \numbytes" - .endif - .endm - - .macro pixld numpix, bpp, basereg, mem_operand, abits=0 --.if bpp > 0 --.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) -- pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \ -- %(basereg+6), %(basereg+7), mem_operand, abits --.elseif (bpp == 24) && (numpix == 8) -- pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand --.elseif (bpp == 24) && (numpix == 4) -- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand -- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand -- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand -- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand --.elseif (bpp == 24) && (numpix == 2) -- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand -- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand --.elseif (bpp == 24) && (numpix == 1) -- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand -+.if \bpp > 0 -+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) -+ pixldst4 ld4, 8b, %(\basereg+4), %(\basereg+5), \ -+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits -+.elseif (\bpp == 24) && (\numpix == 8) -+ pixldst3 ld3, 8b, 
%(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand -+.elseif (\bpp == 24) && (\numpix == 4) -+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand -+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand -+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand -+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand -+.elseif (\bpp == 24) && (\numpix == 2) -+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand -+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand -+.elseif (\bpp == 24) && (\numpix == 1) -+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand - .else -- pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits -+ pixldst %(\numpix * \bpp / 8), ld1, %(\bpp), \basereg, \mem_operand, \abits - .endif - .endif - .endm - - .macro pixst numpix, bpp, basereg, mem_operand, abits=0 --.if bpp > 0 --.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) -- pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \ -- %(basereg+6), %(basereg+7), mem_operand, abits --.elseif (bpp == 24) && (numpix == 8) -- pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand --.elseif (bpp == 24) && (numpix == 4) -- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand -- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand -- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand -- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand --.elseif (bpp == 24) && (numpix == 2) -- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand -- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand --.elseif (bpp == 24) && (numpix == 1) -- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand --.elseif numpix * bpp == 32 && abits == 32 -- pixldst 4, st1, 32, basereg, mem_operand, abits --.elseif numpix * bpp == 16 && abits == 16 -- pixldst 2, st1, 16, basereg, mem_operand, abits -+.if \bpp > 0 -+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) -+ pixldst4 st4, 8b, %(\basereg+4), %(\basereg+5), \ -+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits -+.elseif (\bpp == 24) && (\numpix == 8) -+ pixldst3 st3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand -+.elseif (\bpp == 24) && (\numpix == 4) -+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand -+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand -+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand -+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand -+.elseif (\bpp == 24) && (\numpix == 2) -+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand -+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand -+.elseif (\bpp == 24) && (\numpix == 1) -+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand -+.elseif \numpix * \bpp == 32 && \abits == 32 -+ pixldst 4, st1, 32, \basereg, \mem_operand, \abits -+.elseif \numpix * \bpp == 16 && \abits == 16 -+ pixldst 2, st1, 16, \basereg, \mem_operand, \abits - .else -- pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits -+ pixldst %(\numpix * \bpp / 8), st1, %(\bpp), \basereg, \mem_operand, \abits - .endif - .endif - 
.endm - - .macro pixld_a numpix, bpp, basereg, mem_operand --.if (bpp * numpix) <= 128 -- pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) -+.if (\bpp * \numpix) <= 128 -+ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) - .else -- pixld numpix, bpp, basereg, mem_operand, 128 -+ pixld \numpix, \bpp, \basereg, \mem_operand, 128 - .endif - .endm - - .macro pixst_a numpix, bpp, basereg, mem_operand --.if (bpp * numpix) <= 128 -- pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) -+.if (\bpp * \numpix) <= 128 -+ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) - .else -- pixst numpix, bpp, basereg, mem_operand, 128 -+ pixst \numpix, \bpp, \basereg, \mem_operand, 128 - .endif - .endm - - /* - * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register - * aliases to be defined) - */ - .macro pixld1_s elem_size, reg1, mem_operand --.if elem_size == 16 -+.if \elem_size == 16 - asr TMP1, VX, #16 - adds VX, VX, UNIT_X - bmi 55f - 5: subs VX, VX, SRC_WIDTH_FIXED - bpl 5b - 55: -- add TMP1, mem_operand, TMP1, lsl #1 -+ add TMP1, \mem_operand, TMP1, lsl #1 - asr TMP2, VX, #16 - adds VX, VX, UNIT_X - bmi 55f - 5: subs VX, VX, SRC_WIDTH_FIXED - bpl 5b - 55: -- add TMP2, mem_operand, TMP2, lsl #1 -- ld1 {v®1&.h}[0], [TMP1] -+ add TMP2, \mem_operand, TMP2, lsl #1 -+ ld1 {v\()\reg1\().h}[0], [TMP1] - asr TMP1, VX, #16 - adds VX, VX, UNIT_X - bmi 55f - 5: subs VX, VX, SRC_WIDTH_FIXED - bpl 5b - 55: -- add TMP1, mem_operand, TMP1, lsl #1 -- ld1 {v®1&.h}[1], [TMP2] -+ add TMP1, \mem_operand, TMP1, lsl #1 -+ ld1 {v\()\reg1\().h}[1], [TMP2] - asr TMP2, VX, #16 - adds VX, VX, UNIT_X - bmi 55f - 5: subs VX, VX, SRC_WIDTH_FIXED - bpl 5b - 55: -- add TMP2, mem_operand, TMP2, lsl #1 -- ld1 {v®1&.h}[2], [TMP1] -- ld1 {v®1&.h}[3], [TMP2] --.elseif elem_size == 32 -+ add TMP2, \mem_operand, TMP2, lsl #1 -+ ld1 {v\()\reg1\().h}[2], [TMP1] -+ ld1 {v\()\reg1\().h}[3], [TMP2] -+.elseif \elem_size == 32 - asr TMP1, VX, #16 - adds VX, VX, UNIT_X - bmi 55f - 5: subs VX, VX, SRC_WIDTH_FIXED - bpl 5b - 55: -- add TMP1, mem_operand, TMP1, lsl #2 -+ add TMP1, \mem_operand, TMP1, lsl #2 - asr TMP2, VX, #16 - adds VX, VX, UNIT_X - bmi 55f - 5: subs VX, VX, SRC_WIDTH_FIXED - bpl 5b - 55: -- add TMP2, mem_operand, TMP2, lsl #2 -- ld1 {v®1&.s}[0], [TMP1] -- ld1 {v®1&.s}[1], [TMP2] -+ add TMP2, \mem_operand, TMP2, lsl #2 -+ ld1 {v\()\reg1\().s}[0], [TMP1] -+ ld1 {v\()\reg1\().s}[1], [TMP2] - .else - .error "unsupported" - .endif - .endm - - .macro pixld2_s elem_size, reg1, reg2, mem_operand --.if 0 /* elem_size == 32 */ -+.if 0 /* \elem_size == 32 */ - mov TMP1, VX, asr #16 - add VX, VX, UNIT_X, asl #1 -- add TMP1, mem_operand, TMP1, asl #2 -+ add TMP1, \mem_operand, TMP1, asl #2 - mov TMP2, VX, asr #16 - sub VX, VX, UNIT_X -- add TMP2, mem_operand, TMP2, asl #2 -- ld1 {v®1&.s}[0], [TMP1] -+ add TMP2, \mem_operand, TMP2, asl #2 -+ ld1 {v\()\reg1\().s}[0], [TMP1] - mov TMP1, VX, asr #16 - add VX, VX, UNIT_X, asl #1 -- add TMP1, mem_operand, TMP1, asl #2 -- ld1 {v®2&.s}[0], [TMP2, :32] -+ add TMP1, \mem_operand, TMP1, asl #2 -+ ld1 {v\()\reg2\().s}[0], [TMP2, :32] - mov TMP2, VX, asr #16 - add VX, VX, UNIT_X -- add TMP2, mem_operand, TMP2, asl #2 -- ld1 {v®1&.s}[1], [TMP1] -- ld1 {v®2&.s}[1], [TMP2] -+ add TMP2, \mem_operand, TMP2, asl #2 -+ ld1 {v\()\reg1\().s}[1], [TMP1] -+ ld1 {v\()\reg2\().s}[1], [TMP2] - .else -- pixld1_s elem_size, reg1, mem_operand -- pixld1_s elem_size, reg2, mem_operand -+ pixld1_s \elem_size, \reg1, \mem_operand -+ pixld1_s \elem_size, \reg2, \mem_operand 
- .endif - .endm - - .macro pixld0_s elem_size, reg1, idx, mem_operand --.if elem_size == 16 -+.if \elem_size == 16 - asr TMP1, VX, #16 - adds VX, VX, UNIT_X - bmi 55f - 5: subs VX, VX, SRC_WIDTH_FIXED - bpl 5b - 55: -- add TMP1, mem_operand, TMP1, lsl #1 -- ld1 {v®1&.h}[idx], [TMP1] --.elseif elem_size == 32 -+ add TMP1, \mem_operand, TMP1, lsl #1 -+ ld1 {v\()\reg1\().h}[\idx], [TMP1] -+.elseif \elem_size == 32 - asr DUMMY, VX, #16 - mov TMP1, DUMMY - adds VX, VX, UNIT_X - bmi 55f - 5: subs VX, VX, SRC_WIDTH_FIXED - bpl 5b - 55: -- add TMP1, mem_operand, TMP1, lsl #2 -- ld1 {v®1&.s}[idx], [TMP1] -+ add TMP1, \mem_operand, TMP1, lsl #2 -+ ld1 {v\()\reg1\().s}[\idx], [TMP1] - .endif - .endm - - .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand --.if numbytes == 32 -- pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand -- pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand -- pixdeinterleave elem_size, %(basereg+4) --.elseif numbytes == 16 -- pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand --.elseif numbytes == 8 -- pixld1_s elem_size, %(basereg+1), mem_operand --.elseif numbytes == 4 -- .if elem_size == 32 -- pixld0_s elem_size, %(basereg+0), 1, mem_operand -- .elseif elem_size == 16 -- pixld0_s elem_size, %(basereg+0), 2, mem_operand -- pixld0_s elem_size, %(basereg+0), 3, mem_operand -+.if \numbytes == 32 -+ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand -+ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand -+ pixdeinterleave \elem_size, %(\basereg+4) -+.elseif \numbytes == 16 -+ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand -+.elseif \numbytes == 8 -+ pixld1_s \elem_size, %(\basereg+1), \mem_operand -+.elseif \numbytes == 4 -+ .if \elem_size == 32 -+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand -+ .elseif \elem_size == 16 -+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand -+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand - .else -- pixld0_s elem_size, %(basereg+0), 4, mem_operand -- pixld0_s elem_size, %(basereg+0), 5, mem_operand -- pixld0_s elem_size, %(basereg+0), 6, mem_operand -- pixld0_s elem_size, %(basereg+0), 7, mem_operand -+ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand -+ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand -+ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand -+ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand - .endif --.elseif numbytes == 2 -- .if elem_size == 16 -- pixld0_s elem_size, %(basereg+0), 1, mem_operand -+.elseif \numbytes == 2 -+ .if \elem_size == 16 -+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand - .else -- pixld0_s elem_size, %(basereg+0), 2, mem_operand -- pixld0_s elem_size, %(basereg+0), 3, mem_operand -+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand -+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand - .endif --.elseif numbytes == 1 -- pixld0_s elem_size, %(basereg+0), 1, mem_operand -+.elseif \numbytes == 1 -+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand - .else -- .error "unsupported size: numbytes" -+ .error "unsupported size: \numbytes" - .endif - .endm - - .macro pixld_s numpix, bpp, basereg, mem_operand --.if bpp > 0 -- pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand -+.if \bpp > 0 -+ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand - .endif - .endm - - .macro vuzp8 reg1, reg2 - umov DUMMY, v16.d[0] -- uzp1 v16.8b, v®1&.8b, v®2&.8b -- uzp2 v®2&.8b, v®1&.8b, v®2&.8b -- mov v®1&.8b, v16.8b -+ uzp1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b -+ uzp2 
v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b -+ mov v\()\reg1\().8b, v16.8b - mov v16.d[0], DUMMY - .endm - - .macro vzip8 reg1, reg2 - umov DUMMY, v16.d[0] -- zip1 v16.8b, v®1&.8b, v®2&.8b -- zip2 v®2&.8b, v®1&.8b, v®2&.8b -- mov v®1&.8b, v16.8b -+ zip1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b -+ zip2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b -+ mov v\()\reg1\().8b, v16.8b - mov v16.d[0], DUMMY - .endm - - /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ - .macro pixdeinterleave bpp, basereg --.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) -- vuzp8 %(basereg+0), %(basereg+1) -- vuzp8 %(basereg+2), %(basereg+3) -- vuzp8 %(basereg+1), %(basereg+3) -- vuzp8 %(basereg+0), %(basereg+2) -+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) -+ vuzp8 %(\basereg+0), %(\basereg+1) -+ vuzp8 %(\basereg+2), %(\basereg+3) -+ vuzp8 %(\basereg+1), %(\basereg+3) -+ vuzp8 %(\basereg+0), %(\basereg+2) - .endif - .endm - - /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ - .macro pixinterleave bpp, basereg --.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) -- vzip8 %(basereg+0), %(basereg+2) -- vzip8 %(basereg+1), %(basereg+3) -- vzip8 %(basereg+2), %(basereg+3) -- vzip8 %(basereg+0), %(basereg+1) -+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) -+ vzip8 %(\basereg+0), %(\basereg+2) -+ vzip8 %(\basereg+1), %(\basereg+3) -+ vzip8 %(\basereg+2), %(\basereg+3) -+ vzip8 %(\basereg+0), %(\basereg+1) - .endif - .endm - - /* - * This is a macro for implementing cache preload. The main idea is that - * cache preload logic is mostly independent from the rest of pixels - * processing code. It starts at the top left pixel and moves forward - * across pixels and can jump across scanlines. Prefetch distance is -@@ -432,62 +432,62 @@ 55: - * for almost zero cost! - * - * (*) The overhead of the prefetcher is visible when running some trivial - * pixels processing like simple copy. Anyway, having prefetch is a must - * when working with the graphics data. 
- */ - .macro PF a, x:vararg - .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) -- a x -+ \a \x - .endif - .endm - - .macro cache_preload std_increment, boost_increment - .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) --.if std_increment != 0 -- PF add PF_X, PF_X, #std_increment -+.if \std_increment != 0 -+ PF add, PF_X, PF_X, #\std_increment - .endif -- PF tst PF_CTL, #0xF -- PF beq 71f -- PF add PF_X, PF_X, #boost_increment -- PF sub PF_CTL, PF_CTL, #1 -+ PF tst, PF_CTL, #0xF -+ PF beq, 71f -+ PF add, PF_X, PF_X, #\boost_increment -+ PF sub, PF_CTL, PF_CTL, #1 - 71: -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - .if src_bpp_shift >= 0 -- PF lsl DUMMY, PF_X, #src_bpp_shift -- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] -+ PF lsl, DUMMY, PF_X, #src_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] - .endif - .if dst_r_bpp != 0 -- PF lsl DUMMY, PF_X, #dst_bpp_shift -- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] -+ PF lsl, DUMMY, PF_X, #dst_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] - .endif - .if mask_bpp_shift >= 0 -- PF lsl DUMMY, PF_X, #mask_bpp_shift -- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] -+ PF lsl, DUMMY, PF_X, #mask_bpp_shift -+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] - .endif -- PF ble 71f -- PF sub PF_X, PF_X, ORIG_W -- PF subs PF_CTL, PF_CTL, #0x10 -+ PF ble, 71f -+ PF sub, PF_X, PF_X, ORIG_W -+ PF subs, PF_CTL, PF_CTL, #0x10 - 71: -- PF ble 72f -+ PF ble, 72f - .if src_bpp_shift >= 0 -- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift -- PF ldrsb DUMMY, [PF_SRC, DUMMY] -- PF add PF_SRC, PF_SRC, #1 -+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift -+ PF ldrsb, DUMMY, [PF_SRC, DUMMY] -+ PF add, PF_SRC, PF_SRC, #1 - .endif - .if dst_r_bpp != 0 -- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift -- PF ldrsb DUMMY, [PF_DST, DUMMY] -- PF add PF_DST, PF_DST, #1 -+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift -+ PF ldrsb, DUMMY, [PF_DST, DUMMY] -+ PF add, PF_DST, PF_DST, #1 - .endif - .if mask_bpp_shift >= 0 -- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift -- PF ldrsb DUMMY, [PF_MASK, DUMMY] -- PF add PF_MASK, PF_MASK, #1 -+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift -+ PF ldrsb, DUMMY, [PF_MASK, DUMMY] -+ PF add, PF_MASK, PF_MASK, #1 - .endif - 72: - .endif - .endm - - .macro cache_preload_simple - .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) - .if src_bpp > 0 -@@ -516,56 +516,56 @@ 72: - process_pixblock_tail, \ - process_pixblock_tail_head - .if dst_w_bpp != 24 - tst DST_R, #0xF - beq 52f - - .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 - .irp lowbit, 1, 2, 4, 8, 16 --local skip1 --.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) --.if lowbit < 16 /* we don't need more than 16-byte alignment */ -- tst DST_R, #lowbit -+ -+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) -+.if \lowbit < 16 /* we don't need more than 16-byte alignment */ -+ tst DST_R, #\lowbit - beq 51f - .endif -- pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC -- pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK -+ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC -+ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK - .if dst_r_bpp > 0 -- pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R -+ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R - .else -- add DST_R, DST_R, #lowbit -+ add DST_R, DST_R, #\lowbit - .endif -- PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) -- sub W, W, #(lowbit * 8 / dst_w_bpp) -+ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp) -+ 
sub W, W, #(\lowbit * 8 / dst_w_bpp) - 51: - .endif - .endr - .endif - pixdeinterleave src_bpp, src_basereg - pixdeinterleave mask_bpp, mask_basereg - pixdeinterleave dst_r_bpp, dst_r_basereg - -- process_pixblock_head -+ \process_pixblock_head - cache_preload 0, pixblock_size - cache_preload_simple -- process_pixblock_tail -+ \process_pixblock_tail - - pixinterleave dst_w_bpp, dst_w_basereg - - .irp lowbit, 1, 2, 4, 8, 16 --.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) --.if lowbit < 16 /* we don't need more than 16-byte alignment */ -- tst DST_W, #lowbit -+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) -+.if \lowbit < 16 /* we don't need more than 16-byte alignment */ -+ tst DST_W, #\lowbit - beq 51f - .endif - .if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0 -- sub W, W, #(lowbit * 8 / dst_w_bpp) -+ sub W, W, #(\lowbit * 8 / dst_w_bpp) - .endif -- pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W -+ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W - 51: - .endif - .endr - .endif - 52: - .endm - - /* -@@ -587,52 +587,52 @@ 52: - dst_aligned_flag, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head - tst W, #(pixblock_size - 1) - beq 52f - .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 - .irp chunk_size, 16, 8, 4, 2, 1 --.if pixblock_size > chunk_size -- tst W, #chunk_size -+.if pixblock_size > \chunk_size -+ tst W, #\chunk_size - beq 51f -- pixld_src chunk_size, src_bpp, src_basereg, SRC -- pixld chunk_size, mask_bpp, mask_basereg, MASK --.if dst_aligned_flag != 0 -- pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R -+ pixld_src \chunk_size, src_bpp, src_basereg, SRC -+ pixld \chunk_size, mask_bpp, mask_basereg, MASK -+.if \dst_aligned_flag != 0 -+ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R - .else -- pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R -+ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R - .endif --.if cache_preload_flag != 0 -- PF add PF_X, PF_X, #chunk_size -+.if \cache_preload_flag != 0 -+ PF add, PF_X, PF_X, #\chunk_size - .endif - 51: - .endif - .endr - .endif - pixdeinterleave src_bpp, src_basereg - pixdeinterleave mask_bpp, mask_basereg - pixdeinterleave dst_r_bpp, dst_r_basereg - -- process_pixblock_head --.if cache_preload_flag != 0 -+ \process_pixblock_head -+.if \cache_preload_flag != 0 - cache_preload 0, pixblock_size - cache_preload_simple - .endif -- process_pixblock_tail -+ \process_pixblock_tail - pixinterleave dst_w_bpp, dst_w_basereg - .irp chunk_size, 16, 8, 4, 2, 1 --.if pixblock_size > chunk_size -- tst W, #chunk_size -+.if pixblock_size > \chunk_size -+ tst W, #\chunk_size - beq 51f --.if dst_aligned_flag != 0 -- pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W -+.if \dst_aligned_flag != 0 -+ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W - .else -- pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W -+ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W - .endif - 51: - .endif - .endr - 52: - .endm - - /* -@@ -655,17 +655,17 @@ 52: - .if (src_bpp != 24) && (src_bpp != 0) - sub SRC, SRC, W, lsl #src_bpp_shift - .endif - .if (mask_bpp != 24) && (mask_bpp != 0) - sub MASK, MASK, W, lsl #mask_bpp_shift - .endif - subs H, H, #1 - mov DST_R, DST_W -- bge start_of_loop_label -+ bge \start_of_loop_label - .endm - - /* - * Registers are allocated in the following way by default: - * v0, v1, v2, v3 - reserved for loading source pixel data - * v4, v5, v6, v7 - reserved for loading destination pixel data 
- * v24, v25, v26, v27 - reserved for loading mask pixel data - * v28, v29, v30, v31 - final destination pixel data for writeback to memory -@@ -682,17 +682,17 @@ 52: - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head, \ - dst_w_basereg_ = 28, \ - dst_r_basereg_ = 4, \ - src_basereg_ = 0, \ - mask_basereg_ = 24 - -- pixman_asm_function fname -+ pixman_asm_function \fname - stp x29, x30, [sp, -16]! - mov x29, sp - sub sp, sp, 232 /* push all registers */ - sub x29, x29, 64 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 - stp x8, x9, [x29, -80] - stp x10, x11, [x29, -96] -@@ -707,38 +707,38 @@ 52: - str x28, [x29, -232] - - /* - * Select prefetch type for this function. If prefetch distance is - * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch - * has to be used instead of ADVANCED. - */ - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT --.if prefetch_distance == 0 -+.if \prefetch_distance == 0 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE - .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ -- ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) -+ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24)) - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE - .endif - - /* - * Make some macro arguments globally visible and accessible - * from other macros - */ -- .set src_bpp, src_bpp_ -- .set mask_bpp, mask_bpp_ -- .set dst_w_bpp, dst_w_bpp_ -- .set pixblock_size, pixblock_size_ -- .set dst_w_basereg, dst_w_basereg_ -- .set dst_r_basereg, dst_r_basereg_ -- .set src_basereg, src_basereg_ -- .set mask_basereg, mask_basereg_ -+ .set src_bpp, \src_bpp_ -+ .set mask_bpp, \mask_bpp_ -+ .set dst_w_bpp, \dst_w_bpp_ -+ .set pixblock_size, \pixblock_size_ -+ .set dst_w_basereg, \dst_w_basereg_ -+ .set dst_r_basereg, \dst_r_basereg_ -+ .set src_basereg, \src_basereg_ -+ .set mask_basereg, \mask_basereg_ - - .macro pixld_src x:vararg -- pixld x -+ pixld \x - .endm - .macro fetch_src_pixblock - pixld_src pixblock_size, src_bpp, \ - (src_basereg - pixblock_size * src_bpp / 64), SRC - .endm - /* - * Assign symbolic names to registers - */ -@@ -805,32 +805,32 @@ 52: - .elseif dst_w_bpp == 16 - .set dst_bpp_shift, 1 - .elseif dst_w_bpp == 8 - .set dst_bpp_shift, 0 - .else - .error "requested dst bpp (dst_w_bpp) is not supported" - .endif - --.if (((flags) & FLAG_DST_READWRITE) != 0) -+.if (((\flags) & FLAG_DST_READWRITE) != 0) - .set dst_r_bpp, dst_w_bpp - .else - .set dst_r_bpp, 0 - .endif --.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) -+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) - .set DEINTERLEAVE_32BPP_ENABLED, 1 - .else - .set DEINTERLEAVE_32BPP_ENABLED, 0 - .endif - --.if prefetch_distance < 0 || prefetch_distance > 15 -- .error "invalid prefetch distance (prefetch_distance)" -+.if \prefetch_distance < 0 || \prefetch_distance > 15 -+ .error "invalid prefetch distance (\prefetch_distance)" - .endif - -- PF mov PF_X, #0 -+ PF mov, PF_X, #0 - mov DST_R, DST_W - - .if src_bpp == 24 - sub SRC_STRIDE, SRC_STRIDE, W - sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 - .endif - .if mask_bpp == 24 - sub MASK_STRIDE, MASK_STRIDE, W -@@ -839,71 +839,71 @@ 52: - .if dst_w_bpp == 24 - sub DST_STRIDE, DST_STRIDE, W - sub DST_STRIDE, DST_STRIDE, W, lsl #1 - .endif - - /* - * Setup advanced prefetcher initial state - */ -- PF mov PF_SRC, SRC -- PF mov PF_DST, DST_R -- PF mov PF_MASK, MASK -- /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ -- PF lsl DUMMY, H, #4 -- PF mov PF_CTL, DUMMY -- PF 
add PF_CTL, PF_CTL, #(prefetch_distance - 0x10) -+ PF mov, PF_SRC, SRC -+ PF mov, PF_DST, DST_R -+ PF mov, PF_MASK, MASK -+ /* PF_CTL = \prefetch_distance | ((h - 1) << 4) */ -+ PF lsl, DUMMY, H, #4 -+ PF mov, PF_CTL, DUMMY -+ PF add, PF_CTL, PF_CTL, #(\prefetch_distance - 0x10) - -- init -+ \init - subs H, H, #1 - mov ORIG_W, W - blt 9f - cmp W, #(pixblock_size * 2) - blt 800f - /* - * This is the start of the pipelined loop, which if optimized for - * long scanlines - */ - 0: -- ensure_destination_ptr_alignment process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ ensure_destination_ptr_alignment \process_pixblock_head, \ -+ \process_pixblock_tail, \ -+ \process_pixblock_tail_head - - /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ - pixld_a pixblock_size, dst_r_bpp, \ - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R - fetch_src_pixblock - pixld pixblock_size, mask_bpp, \ - (mask_basereg - pixblock_size * mask_bpp / 64), MASK -- PF add PF_X, PF_X, #pixblock_size -- process_pixblock_head -+ PF add, PF_X, PF_X, #pixblock_size -+ \process_pixblock_head - cache_preload 0, pixblock_size - cache_preload_simple - subs W, W, #(pixblock_size * 2) - blt 200f - - 100: -- process_pixblock_tail_head -+ \process_pixblock_tail_head - cache_preload_simple - subs W, W, #pixblock_size - bge 100b - - 200: -- process_pixblock_tail -+ \process_pixblock_tail - pixst_a pixblock_size, dst_w_bpp, \ - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W - - /* Process the remaining trailing pixels in the scanline */ - process_trailing_pixels 1, 1, \ -- process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ \process_pixblock_head, \ -+ \process_pixblock_tail, \ -+ \process_pixblock_tail_head - advance_to_next_scanline 0b - -- cleanup -+ \cleanup - 1000: - /* pop all registers */ - sub x29, x29, 64 - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 - ldp x8, x9, [x29, -80] - ldp x10, x11, [x29, -96] - ldp x12, x13, [x29, -112] -@@ -920,48 +920,48 @@ 1000: - ret /* exit */ - /* - * This is the start of the loop, designed to process images with small width - * (less than pixblock_size * 2 pixels). In this case neither pipelining - * nor prefetch are used. 
- */ - 800: - .if src_bpp_shift >= 0 -- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift -- PF prfm PREFETCH_MODE, [SRC, DUMMY] -+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift -+ PF prfm, PREFETCH_MODE, [SRC, DUMMY] - .endif - .if dst_r_bpp != 0 -- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift -- PF prfm PREFETCH_MODE, [DST_R, DUMMY] -+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift -+ PF prfm, PREFETCH_MODE, [DST_R, DUMMY] - .endif - .if mask_bpp_shift >= 0 -- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift -- PF prfm PREFETCH_MODE, [MASK, DUMMY] -+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift -+ PF prfm, PREFETCH_MODE, [MASK, DUMMY] - .endif - /* Process exactly pixblock_size pixels if needed */ - tst W, #pixblock_size - beq 100f - pixld pixblock_size, dst_r_bpp, \ - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R - fetch_src_pixblock - pixld pixblock_size, mask_bpp, \ - (mask_basereg - pixblock_size * mask_bpp / 64), MASK -- process_pixblock_head -- process_pixblock_tail -+ \process_pixblock_head -+ \process_pixblock_tail - pixst pixblock_size, dst_w_bpp, \ - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W - 100: - /* Process the remaining trailing pixels in the scanline */ - process_trailing_pixels 0, 0, \ -- process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ \process_pixblock_head, \ -+ \process_pixblock_tail, \ -+ \process_pixblock_tail_head - advance_to_next_scanline 800b - 9: -- cleanup -+ \cleanup - /* pop all registers */ - sub x29, x29, 64 - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 - ldp x8, x9, [x29, -80] - ldp x10, x11, [x29, -96] - ldp x12, x13, [x29, -112] - ldp x14, x15, [x29, -128] -@@ -990,17 +990,17 @@ 9: - .unreq DST_STRIDE - .unreq MASK_STRIDE - .unreq PF_CTL - .unreq PF_X - .unreq PF_SRC - .unreq PF_DST - .unreq PF_MASK - .unreq DUMMY -- .endfunc -+ pixman_end_asm_function - .endm - - /* - * A simplified variant of function generation template for a single - * scanline processing (for implementing pixman combine functions) - */ - .macro generate_composite_function_scanline use_nearest_scaling, \ - fname, \ -@@ -1014,50 +1014,50 @@ 9: - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head, \ - dst_w_basereg_ = 28, \ - dst_r_basereg_ = 4, \ - src_basereg_ = 0, \ - mask_basereg_ = 24 - -- pixman_asm_function fname -+ pixman_asm_function \fname - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE - - /* - * Make some macro arguments globally visible and accessible - * from other macros - */ -- .set src_bpp, src_bpp_ -- .set mask_bpp, mask_bpp_ -- .set dst_w_bpp, dst_w_bpp_ -- .set pixblock_size, pixblock_size_ -- .set dst_w_basereg, dst_w_basereg_ -- .set dst_r_basereg, dst_r_basereg_ -- .set src_basereg, src_basereg_ -- .set mask_basereg, mask_basereg_ -+ .set src_bpp, \src_bpp_ -+ .set mask_bpp, \mask_bpp_ -+ .set dst_w_bpp, \dst_w_bpp_ -+ .set pixblock_size, \pixblock_size_ -+ .set dst_w_basereg, \dst_w_basereg_ -+ .set dst_r_basereg, \dst_r_basereg_ -+ .set src_basereg, \src_basereg_ -+ .set mask_basereg, \mask_basereg_ - --.if use_nearest_scaling != 0 -+.if \use_nearest_scaling != 0 - /* - * Assign symbolic names to registers for nearest scaling - */ - W .req x0 - DST_W .req x1 - SRC .req x2 - VX .req x3 - UNIT_X .req x4 - SRC_WIDTH_FIXED .req x5 - MASK .req x6 - TMP1 .req x8 - TMP2 .req x9 - DST_R .req x10 - DUMMY .req x30 - - .macro pixld_src x:vararg -- pixld_s x -+ pixld_s \x - .endm - - sxtw x0, w0 - sxtw x3, w3 - sxtw x4, w4 - sxtw x5, w5 - - stp x29, x30, 
[sp, -16]! -@@ -1075,84 +1075,84 @@ 9: - W .req x0 /* width (is updated during processing) */ - DST_W .req x1 /* destination buffer pointer for writes */ - SRC .req x2 /* source buffer pointer */ - MASK .req x3 /* mask pointer */ - DST_R .req x4 /* destination buffer pointer for reads */ - DUMMY .req x30 - - .macro pixld_src x:vararg -- pixld x -+ pixld \x - .endm - - sxtw x0, w0 - - stp x29, x30, [sp, -16]! - mov x29, sp - sub sp, sp, 64 - sub x29, x29, 64 - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 - .endif - --.if (((flags) & FLAG_DST_READWRITE) != 0) -+.if (((\flags) & FLAG_DST_READWRITE) != 0) - .set dst_r_bpp, dst_w_bpp - .else - .set dst_r_bpp, 0 - .endif --.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) -+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) - .set DEINTERLEAVE_32BPP_ENABLED, 1 - .else - .set DEINTERLEAVE_32BPP_ENABLED, 0 - .endif - - .macro fetch_src_pixblock - pixld_src pixblock_size, src_bpp, \ - (src_basereg - pixblock_size * src_bpp / 64), SRC - .endm - -- init -+ \init - mov DST_R, DST_W - - cmp W, #pixblock_size - blt 800f - -- ensure_destination_ptr_alignment process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ ensure_destination_ptr_alignment \process_pixblock_head, \ -+ \process_pixblock_tail, \ -+ \process_pixblock_tail_head - - subs W, W, #pixblock_size - blt 700f - - /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ - pixld_a pixblock_size, dst_r_bpp, \ - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R - fetch_src_pixblock - pixld pixblock_size, mask_bpp, \ - (mask_basereg - pixblock_size * mask_bpp / 64), MASK -- process_pixblock_head -+ \process_pixblock_head - subs W, W, #pixblock_size - blt 200f - 100: -- process_pixblock_tail_head -+ \process_pixblock_tail_head - subs W, W, #pixblock_size - bge 100b - 200: -- process_pixblock_tail -+ \process_pixblock_tail - pixst_a pixblock_size, dst_w_bpp, \ - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W - 700: - /* Process the remaining trailing pixels in the scanline (dst aligned) */ - process_trailing_pixels 0, 1, \ -- process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ \process_pixblock_head, \ -+ \process_pixblock_tail, \ -+ \process_pixblock_tail_head - -- cleanup --.if use_nearest_scaling != 0 -+ \cleanup -+.if \use_nearest_scaling != 0 - sub x29, x29, 64 - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 - ldp x8, x9, [x29, -80] - ldr x10, [x29, -96] - mov sp, x29 - ldp x29, x30, [sp], 16 - ret /* exit */ -@@ -1162,22 +1162,22 @@ 700: - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 - mov sp, x29 - ldp x29, x30, [sp], 16 - ret /* exit */ - .endif - 800: - /* Process the remaining trailing pixels in the scanline (dst unaligned) */ - process_trailing_pixels 0, 0, \ -- process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ \process_pixblock_head, \ -+ \process_pixblock_tail, \ -+ \process_pixblock_tail_head - -- cleanup --.if use_nearest_scaling != 0 -+ \cleanup -+.if \use_nearest_scaling != 0 - sub x29, x29, 64 - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 - ldp x8, x9, [x29, -80] - ldr x10, [x29, -88] - mov sp, x29 - ldp x29, x30, [sp], 16 - ret /* exit */ -@@ -1208,25 +1208,25 @@ 800: - .unreq DST_R - .unreq DST_W - .unreq W - .endif - - .purgem fetch_src_pixblock - .purgem pixld_src - -- .endfunc -+ pixman_end_asm_function - 
 .endm
-
- .macro generate_composite_function_single_scanline x:vararg
-- generate_composite_function_scanline 0, x
-+ generate_composite_function_scanline 0, \x
- .endm
-
- .macro generate_composite_function_nearest_scanline x:vararg
-- generate_composite_function_scanline 1, x
-+ generate_composite_function_scanline 1, \x
- .endm
-
- /* Default prologue/epilogue, nothing special needs to be done */
-
- .macro default_init
- .endm
-
- .macro default_cleanup
-@@ -1250,61 +1250,61 @@ 800:
- * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
- * into a planar a8r8g8b8 format (with a, r, g, b color components
- * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
- *
- * Warning: the conversion is destructive and the original
- * value (in) is lost.
- */
- .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
-- shrn &out_r&.8b, &in&.8h, #8
-- shrn &out_g&.8b, &in&.8h, #3
-- sli &in&.8h, &in&.8h, #5
-- movi &out_a&.8b, #255
-- sri &out_r&.8b, &out_r&.8b, #5
-- sri &out_g&.8b, &out_g&.8b, #6
-- shrn &out_b&.8b, &in&.8h, #2
-+ shrn \()\out_r\().8b, \()\in\().8h, #8
-+ shrn \()\out_g\().8b, \()\in\().8h, #3
-+ sli \()\in\().8h, \()\in\().8h, #5
-+ movi \()\out_a\().8b, #255
-+ sri \()\out_r\().8b, \()\out_r\().8b, #5
-+ sri \()\out_g\().8b, \()\out_g\().8b, #6
-+ shrn \()\out_b\().8b, \()\in\().8h, #2
- .endm
-
- .macro convert_0565_to_x888 in, out_r, out_g, out_b
-- shrn &out_r&.8b, &in&.8h, #8
-- shrn &out_g&.8b, &in&.8h, #3
-- sli &in&.8h, &in&.8h, #5
-- sri &out_r&.8b, &out_r&.8b, #5
-- sri &out_g&.8b, &out_g&.8b, #6
-- shrn &out_b&.8b, &in&.8h, #2
-+ shrn \()\out_r\().8b, \()\in\().8h, #8
-+ shrn \()\out_g\().8b, \()\in\().8h, #3
-+ sli \()\in\().8h, \()\in\().8h, #5
-+ sri \()\out_r\().8b, \()\out_r\().8b, #5
-+ sri \()\out_g\().8b, \()\out_g\().8b, #6
-+ shrn \()\out_b\().8b, \()\in\().8h, #2
- .endm
-
- /*
- * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
- * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5
- * pixels packed in 128-bit register (out). Requires two temporary 128-bit
- * registers (tmp1, tmp2)
- */
- .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
-- ushll &tmp1&.8h, &in_g&.8b, #7
-- shl &tmp1&.8h, &tmp1&.8h, #1
-- ushll &out&.8h, &in_r&.8b, #7
-- shl &out&.8h, &out&.8h, #1
-- ushll &tmp2&.8h, &in_b&.8b, #7
-- shl &tmp2&.8h, &tmp2&.8h, #1
-- sri &out&.8h, &tmp1&.8h, #5
-- sri &out&.8h, &tmp2&.8h, #11
-+ ushll \()\tmp1\().8h, \()\in_g\().8b, #7
-+ shl \()\tmp1\().8h, \()\tmp1\().8h, #1
-+ ushll \()\out\().8h, \()\in_r\().8b, #7
-+ shl \()\out\().8h, \()\out\().8h, #1
-+ ushll \()\tmp2\().8h, \()\in_b\().8b, #7
-+ shl \()\tmp2\().8h, \()\tmp2\().8h, #1
-+ sri \()\out\().8h, \()\tmp1\().8h, #5
-+ sri \()\out\().8h, \()\tmp2\().8h, #11
- .endm
-
- /*
- * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
- * returned in (out0, out1) registers pair. Requires one temporary
- * 64-bit register (tmp). 
'out1' and 'in' may overlap, the original
- * value from 'in' is lost
- */
- .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
-- shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */
-- shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */
-- sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */
-- sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */
-- sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */
-- ushr &out1&.4h, &in&.4h, #8 /* R is in place */
-- sri &out0&.4h, &tmp&.4h, #8 /* G & B is in place */
-- zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */
-- zip2 &out1&.4h, &out0&.4h, &out1&.4h
-- mov &out0&.d[0], &tmp&.d[0]
-+ shl \()\out0\().4h, \()\in\().4h, #5 /* G top 6 bits */
-+ shl \()\tmp\().4h, \()\in\().4h, #11 /* B top 5 bits */
-+ sri \()\in\().4h, \()\in\().4h, #5 /* R is ready in top bits */
-+ sri \()\out0\().4h, \()\out0\().4h, #6 /* G is ready in top bits */
-+ sri \()\tmp\().4h, \()\tmp\().4h, #5 /* B is ready in top bits */
-+ ushr \()\out1\().4h, \()\in\().4h, #8 /* R is in place */
-+ sri \()\out0\().4h, \()\tmp\().4h, #8 /* G & B is in place */
-+ zip1 \()\tmp\().4h, \()\out0\().4h, \()\out1\().4h /* everything is in place */
-+ zip2 \()\out1\().4h, \()\out0\().4h, \()\out1\().4h
-+ mov \()\out0\().d[0], \()\tmp\().d[0]
- .endm
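
For readers following the RGB565 conversions above: per pixel, the widening macros replicate the top bits of each 5- or 6-bit component into the newly created low bits (that is what the sri merges accomplish), while the narrowing macro keeps only the top 5/6/5 bits of each 8-bit component. The scalar C sketch below shows the same arithmetic; the function names are invented for illustration and are not pixman API, and the real macros of course operate on 4 or 8 pixels per NEON register.

#include <stdint.h>

/* r5g6b5 -> x8r8g8b8: widen each component by replicating its top
   bits into the low bits (the job the sri instructions do above). */
uint32_t expand_0565_to_x888(uint16_t p)
{
    uint32_t r5 = (p >> 11) & 0x1f;
    uint32_t g6 = (p >> 5)  & 0x3f;
    uint32_t b5 =  p        & 0x1f;
    uint32_t r8 = (r5 << 3) | (r5 >> 2);
    uint32_t g8 = (g6 << 2) | (g6 >> 4);
    uint32_t b8 = (b5 << 3) | (b5 >> 2);
    return (r8 << 16) | (g8 << 8) | b8;
}

/* a8r8g8b8 -> r5g6b5: keep the top 5/6/5 bits of each component
   (the job the sri merges in convert_8888_to_0565 do). */
uint16_t pack_8888_to_0565(uint32_t p)
{
    return (uint16_t)(((p >> 8) & 0xf800) |   /* R: bits 23..19 -> 15..11 */
                      ((p >> 5) & 0x07e0) |   /* G: bits 15..10 -> 10..5  */
                      ((p >> 3) & 0x001f));   /* B: bits  7..3  ->  4..0  */
}

As a sanity check, expand_0565_to_x888(0xffff) yields 0x00ffffff: bit replication maps full-intensity 565 components to full-intensity 888 components, which plain zero-padding would not.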
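
The scanline generator earlier in this patch ("head (tail_head) ... (tail_head) tail") is a software-pipelined loop: process_pixblock_head starts the first pixel block, each process_pixblock_tail_head finishes one block while starting the next, process_pixblock_tail drains the last full block, and process_trailing_pixels handles whatever is left over. The C sketch below models only that control flow, with invented names; register management, destination alignment and the nearest-scaling prologue handled by the real macro are omitted.

#include <stddef.h>

typedef void (*pixblock_fn)(void);

void composite_scanline(size_t width, size_t pixblock_size,
                        pixblock_fn head,      /* load and start the first block    */
                        pixblock_fn tail,      /* finish the last full block        */
                        pixblock_fn tail_head, /* finish block N, start block N+1   */
                        pixblock_fn trailing)  /* leftover pixels, < one full block */
{
    if (width >= pixblock_size) {
        head();
        width -= pixblock_size;
        while (width >= pixblock_size) {
            tail_head();              /* the 100: loop in the assembly */
            width -= pixblock_size;
        }
        tail();
    }
    if (width > 0)
        trailing();                   /* corresponds to process_trailing_pixels */
}

Overlapping the tail of block N with the head of block N+1 is what lets the assembly hide load latency behind arithmetic instead of stalling between blocks.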