author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000
commit    | 26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree      | f435a8308119effd964b339f76abb83a57c29483 /gfx/cairo/pixman-arm64-clang.patch
parent    | Initial commit. (diff)
download  | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz, firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1. (upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'gfx/cairo/pixman-arm64-clang.patch')
-rw-r--r-- | gfx/cairo/pixman-arm64-clang.patch | 3756
1 file changed, 3756 insertions, 0 deletions
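Context for the hunks below (this note is not part of the commit): the patch applies one mechanical rewrite throughout pixman's AArch64 NEON assembly so that, per the upstream merge request linked at the top of the patch file, the sources also assemble with clang's integrated assembler. Old `&arg&`-style macro-argument concatenation is replaced with standard `\arg` references and `\()` separators, `.func`/`.endfunc` are wrapped in `pixman_asm_function`/`pixman_end_asm_function` and guarded by `ASM_HAVE_FUNC_DIRECTIVE`, `PF` prefetch invocations gain a comma after the mnemonic, and missing commas are added to continued macro-argument lists. (In the removed lines the HTML rendering appears to have turned the literal `&reg` into the `®` sign, so `®1&.2s` was originally `&reg1&.2s`.) The snippet below is a hypothetical, self-contained sketch — the macro name `load_pair` and the registers `x0`, `v0`, `v1` are invented for illustration, not taken from the patch — showing the argument-syntax change in isolation.

        .text
.macro load_pair reg1, reg2
        /* Old gas-only spelling (as in the removed lines):            */
        /*     ld1 {&reg1&.2s}, [x0], #8                               */
        /* Portable spelling (as in the added lines): \reg1 expands    */
        /* the argument and \() separates it from the ".2s" suffix.    */
        ld1     {\()\reg1\().2s}, [x0], #8
        ld1     {\()\reg2\().2s}, [x0]
.endm

        /* Invocation is unchanged; with reg1=v0, reg2=v1 this expands to
         *     ld1 {v0.2s}, [x0], #8
         *     ld1 {v1.2s}, [x0]
         */
        load_pair v0, v1

With that pattern in mind the larger hunks read naturally: every `&regN&.8b` becomes `\()\regN\().8b`, and concatenated macro names such as `bilinear_load_mask_&mask_fmt` become `bilinear_load_mask_\mask_fmt`.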
diff --git a/gfx/cairo/pixman-arm64-clang.patch b/gfx/cairo/pixman-arm64-clang.patch new file mode 100644 index 0000000000..f059734531 --- /dev/null +++ b/gfx/cairo/pixman-arm64-clang.patch @@ -0,0 +1,3756 @@ +https://gitlab.freedesktop.org/pixman/pixman/-/merge_requests/71 + +diff --git a/gfx/cairo/libpixman/src/pixman-arm-asm.h b/gfx/cairo/libpixman/src/pixman-arm-asm.h +--- a/gfx/cairo/libpixman/src/pixman-arm-asm.h ++++ b/gfx/cairo/libpixman/src/pixman-arm-asm.h +@@ -21,17 +21,33 @@ + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Jeff Muizelaar (jeff@infidigm.net) + * + */ + + /* Supplementary macro for setting function attributes */ +-.macro pixman_asm_function fname +- .func fname +- .global fname ++.macro pixman_asm_function_impl fname ++#ifdef ASM_HAVE_FUNC_DIRECTIVE ++ .func \fname ++#endif ++ .global \fname + #ifdef __ELF__ +- .hidden fname +- .type fname, %function ++ .hidden \fname ++ .type \fname, %function + #endif +-fname: ++\fname: + .endm ++ ++.macro pixman_asm_function fname ++#ifdef ASM_LEADING_UNDERSCORE ++ pixman_asm_function_impl _\fname ++#else ++ pixman_asm_function_impl \fname ++#endif ++.endm ++ ++.macro pixman_end_asm_function ++#ifdef ASM_HAVE_FUNC_DIRECTIVE ++ .endfunc ++#endif ++.endm +diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S +--- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S ++++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S +@@ -72,219 +72,219 @@ + * format conversion, and interpolation as separate macros which can be used + * as the basic building blocks for constructing bilinear scanline functions. + */ + + .macro bilinear_load_8888 reg1, reg2, tmp + asr WTMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #2 +- ld1 {®1&.2s}, [TMP1], STRIDE +- ld1 {®2&.2s}, [TMP1] ++ ld1 {\()\reg1\().2s}, [TMP1], STRIDE ++ ld1 {\()\reg2\().2s}, [TMP1] + .endm + + .macro bilinear_load_0565 reg1, reg2, tmp + asr WTMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 +- ld1 {®2&.s}[0], [TMP1], STRIDE +- ld1 {®2&.s}[1], [TMP1] +- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp ++ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE ++ ld1 {\()\reg2\().s}[1], [TMP1] ++ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp + .endm + + .macro bilinear_load_and_vertical_interpolate_two_8888 \ + acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 + +- bilinear_load_8888 reg1, reg2, tmp1 +- umull &acc1&.8h, ®1&.8b, v28.8b +- umlal &acc1&.8h, ®2&.8b, v29.8b +- bilinear_load_8888 reg3, reg4, tmp2 +- umull &acc2&.8h, ®3&.8b, v28.8b +- umlal &acc2&.8h, ®4&.8b, v29.8b ++ bilinear_load_8888 \reg1, \reg2, \tmp1 ++ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b ++ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b ++ bilinear_load_8888 \reg3, \reg4, \tmp2 ++ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b ++ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b + .endm + + .macro bilinear_load_and_vertical_interpolate_four_8888 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + bilinear_load_and_vertical_interpolate_two_8888 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi ++ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, xacc2hi + bilinear_load_and_vertical_interpolate_two_8888 \ +- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++ \yacc1, \yacc2, \yreg1, \yreg2, 
\yreg3, \yreg4, \yacc2lo, \yacc2hi + .endm + + .macro vzip reg1, reg2 +- zip1 v24.8b, reg1, reg2 +- zip2 reg2, reg1, reg2 +- mov reg1, v24.8b ++ zip1 v24.8b, \reg1, \reg2 ++ zip2 \reg2, \reg1, \reg2 ++ mov \reg1, v24.8b + .endm + + .macro vuzp reg1, reg2 +- uzp1 v24.8b, reg1, reg2 +- uzp2 reg2, reg1, reg2 +- mov reg1, v24.8b ++ uzp1 v24.8b, \reg1, \reg2 ++ uzp2 \reg2, \reg1, \reg2 ++ mov \reg1, v24.8b + .endm + + .macro bilinear_load_and_vertical_interpolate_two_0565 \ + acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi + asr WTMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr WTMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&acc2&.s}[0], [TMP1], STRIDE +- ld1 {&acc2&.s}[2], [TMP2], STRIDE +- ld1 {&acc2&.s}[1], [TMP1] +- ld1 {&acc2&.s}[3], [TMP2] +- convert_0565_to_x888 acc2, reg3, reg2, reg1 +- vzip ®1&.8b, ®3&.8b +- vzip ®2&.8b, ®4&.8b +- vzip ®3&.8b, ®4&.8b +- vzip ®1&.8b, ®2&.8b +- umull &acc1&.8h, ®1&.8b, v28.8b +- umlal &acc1&.8h, ®2&.8b, v29.8b +- umull &acc2&.8h, ®3&.8b, v28.8b +- umlal &acc2&.8h, ®4&.8b, v29.8b ++ ld1 {\()\acc2\().s}[0], [TMP1], STRIDE ++ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE ++ ld1 {\()\acc2\().s}[1], [TMP1] ++ ld1 {\()\acc2\().s}[3], [TMP2] ++ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 ++ vzip \()\reg1\().8b, \()\reg3\().8b ++ vzip \()\reg2\().8b, \()\reg4\().8b ++ vzip \()\reg3\().8b, \()\reg4\().8b ++ vzip \()\reg1\().8b, \()\reg2\().8b ++ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b ++ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b ++ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b ++ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b + .endm + + .macro bilinear_load_and_vertical_interpolate_four_0565 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + asr WTMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr WTMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&xacc2&.s}[0], [TMP1], STRIDE +- ld1 {&xacc2&.s}[2], [TMP2], STRIDE +- ld1 {&xacc2&.s}[1], [TMP1] +- ld1 {&xacc2&.s}[3], [TMP2] +- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 ++ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE ++ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE ++ ld1 {\()\xacc2\().s}[1], [TMP1] ++ ld1 {\()\xacc2\().s}[3], [TMP2] ++ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 + asr WTMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr WTMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&yacc2&.s}[0], [TMP1], STRIDE +- vzip &xreg1&.8b, &xreg3&.8b +- ld1 {&yacc2&.s}[2], [TMP2], STRIDE +- vzip &xreg2&.8b, &xreg4&.8b +- ld1 {&yacc2&.s}[1], [TMP1] +- vzip &xreg3&.8b, &xreg4&.8b +- ld1 {&yacc2&.s}[3], [TMP2] +- vzip &xreg1&.8b, &xreg2&.8b +- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 +- umull &xacc1&.8h, &xreg1&.8b, v28.8b +- vzip &yreg1&.8b, &yreg3&.8b +- umlal &xacc1&.8h, &xreg2&.8b, v29.8b +- vzip &yreg2&.8b, &yreg4&.8b +- umull &xacc2&.8h, &xreg3&.8b, v28.8b +- vzip &yreg3&.8b, &yreg4&.8b +- umlal &xacc2&.8h, &xreg4&.8b, v29.8b +- vzip &yreg1&.8b, &yreg2&.8b +- umull &yacc1&.8h, &yreg1&.8b, v28.8b +- umlal &yacc1&.8h, &yreg2&.8b, v29.8b +- umull &yacc2&.8h, &yreg3&.8b, v28.8b +- umlal &yacc2&.8h, &yreg4&.8b, v29.8b ++ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE ++ vzip \()\xreg1\().8b, \()\xreg3\().8b ++ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE ++ vzip \()\xreg2\().8b, \()\xreg4\().8b ++ ld1 {\()\yacc2\().s}[1], [TMP1] ++ vzip \()\xreg3\().8b, \()\xreg4\().8b ++ ld1 
{\()\yacc2\().s}[3], [TMP2] ++ vzip \()\xreg1\().8b, \()\xreg2\().8b ++ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 ++ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b ++ vzip \()\yreg1\().8b, \()\yreg3\().8b ++ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b ++ vzip \()\yreg2\().8b, \()\yreg4\().8b ++ umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b ++ vzip \()\yreg3\().8b, \()\yreg4\().8b ++ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b ++ vzip \()\yreg1\().8b, \()\yreg2\().8b ++ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b ++ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b ++ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b ++ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b + .endm + + .macro bilinear_store_8888 numpix, tmp1, tmp2 +-.if numpix == 4 ++.if \numpix == 4 + st1 {v0.2s, v1.2s}, [OUT], #16 +-.elseif numpix == 2 ++.elseif \numpix == 2 + st1 {v0.2s}, [OUT], #8 +-.elseif numpix == 1 ++.elseif \numpix == 1 + st1 {v0.s}[0], [OUT], #4 + .else +- .error bilinear_store_8888 numpix is unsupported ++ .error bilinear_store_8888 \numpix is unsupported + .endif + .endm + + .macro bilinear_store_0565 numpix, tmp1, tmp2 + vuzp v0.8b, v1.8b + vuzp v2.8b, v3.8b + vuzp v1.8b, v3.8b + vuzp v0.8b, v2.8b +- convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 +-.if numpix == 4 ++ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2 ++.if \numpix == 4 + st1 {v1.4h}, [OUT], #8 +-.elseif numpix == 2 ++.elseif \numpix == 2 + st1 {v1.s}[0], [OUT], #4 +-.elseif numpix == 1 ++.elseif \numpix == 1 + st1 {v1.h}[0], [OUT], #2 + .else +- .error bilinear_store_0565 numpix is unsupported ++ .error bilinear_store_0565 \numpix is unsupported + .endif + .endm + + + /* + * Macros for loading mask pixels into register 'mask'. + * dup must be done in somewhere else. + */ + .macro bilinear_load_mask_x numpix, mask + .endm + + .macro bilinear_load_mask_8 numpix, mask +-.if numpix == 4 +- ld1 {&mask&.s}[0], [MASK], #4 +-.elseif numpix == 2 +- ld1 {&mask&.h}[0], [MASK], #2 +-.elseif numpix == 1 +- ld1 {&mask&.b}[0], [MASK], #1 ++.if \numpix == 4 ++ ld1 {\()\mask\().s}[0], [MASK], #4 ++.elseif \numpix == 2 ++ ld1 {\()\mask\().h}[0], [MASK], #2 ++.elseif \numpix == 1 ++ ld1 {\()\mask\().b}[0], [MASK], #1 + .else +- .error bilinear_load_mask_8 numpix is unsupported ++ .error bilinear_load_mask_8 \numpix is unsupported + .endif +- prfm PREFETCH_MODE, [MASK, #prefetch_offset] ++ prfum PREFETCH_MODE, [MASK, #(prefetch_offset)] + .endm + + .macro bilinear_load_mask mask_fmt, numpix, mask +- bilinear_load_mask_&mask_fmt numpix, mask ++ bilinear_load_mask_\mask_fmt \numpix, \mask + .endm + + + /* + * Macros for loading destination pixels into register 'dst0' and 'dst1'. + * Interleave should be done somewhere else. 
+ */ + .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01 + .endm + + .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01 + .endm + + .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 +-.if numpix == 4 +- ld1 {&dst0&.2s, &dst1&.2s}, [OUT] +-.elseif numpix == 2 +- ld1 {&dst0&.2s}, [OUT] +-.elseif numpix == 1 +- ld1 {&dst0&.s}[0], [OUT] ++.if \numpix == 4 ++ ld1 {\()\dst0\().2s, \()\dst1\().2s}, [OUT] ++.elseif \numpix == 2 ++ ld1 {\()\dst0\().2s}, [OUT] ++.elseif \numpix == 1 ++ ld1 {\()\dst0\().s}[0], [OUT] + .else +- .error bilinear_load_dst_8888 numpix is unsupported ++ .error bilinear_load_dst_8888 \numpix is unsupported + .endif +- mov &dst01&.d[0], &dst0&.d[0] +- mov &dst01&.d[1], &dst1&.d[0] ++ mov \()\dst01\().d[0], \()\dst0\().d[0] ++ mov \()\dst01\().d[1], \()\dst1\().d[0] + prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)] + .endm + + .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 +- bilinear_load_dst_8888 numpix, dst0, dst1, dst01 ++ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01 +- bilinear_load_dst_8888 numpix, dst0, dst1, dst01 ++ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01 +- bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01 ++ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 + .endm + + /* + * Macros for duplicating partially loaded mask to fill entire register. + * We will apply mask to interleaved source pixels, that is + * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3) + * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3) + * So, we need to duplicate loaded mask into whole register. +@@ -293,84 +293,85 @@ + * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) + * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) + * We can do some optimizations for this including last pixel cases. + */ + .macro bilinear_duplicate_mask_x numpix, mask + .endm + + .macro bilinear_duplicate_mask_8 numpix, mask +-.if numpix == 4 +- dup &mask&.2s, &mask&.s[0] +-.elseif numpix == 2 +- dup &mask&.4h, &mask&.h[0] +-.elseif numpix == 1 +- dup &mask&.8b, &mask&.b[0] ++.if \numpix == 4 ++ dup \()\mask\().2s, \()\mask\().s[0] ++.elseif \numpix == 2 ++ dup \()\mask\().4h, \()\mask\().h[0] ++.elseif \numpix == 1 ++ dup \()\mask\().8b, \()\mask\().b[0] + .else +- .error bilinear_duplicate_mask_8 is unsupported ++ .error bilinear_duplicate_\mask_8 is unsupported + .endif + .endm + + .macro bilinear_duplicate_mask mask_fmt, numpix, mask +- bilinear_duplicate_mask_&mask_fmt numpix, mask ++ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask + .endm + + /* + * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form. + * Interleave should be done when maks is enabled or operator is 'over'. 
+ */ + .macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01 +- vuzp &src0&.8b, &src1&.8b +- vuzp &dst0&.8b, &dst1&.8b +- vuzp &src0&.8b, &src1&.8b +- vuzp &dst0&.8b, &dst1&.8b +- mov &src01&.d[1], &src1&.d[0] +- mov &src01&.d[0], &src0&.d[0] +- mov &dst01&.d[1], &dst1&.d[0] +- mov &dst01&.d[0], &dst0&.d[0] ++ vuzp \()\src0\().8b, \()\src1\().8b ++ vuzp \()\dst0\().8b, \()\dst1\().8b ++ vuzp \()\src0\().8b, \()\src1\().8b ++ vuzp \()\dst0\().8b, \()\dst1\().8b ++ mov \()\src01\().d[1], \()\src1\().d[0] ++ mov \()\src01\().d[0], \()\src0\().d[0] ++ mov \()\dst01\().d[1], \()\dst1\().d[0] ++ mov \()\dst01\().d[0], \()\dst0\().d[0] + .endm + + .macro bilinear_interleave_src_dst_x_src \ + numpix, src0, src1, src01, dst0, dst1, dst01 + .endm + + .macro bilinear_interleave_src_dst_x_over \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_interleave_src_dst_x_add \ + numpix, src0, src1, src01, dst0, dst1, dst01 +- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++ ++ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_interleave_src_dst_8_src \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_interleave_src_dst_8_over \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_interleave_src_dst_8_add \ + numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_interleave_src_dst \ + mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01 + +- bilinear_interleave_src_dst_&mask_fmt&_&op \ +- numpix, src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \ ++ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01 + .endm + + + /* + * Macros for applying masks to src pixels. (see combine_mask_u() function) + * src, dst should be in interleaved form. + * mask register should be in form (m0, m1, m2, m3). 
+ */ +@@ -378,191 +379,191 @@ + numpix, src0, src1, src01, mask, \ + tmp01, tmp23, tmp45, tmp67 + .endm + + .macro bilinear_apply_mask_to_src_8 \ + numpix, src0, src1, src01, mask, \ + tmp01, tmp23, tmp45, tmp67 + +- umull &tmp01&.8h, &src0&.8b, &mask&.8b +- umull &tmp23&.8h, &src1&.8b, &mask&.8b ++ umull \()\tmp01\().8h, \()\src0\().8b, \()\mask\().8b ++ umull \()\tmp23\().8h, \()\src1\().8b, \()\mask\().8b + /* bubbles */ +- urshr &tmp45&.8h, &tmp01&.8h, #8 +- urshr &tmp67&.8h, &tmp23&.8h, #8 ++ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8 ++ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8 + /* bubbles */ +- raddhn &src0&.8b, &tmp45&.8h, &tmp01&.8h +- raddhn &src1&.8b, &tmp67&.8h, &tmp23&.8h +- mov &src01&.d[0], &src0&.d[0] +- mov &src01&.d[1], &src1&.d[0] ++ raddhn \()\src0\().8b, \()\tmp45\().8h, \()\tmp01\().8h ++ raddhn \()\src1\().8b, \()\tmp67\().8h, \()\tmp23\().8h ++ mov \()\src01\().d[0], \()\src0\().d[0] ++ mov \()\src01\().d[1], \()\src1\().d[0] + .endm + + .macro bilinear_apply_mask_to_src \ + mask_fmt, numpix, src0, src1, src01, mask, \ + tmp01, tmp23, tmp45, tmp67 + +- bilinear_apply_mask_to_src_&mask_fmt \ +- numpix, src0, src1, src01, mask, \ +- tmp01, tmp23, tmp45, tmp67 ++ bilinear_apply_mask_to_src_\()\mask_fmt \ ++ \numpix, \src0, \src1, \src01, \mask, \ ++ \tmp01, \tmp23, \tmp45, \tmp67 + .endm + + + /* + * Macros for combining src and destination pixels. + * Interleave or not is depending on operator 'op'. + */ + .macro bilinear_combine_src \ + numpix, src0, src1, src01, dst0, dst1, dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + .endm + + .macro bilinear_combine_over \ + numpix, src0, src1, src01, dst0, dst1, dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + +- dup &tmp8&.2s, &src1&.s[1] ++ dup \()\tmp8\().2s, \()\src1\().s[1] + /* bubbles */ +- mvn &tmp8&.8b, &tmp8&.8b ++ mvn \()\tmp8\().8b, \()\tmp8\().8b + /* bubbles */ +- umull &tmp01&.8h, &dst0&.8b, &tmp8&.8b ++ umull \()\tmp01\().8h, \()\dst0\().8b, \()\tmp8\().8b + /* bubbles */ +- umull &tmp23&.8h, &dst1&.8b, &tmp8&.8b ++ umull \()\tmp23\().8h, \()\dst1\().8b, \()\tmp8\().8b + /* bubbles */ +- urshr &tmp45&.8h, &tmp01&.8h, #8 +- urshr &tmp67&.8h, &tmp23&.8h, #8 ++ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8 ++ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8 + /* bubbles */ +- raddhn &dst0&.8b, &tmp45&.8h, &tmp01&.8h +- raddhn &dst1&.8b, &tmp67&.8h, &tmp23&.8h +- mov &dst01&.d[0], &dst0&.d[0] +- mov &dst01&.d[1], &dst1&.d[0] ++ raddhn \()\dst0\().8b, \()\tmp45\().8h, \()\tmp01\().8h ++ raddhn \()\dst1\().8b, \()\tmp67\().8h, \()\tmp23\().8h ++ mov \()\dst01\().d[0], \()\dst0\().d[0] ++ mov \()\dst01\().d[1], \()\dst1\().d[0] + /* bubbles */ +- uqadd &src0&.8b, &dst0&.8b, &src0&.8b +- uqadd &src1&.8b, &dst1&.8b, &src1&.8b +- mov &src01&.d[0], &src0&.d[0] +- mov &src01&.d[1], &src1&.d[0] ++ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b ++ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b ++ mov \()\src01\().d[0], \()\src0\().d[0] ++ mov \()\src01\().d[1], \()\src1\().d[0] + .endm + + .macro bilinear_combine_add \ + numpix, src0, src1, src01, dst0, dst1, dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + +- uqadd &src0&.8b, &dst0&.8b, &src0&.8b +- uqadd &src1&.8b, &dst1&.8b, &src1&.8b +- mov &src01&.d[0], &src0&.d[0] +- mov &src01&.d[1], &src1&.d[0] ++ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b ++ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b ++ mov \()\src01\().d[0], \()\src0\().d[0] ++ mov \()\src01\().d[1], \()\src1\().d[0] + .endm + + .macro bilinear_combine \ + op, numpix, src0, src1, src01, dst0, dst1, 
dst01, \ + tmp01, tmp23, tmp45, tmp67, tmp8 + +- bilinear_combine_&op \ +- numpix, src0, src1, src01, dst0, dst1, dst01, \ +- tmp01, tmp23, tmp45, tmp67, tmp8 ++ bilinear_combine_\()\op \ ++ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \ ++ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8 + .endm + + /* + * Macros for final deinterleaving of destination pixels if needed. + */ + .macro bilinear_deinterleave numpix, dst0, dst1, dst01 +- vuzp &dst0&.8b, &dst1&.8b ++ vuzp \()\dst0\().8b, \()\dst1\().8b + /* bubbles */ +- vuzp &dst0&.8b, &dst1&.8b +- mov &dst01&.d[0], &dst0&.d[0] +- mov &dst01&.d[1], &dst1&.d[0] ++ vuzp \()\dst0\().8b, \()\dst1\().8b ++ mov \()\dst01\().d[0], \()\dst0\().d[0] ++ mov \()\dst01\().d[1], \()\dst1\().d[0] + .endm + + .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 + .endm + + .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01 +- bilinear_deinterleave numpix, dst0, dst1, dst01 ++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 + .endm + + .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01 +- bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01 ++ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 + .endm + + + .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op +- bilinear_load_&src_fmt v0, v1, v2 +- bilinear_load_mask mask_fmt, 1, v4 +- bilinear_load_dst dst_fmt, op, 1, v18, v19, v9 ++ bilinear_load_\()\src_fmt v0, v1, v2 ++ bilinear_load_mask \mask_fmt, 1, v4 ++ bilinear_load_dst \dst_fmt, \op, 1, v18, v19, v9 + umull v2.8h, v0.8b, v28.8b + umlal v2.8h, v1.8b, v29.8b + /* 5 cycles bubble */ + ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v2.4h, v15.h[0] + umlal2 v0.4s, v2.8h, v15.h[0] + /* 5 cycles bubble */ +- bilinear_duplicate_mask mask_fmt, 1, v4 ++ bilinear_duplicate_mask \mask_fmt, 1, v4 + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + /* 3 cycles bubble */ + xtn v0.8b, v0.8h + /* 1 cycle bubble */ + bilinear_interleave_src_dst \ +- mask_fmt, op, 1, v0, v1, v0, v18, v19, v9 ++ \mask_fmt, \op, 1, v0, v1, v0, v18, v19, v9 + bilinear_apply_mask_to_src \ +- mask_fmt, 1, v0, v1, v0, v4, \ ++ \mask_fmt, 1, v0, v1, v0, v4, \ + v3, v8, v10, v11 + bilinear_combine \ +- op, 1, v0, v1, v0, v18, v19, v9, \ ++ \op, 1, v0, v1, v0, v18, v19, v9, \ + v3, v8, v10, v11, v5 +- bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0 +- bilinear_store_&dst_fmt 1, v17, v18 ++ bilinear_deinterleave_dst \mask_fmt, \op, 1, v0, v1, v0 ++ bilinear_store_\()\dst_fmt 1, v17, v18 + .endm + + .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op +- bilinear_load_and_vertical_interpolate_two_&src_fmt \ ++ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ + v1, v11, v18, v19, v20, v21, v22, v23 +- 
bilinear_load_mask mask_fmt, 2, v4 +- bilinear_load_dst dst_fmt, op, 2, v18, v19, v9 ++ bilinear_load_mask \mask_fmt, 2, v4 ++ bilinear_load_dst \dst_fmt, \op, 2, v18, v19, v9 + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v10.4s, v11.4h, v15.h[4] + umlal2 v10.4s, v11.8h, v15.h[4] + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) +- bilinear_duplicate_mask mask_fmt, 2, v4 ++ bilinear_duplicate_mask \mask_fmt, 2, v4 + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + xtn v0.8b, v0.8h + bilinear_interleave_src_dst \ +- mask_fmt, op, 2, v0, v1, v0, v18, v19, v9 ++ \mask_fmt, \op, 2, v0, v1, v0, v18, v19, v9 + bilinear_apply_mask_to_src \ +- mask_fmt, 2, v0, v1, v0, v4, \ ++ \mask_fmt, 2, v0, v1, v0, v4, \ + v3, v8, v10, v11 + bilinear_combine \ +- op, 2, v0, v1, v0, v18, v19, v9, \ ++ \op, 2, v0, v1, v0, v18, v19, v9, \ + v3, v8, v10, v11, v5 +- bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0 +- bilinear_store_&dst_fmt 2, v16, v17 ++ bilinear_deinterleave_dst \mask_fmt, \op, 2, v0, v1, v0 ++ bilinear_store_\()\dst_fmt 2, v16, v17 + .endm + + .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op +- bilinear_load_and_vertical_interpolate_four_&src_fmt \ +- v1, v11, v4, v5, v6, v7, v22, v23 \ ++ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ ++ v1, v11, v4, v5, v6, v7, v22, v23, \ + v3, v9, v16, v17, v20, v21, v18, v19 + prfm PREFETCH_MODE, [TMP1, PF_OFFS] + sub TMP1, TMP1, STRIDE + prfm PREFETCH_MODE, [TMP1, PF_OFFS] + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS +@@ -575,33 +576,33 @@ + ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v8.4s, v9.4h, v15.h[4] + umlal2 v8.4s, v9.8h, v15.h[4] + add v12.8h, v12.8h, v13.8h + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) +- bilinear_load_mask mask_fmt, 4, v4 +- bilinear_duplicate_mask mask_fmt, 4, v4 ++ bilinear_load_mask \mask_fmt, 4, v4 ++ bilinear_duplicate_mask \mask_fmt, 4, v4 + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + xtn v0.8b, v0.8h + xtn v1.8b, v2.8h + add v12.8h, v12.8h, v13.8h +- bilinear_load_dst dst_fmt, op, 4, v2, v3, v21 ++ bilinear_load_dst \dst_fmt, \op, 4, v2, v3, v21 + bilinear_interleave_src_dst \ +- mask_fmt, op, 4, v0, v1, v0, v2, v3, v11 ++ \mask_fmt, \op, 4, v0, v1, v0, v2, v3, v11 + bilinear_apply_mask_to_src \ +- mask_fmt, 4, v0, v1, v0, v4, \ ++ \mask_fmt, 4, v0, v1, v0, v4, \ + v6, v8, v9, v10 + bilinear_combine \ +- op, 4, v0, v1, v0, v2, v3, v1, \ ++ \op, 4, v0, v1, v0, v2, v3, v1, \ + v6, v8, v9, v10, v23 +- bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0 +- bilinear_store_&dst_fmt 4, v6, v7 ++ bilinear_deinterleave_dst \mask_fmt, \op, 4, v0, v1, v0 ++ bilinear_store_\()\dst_fmt 4, v6, v7 + .endm + + .set BILINEAR_FLAG_USE_MASK, 1 + .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 + + /* + * Main template macro for generating NEON optimized bilinear scanline functions. 
+ * +@@ -631,24 +632,24 @@ + bilinear_process_four_pixels, \ + bilinear_process_pixblock_head, \ + bilinear_process_pixblock_tail, \ + bilinear_process_pixblock_tail_head, \ + pixblock_size, \ + prefetch_distance, \ + flags + +-pixman_asm_function fname +-.if pixblock_size == 8 +-.elseif pixblock_size == 4 ++pixman_asm_function \fname ++.if \pixblock_size == 8 ++.elseif \pixblock_size == 4 + .else + .error unsupported pixblock size + .endif + +-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 ++.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0 + OUT .req x0 + TOP .req x1 + BOTTOM .req x2 + WT .req x3 + WWT .req w3 + WB .req x4 + WWB .req w4 + X .req w5 +@@ -694,32 +695,32 @@ pixman_asm_function fname + PF_OFFS .req x12 + TMP3 .req x13 + WTMP3 .req w13 + TMP4 .req x14 + WTMP4 .req w14 + STRIDE .req x15 + DUMMY .req x30 + +- .set prefetch_offset, prefetch_distance ++ .set prefetch_offset, \prefetch_distance + + stp x29, x30, [sp, -16]! + mov x29, sp + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + stp x10, x11, [x29, -80] + stp x12, x13, [x29, -96] + stp x14, x15, [x29, -112] + str x8, [x29, -120] + ldr w8, [x29, 16] + sub sp, sp, 120 + .endif + +- mov WTMP1, #prefetch_distance ++ mov WTMP1, #\prefetch_distance + umull PF_OFFS, WTMP1, UX + + sub STRIDE, BOTTOM, TOP + .unreq BOTTOM + + cmp WIDTH, #0 + ble 300f + +@@ -730,73 +731,73 @@ pixman_asm_function fname + mov v25.d[0], v12.d[1] + mov v26.d[0], v13.d[0] + add v25.4h, v25.4h, v26.4h + mov v12.d[1], v25.d[0] + + /* ensure good destination alignment */ + cmp WIDTH, #1 + blt 100f +- tst OUT, #(1 << dst_bpp_shift) ++ tst OUT, #(1 << \dst_bpp_shift) + beq 100f + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h +- bilinear_process_last_pixel ++ \bilinear_process_last_pixel + sub WIDTH, WIDTH, #1 + 100: + add v13.8h, v13.8h, v13.8h + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + + cmp WIDTH, #2 + blt 100f +- tst OUT, #(1 << (dst_bpp_shift + 1)) ++ tst OUT, #(1 << (\dst_bpp_shift + 1)) + beq 100f +- bilinear_process_two_pixels ++ \bilinear_process_two_pixels + sub WIDTH, WIDTH, #2 + 100: +-.if pixblock_size == 8 ++.if \pixblock_size == 8 + cmp WIDTH, #4 + blt 100f +- tst OUT, #(1 << (dst_bpp_shift + 2)) ++ tst OUT, #(1 << (\dst_bpp_shift + 2)) + beq 100f +- bilinear_process_four_pixels ++ \bilinear_process_four_pixels + sub WIDTH, WIDTH, #4 + 100: + .endif +- subs WIDTH, WIDTH, #pixblock_size ++ subs WIDTH, WIDTH, #\pixblock_size + blt 100f +- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) +- bilinear_process_pixblock_head +- subs WIDTH, WIDTH, #pixblock_size ++ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) ++ \bilinear_process_pixblock_head ++ subs WIDTH, WIDTH, #\pixblock_size + blt 500f + 0: +- bilinear_process_pixblock_tail_head +- subs WIDTH, WIDTH, #pixblock_size ++ \bilinear_process_pixblock_tail_head ++ subs WIDTH, WIDTH, #\pixblock_size + bge 0b + 500: +- bilinear_process_pixblock_tail ++ \bilinear_process_pixblock_tail + 100: +-.if pixblock_size == 8 ++.if \pixblock_size == 8 + tst WIDTH, #4 + beq 200f +- bilinear_process_four_pixels ++ \bilinear_process_four_pixels + 200: + .endif + /* handle the remaining trailing pixels */ + tst WIDTH, #2 + beq 200f +- bilinear_process_two_pixels ++ \bilinear_process_two_pixels + 200: + tst WIDTH, #1 + beq 300f +- bilinear_process_last_pixel ++ \bilinear_process_last_pixel + 300: + +-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 ++.if ((\flags) & 
BILINEAR_FLAG_USE_MASK) == 0 + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x10, x11, [x29, -80] + ldp x12, x13, [x29, -96] + ldp x14, x15, [x29, -112] + mov sp, x29 + ldp x29, x30, [sp], 16 +@@ -824,21 +825,21 @@ 300: + .unreq WIDTH + .unreq TMP1 + .unreq WTMP1 + .unreq TMP2 + .unreq PF_OFFS + .unreq TMP3 + .unreq TMP4 + .unreq STRIDE +-.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0 ++.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0 + .unreq MASK + .endif + +-.endfunc ++pixman_end_asm_function + + .endm + + /* src_8888_8_8888 */ + .macro bilinear_src_8888_8_8888_process_last_pixel + bilinear_interpolate_last_pixel 8888, 8, 8888, src + .endm + +diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S +--- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S ++++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S +@@ -262,64 +262,64 @@ + uqadd v18.8b, v0.8b, v22.8b + uqadd v19.8b, v1.8b, v23.8b + shrn v6.8b, v4.8h, #8 + fetch_src_pixblock + shrn v7.8b, v4.8h, #3 + sli v4.8h, v4.8h, #5 + ushll v14.8h, v17.8b, #7 + sli v14.8h, v14.8h, #1 +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + ushll v8.8h, v19.8b, #7 + sli v8.8h, v8.8h, #1 +- PF tst PF_CTL, #0xF ++ PF tst, PF_CTL, #0xF + sri v6.8b, v6.8b, #5 +- PF beq 10f +- PF add PF_X, PF_X, #8 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 + 10: + mvn v3.8b, v3.8b +- PF beq 10f +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + sri v7.8b, v7.8b, #6 + shrn v30.8b, v4.8h, #2 + umull v10.8h, v3.8b, v6.8b +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + umull v11.8h, v3.8b, v7.8b + umull v12.8h, v3.8b, v30.8b +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + sri v14.8h, v8.8h, #5 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + ushll v9.8h, v18.8b, #7 + sli v9.8h, v9.8h, #1 + urshr v17.8h, v10.8h, #8 +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + urshr v19.8h, v11.8h, #8 + urshr v18.8h, v12.8h, #8 +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + raddhn v20.8b, v10.8h, v17.8h + raddhn v23.8b, v11.8h, v19.8h +- PF ble 10f +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_SRC, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_SRC, #1 + 10: + raddhn v22.8b, v12.8h, v18.8h + st1 {v14.8h}, [DST_W], #16 + .endm + + #else + + /* If we did not care much about the performance, we would just use this... 
*/ +@@ -469,42 +469,42 @@ generate_composite_function \ + sri v14.8h, v8.8h, #5 + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] + .endm + + .macro pixman_composite_src_8888_0565_process_pixblock_tail_head + sri v14.8h, v8.8h, #5 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + fetch_src_pixblock +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + sri v14.8h, v9.8h, #11 + mov v28.d[0], v14.d[0] + mov v29.d[0], v14.d[1] +- PF cmp PF_X, ORIG_W +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF cmp, PF_X, ORIG_W ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + ushll v8.8h, v1.8b, #7 + sli v8.8h, v8.8h, #1 + st1 {v14.8h}, [DST_W], #16 +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + ushll v14.8h, v2.8b, #7 + sli v14.8h, v14.8h, #1 +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + ushll v9.8h, v0.8b, #7 + sli v9.8h, v9.8h, #1 + .endm + + generate_composite_function \ + pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ +@@ -561,41 +561,41 @@ generate_composite_function \ + uqadd v31.8b, v3.8b, v7.8b + .endm + + .macro pixman_composite_add_8_8_process_pixblock_tail + .endm + + .macro pixman_composite_add_8_8_process_pixblock_tail_head + fetch_src_pixblock +- PF add PF_X, PF_X, #32 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #32 ++ PF tst, PF_CTL, #0xF + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 +- PF beq 10f +- PF add PF_X, PF_X, #32 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #32 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF cmp PF_X, ORIG_W +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF cmp, PF_X, ORIG_W ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + uqadd v28.8b, v0.8b, v4.8b +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + uqadd v29.8b, v1.8b, v5.8b + uqadd v30.8b, v2.8b, v6.8b + uqadd v31.8b, v3.8b, v7.8b + .endm + + generate_composite_function \ + pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ +@@ -607,41 +607,41 @@ generate_composite_function \ + pixman_composite_add_8_8_process_pixblock_head, \ + pixman_composite_add_8_8_process_pixblock_tail, \ + pixman_composite_add_8_8_process_pixblock_tail_head 
+ + /******************************************************************************/ + + .macro pixman_composite_add_8888_8888_process_pixblock_tail_head + fetch_src_pixblock +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF cmp PF_X, ORIG_W +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF cmp, PF_X, ORIG_W ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + uqadd v28.8b, v0.8b, v4.8b +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + uqadd v29.8b, v1.8b, v5.8b + uqadd v30.8b, v2.8b, v6.8b + uqadd v31.8b, v3.8b, v7.8b + .endm + + generate_composite_function \ + pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ +@@ -684,55 +684,55 @@ generate_composite_function_single_scanl + raddhn v29.8b, v15.8h, v9.8h + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + .endm + + .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + urshr v14.8h, v8.8h, #8 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + fetch_src_pixblock +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + mvn v22.8b, v3.8b +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v8.8h, v22.8b, v4.8b +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v9.8h, v22.8b, v5.8b +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + umull v10.8h, v22.8b, v6.8b +- PF ble 10f +- PF lsl DUMMY, DST_STRIDE, 
#dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + umull v11.8h, v22.8b, v7.8b + .endm + + generate_composite_function_single_scanline \ + pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ +@@ -754,59 +754,59 @@ generate_composite_function_single_scanl + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + .endm + + .macro pixman_composite_over_8888_8888_process_pixblock_tail_head + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + urshr v14.8h, v8.8h, #8 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + urshr v15.8h, v9.8h, #8 + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + fetch_src_pixblock +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + mvn v22.8b, v3.8b +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v8.8h, v22.8b, v4.8b +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v9.8h, v22.8b, v5.8b +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + umull v10.8h, v22.8b, v6.8b +- PF ble 10f +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + umull v11.8h, v22.8b, v7.8b + .endm + + generate_composite_function \ + pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ +@@ -860,40 +860,40 @@ generate_composite_function_single_scanl + urshr v16.8h, v10.8h, #8 + urshr v17.8h, v11.8h, #8 + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h + raddhn v30.8b, v16.8h, v10.8h + raddhn v31.8b, v17.8h, v11.8h + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + uqadd v28.8b, v0.8b, v28.8b +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0x0F +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0x0F ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + umull 
v8.8h, v24.8b, v4.8b +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + umull v9.8h, v24.8b, v5.8b +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v10.8h, v24.8b, v6.8b +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF subs, PF_CTL, PF_CTL, #0x10 + umull v11.8h, v24.8b, v7.8b +- PF ble 10f +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + .endm + + .macro pixman_composite_over_n_8888_init + mov v3.s[0], w4 + dup v0.8b, v3.b[0] + dup v1.8b, v3.b[1] +@@ -912,52 +912,52 @@ generate_composite_function \ + pixman_composite_over_8888_8888_process_pixblock_head, \ + pixman_composite_over_8888_8888_process_pixblock_tail, \ + pixman_composite_over_n_8888_process_pixblock_tail_head + + /******************************************************************************/ + + .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head + urshr v14.8h, v8.8h, #8 +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF + urshr v15.8h, v9.8h, #8 + urshr v12.8h, v10.8h, #8 + urshr v13.8h, v11.8h, #8 +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v28.8b, v14.8h, v8.8h + raddhn v29.8b, v15.8h, v9.8h +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + raddhn v30.8b, v12.8h, v10.8h + raddhn v31.8b, v13.8h, v11.8h + uqadd v28.8b, v0.8b, v28.8b + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32 + mvn v22.8b, v3.8b +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF blt 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF blt, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v8.8h, v22.8b, v4.8b +- PF blt 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF blt, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v9.8h, v22.8b, v5.8b + umull v10.8h, v22.8b, v6.8b +- PF blt 10f +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF blt, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + umull v11.8h, v22.8b, v7.8b + .endm + + .macro pixman_composite_over_reverse_n_8888_init + mov v7.s[0], w4 + dup v4.8b, v7.b[0] + dup v5.8b, v7.b[1] +@@ -1405,45 +1405,45 @@ generate_composite_function \ + rshrn v28.8b, v8.8h, #8 + rshrn v29.8b, v9.8h, #8 + rshrn v30.8b, v10.8h, #8 + rshrn v31.8b, v11.8h, #8 + .endm + + .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head + fetch_mask_pixblock +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + rshrn v28.8b, v8.8h, #8 +- PF tst PF_CTL, #0x0F ++ PF tst, PF_CTL, #0x0F + rshrn v29.8b, v9.8h, #8 +- PF beq 10f +- PF add PF_X, PF_X, #8 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 + 10: + rshrn v30.8b, v10.8h, #8 +- PF beq 10f +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + rshrn v31.8b, v11.8h, #8 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + 
umull v8.8h, v24.8b, v0.8b +- PF lsl DUMMY, PF_X, #mask_bpp_shift +- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++ PF lsl, DUMMY, PF_X, #mask_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] + umull v9.8h, v24.8b, v1.8b +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v10.8h, v24.8b, v2.8b +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v11.8h, v24.8b, v3.8b +- PF ble 10f +- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift +- PF ldrsb DUMMY, [PF_MASK, DUMMY] +- PF add PF_MASK, PF_MASK, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb, DUMMY, [PF_MASK, DUMMY] ++ PF add, PF_MASK, PF_MASK, #1 + 10: + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + ursra v8.8h, v8.8h, #8 + ursra v9.8h, v9.8h, #8 + ursra v10.8h, v10.8h, #8 + ursra v11.8h, v11.8h, #8 + .endm + +@@ -1486,45 +1486,45 @@ generate_composite_function \ + rshrn v28.8b, v0.8h, #8 + rshrn v29.8b, v1.8h, #8 + rshrn v30.8b, v2.8h, #8 + rshrn v31.8b, v3.8h, #8 + .endm + + .macro pixman_composite_src_n_8_8_process_pixblock_tail_head + fetch_mask_pixblock +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + rshrn v28.8b, v0.8h, #8 +- PF tst PF_CTL, #0x0F ++ PF tst, PF_CTL, #0x0F + rshrn v29.8b, v1.8h, #8 +- PF beq 10f +- PF add PF_X, PF_X, #8 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 + 10: + rshrn v30.8b, v2.8h, #8 +- PF beq 10f +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + rshrn v31.8b, v3.8h, #8 +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + umull v0.8h, v24.8b, v16.8b +- PF lsl DUMMY, PF_X, mask_bpp_shift +- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++ PF lsl, DUMMY, PF_X, mask_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] + umull v1.8h, v25.8b, v16.8b +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v2.8h, v26.8b, v16.8b +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v3.8h, v27.8b, v16.8b +- PF ble 10f +- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift +- PF ldrsb DUMMY, [PF_MASK, DUMMY] +- PF add PF_MASK, PF_MASK, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb, DUMMY, [PF_MASK, DUMMY] ++ PF add, PF_MASK, PF_MASK, #1 + 10: + st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 + ursra v0.8h, v0.8h, #8 + ursra v1.8h, v1.8h, #8 + ursra v2.8h, v2.8h, #8 + ursra v3.8h, v3.8h, #8 + .endm + +@@ -1594,54 +1594,54 @@ generate_composite_function \ + .endm + + .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head + urshr v16.8h, v12.8h, #8 + ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 + urshr v17.8h, v13.8h, #8 + fetch_mask_pixblock + urshr v18.8h, v14.8h, #8 +- PF add PF_X, PF_X, #8 ++ PF add, PF_X, PF_X, #8 + urshr v19.8h, v15.8h, #8 +- PF tst PF_CTL, #0x0F ++ PF tst, PF_CTL, #0x0F + raddhn v28.8b, v16.8h, v12.8h +- PF beq 10f +- PF add PF_X, PF_X, #8 ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 + 10: + raddhn v29.8b, v17.8h, v13.8h +- PF beq 10f +- PF sub PF_CTL, PF_CTL, #1 ++ PF beq, 10f ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v30.8b, v18.8h, v14.8h +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + raddhn v31.8b, v19.8h, v15.8h +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + umull v16.8h, v24.8b, v8.8b +- PF lsl DUMMY, PF_X, #mask_bpp_shift +- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++ PF lsl, 
DUMMY, PF_X, #mask_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] + umull v17.8h, v24.8b, v9.8b +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W + 10: + umull v18.8h, v24.8b, v10.8b +- PF ble 10f +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF subs, PF_CTL, PF_CTL, #0x10 + 10: + umull v19.8h, v24.8b, v11.8b +- PF ble 10f +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + 10: + uqadd v28.8b, v0.8b, v28.8b +- PF ble 10f +- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift +- PF ldrsb DUMMY, [PF_MASK, DUMMY] +- PF add PF_MASK, PF_MASK, #1 ++ PF ble, 10f ++ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb, DUMMY, [PF_MASK, DUMMY] ++ PF add, PF_MASK, PF_MASK, #1 + 10: + uqadd v29.8b, v1.8b, v29.8b + uqadd v30.8b, v2.8b, v30.8b + uqadd v31.8b, v3.8b, v31.8b + urshr v12.8h, v16.8h, #8 + urshr v13.8h, v17.8h, #8 + urshr v14.8h, v18.8h, #8 + urshr v15.8h, v19.8h, #8 +@@ -2407,17 +2407,17 @@ generate_composite_function \ + generate_composite_function_single_scanline \ + pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ + pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ +- pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ ++ pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + + /******************************************************************************/ + + .macro pixman_composite_over_8888_n_8888_process_pixblock_head +@@ -2482,31 +2482,31 @@ generate_composite_function \ + pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ +- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ ++ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + + generate_composite_function_single_scanline \ + pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ +- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ ++ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 12 /* mask_basereg */ + + /******************************************************************************/ + + /* TODO: expand macros and do better instructions scheduling */ +@@ -2524,17 +2524,17 @@ generate_composite_function \ + 
pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init_need_all_regs, \ + default_cleanup_need_all_regs, \ + pixman_composite_over_8888_n_8888_process_pixblock_head, \ + pixman_composite_over_8888_n_8888_process_pixblock_tail, \ +- pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ ++ pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \ + 28, /* dst_w_basereg */ \ + 4, /* dst_r_basereg */ \ + 0, /* src_basereg */ \ + 15 /* mask_basereg */ + + /******************************************************************************/ + + .macro pixman_composite_src_0888_0888_process_pixblock_head +@@ -2675,38 +2675,38 @@ generate_composite_function \ + urshr v11.8h, v8.8h, #8 + mov v30.8b, v31.8b + mov v31.8b, v3.8b + mov v3.8b, v31.8b + urshr v12.8h, v9.8h, #8 + urshr v13.8h, v10.8h, #8 + fetch_src_pixblock + raddhn v30.8b, v11.8h, v8.8h +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v29.8b, v12.8h, v9.8h + raddhn v28.8b, v13.8h, v10.8h + umull v8.8h, v3.8b, v0.8b + umull v9.8h, v3.8b, v1.8b + umull v10.8h, v3.8b, v2.8b + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF cmp PF_X, ORIG_W +- PF lsl DUMMY, PF_X, src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF cmp, PF_X, ORIG_W ++ PF lsl, DUMMY, PF_X, src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + .endm + + generate_composite_function \ + pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ +@@ -2744,38 +2744,38 @@ generate_composite_function \ + urshr v11.8h, v8.8h, #8 + mov v30.8b, v31.8b + mov v31.8b, v3.8b + mov v3.8b, v30.8b + urshr v12.8h, v9.8h, #8 + urshr v13.8h, v10.8h, #8 + fetch_src_pixblock + raddhn v28.8b, v11.8h, v8.8h +- PF add PF_X, PF_X, #8 +- PF tst PF_CTL, #0xF +- PF beq 10f +- PF add PF_X, PF_X, #8 +- PF sub PF_CTL, PF_CTL, #1 ++ PF add, PF_X, PF_X, #8 ++ PF tst, PF_CTL, #0xF ++ PF beq, 10f ++ PF add, PF_X, PF_X, #8 ++ PF sub, PF_CTL, PF_CTL, #1 + 10: + raddhn v29.8b, v12.8h, v9.8h + raddhn v30.8b, v13.8h, v10.8h + umull v8.8h, v3.8b, v0.8b + umull v9.8h, v3.8b, v1.8b + umull v10.8h, v3.8b, v2.8b + st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 +- PF cmp PF_X, ORIG_W +- PF lsl DUMMY, PF_X, src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] +- PF ble 10f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 +- PF ble 10f +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF cmp, PF_X, ORIG_W ++ PF lsl, DUMMY, PF_X, src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF ble, 10f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 ++ PF ble, 10f ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift 
++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + 10: + .endm + + generate_composite_function \ + pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ + FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 10, /* prefetch distance */ \ +@@ -3126,197 +3126,197 @@ generate_composite_function_nearest_scan + * format conversion, and interpolation as separate macros which can be used + * as the basic building blocks for constructing bilinear scanline functions. + */ + + .macro bilinear_load_8888 reg1, reg2, tmp + asr TMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #2 +- ld1 {®1&.2s}, [TMP1], STRIDE +- ld1 {®2&.2s}, [TMP1] ++ ld1 {\()\reg1\().2s}, [TMP1], STRIDE ++ ld1 {\()\reg2\().2s}, [TMP1] + .endm + + .macro bilinear_load_0565 reg1, reg2, tmp + asr TMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 +- ld1 {®2&.s}[0], [TMP1], STRIDE +- ld1 {®2&.s}[1], [TMP1] +- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp ++ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE ++ ld1 {\()\reg2\().s}[1], [TMP1] ++ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp + .endm + + .macro bilinear_load_and_vertical_interpolate_two_8888 \ + acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 + +- bilinear_load_8888 reg1, reg2, tmp1 +- umull &acc1&.8h, ®1&.8b, v28.8b +- umlal &acc1&.8h, ®2&.8b, v29.8b +- bilinear_load_8888 reg3, reg4, tmp2 +- umull &acc2&.8h, ®3&.8b, v28.8b +- umlal &acc2&.8h, ®4&.8b, v29.8b ++ bilinear_load_8888 \reg1, \reg2, \tmp1 ++ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b ++ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b ++ bilinear_load_8888 \reg3, \reg4, \tmp2 ++ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b ++ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b + .endm + + .macro bilinear_load_and_vertical_interpolate_four_8888 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + + bilinear_load_and_vertical_interpolate_two_8888 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi ++ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi + bilinear_load_and_vertical_interpolate_two_8888 \ +- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi + .endm + + .macro vzip reg1, reg2 + umov TMP4, v31.d[0] +- zip1 v31.8b, reg1, reg2 +- zip2 reg2, reg1, reg2 +- mov reg1, v31.8b ++ zip1 v31.8b, \reg1, \reg2 ++ zip2 \reg2, \reg1, \reg2 ++ mov \reg1, v31.8b + mov v31.d[0], TMP4 + .endm + + .macro vuzp reg1, reg2 + umov TMP4, v31.d[0] +- uzp1 v31.8b, reg1, reg2 +- uzp2 reg2, reg1, reg2 +- mov reg1, v31.8b ++ uzp1 v31.8b, \reg1, \reg2 ++ uzp2 \reg2, \reg1, \reg2 ++ mov \reg1, v31.8b + mov v31.d[0], TMP4 + .endm + + .macro bilinear_load_and_vertical_interpolate_two_0565 \ + acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi + asr TMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr TMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&acc2&.s}[0], [TMP1], STRIDE +- ld1 {&acc2&.s}[2], [TMP2], STRIDE +- ld1 {&acc2&.s}[1], [TMP1] +- ld1 {&acc2&.s}[3], [TMP2] +- convert_0565_to_x888 acc2, reg3, reg2, reg1 +- vzip ®1&.8b, ®3&.8b +- vzip ®2&.8b, ®4&.8b +- vzip ®3&.8b, ®4&.8b +- vzip ®1&.8b, ®2&.8b +- umull &acc1&.8h, ®1&.8b, v28.8b +- umlal &acc1&.8h, ®2&.8b, v29.8b +- umull &acc2&.8h, ®3&.8b, v28.8b +- umlal &acc2&.8h, ®4&.8b, v29.8b ++ ld1 {\()\acc2\().s}[0], [TMP1], 
STRIDE ++ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE ++ ld1 {\()\acc2\().s}[1], [TMP1] ++ ld1 {\()\acc2\().s}[3], [TMP2] ++ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 ++ vzip \()\reg1\().8b, \()\reg3\().8b ++ vzip \()\reg2\().8b, \()\reg4\().8b ++ vzip \()\reg3\().8b, \()\reg4\().8b ++ vzip \()\reg1\().8b, \()\reg2\().8b ++ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b ++ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b ++ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b ++ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b + .endm + + .macro bilinear_load_and_vertical_interpolate_four_0565 \ +- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \ + yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + asr TMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr TMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&xacc2&.s}[0], [TMP1], STRIDE +- ld1 {&xacc2&.s}[2], [TMP2], STRIDE +- ld1 {&xacc2&.s}[1], [TMP1] +- ld1 {&xacc2&.s}[3], [TMP2] +- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 ++ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE ++ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE ++ ld1 {\()\xacc2\().s}[1], [TMP1] ++ ld1 {\()\xacc2\().s}[3], [TMP2] ++ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 + asr TMP1, X, #16 + add X, X, UX + add TMP1, TOP, TMP1, lsl #1 + asr TMP2, X, #16 + add X, X, UX + add TMP2, TOP, TMP2, lsl #1 +- ld1 {&yacc2&.s}[0], [TMP1], STRIDE +- vzip &xreg1&.8b, &xreg3&.8b +- ld1 {&yacc2&.s}[2], [TMP2], STRIDE +- vzip &xreg2&.8b, &xreg4&.8b +- ld1 {&yacc2&.s}[1], [TMP1] +- vzip &xreg3&.8b, &xreg4&.8b +- ld1 {&yacc2&.s}[3], [TMP2] +- vzip &xreg1&.8b, &xreg2&.8b +- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 +- umull &xacc1&.8h, &xreg1&.8b, v28.8b +- vzip &yreg1&.8b, &yreg3&.8b +- umlal &xacc1&.8h, &xreg2&.8b, v29.8b +- vzip &yreg2&.8b, &yreg4&.8b +- umull &xacc2&.8h, &xreg3&.8b, v28.8b +- vzip &yreg3&.8b, &yreg4&.8b +- umlal &xacc2&.8h, &xreg4&.8b, v29.8b +- vzip &yreg1&.8b, &yreg2&.8b +- umull &yacc1&.8h, &yreg1&.8b, v28.8b +- umlal &yacc1&.8h, &yreg2&.8b, v29.8b +- umull &yacc2&.8h, &yreg3&.8b, v28.8b +- umlal &yacc2&.8h, &yreg4&.8b, v29.8b ++ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE ++ vzip \()\xreg1\().8b, \()\xreg3\().8b ++ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE ++ vzip \()\xreg2\().8b, \()\xreg4\().8b ++ ld1 {\()\yacc2\().s}[1], [TMP1] ++ vzip \()\xreg3\().8b, \()\xreg4\().8b ++ ld1 {\()\yacc2\().s}[3], [TMP2] ++ vzip \()\xreg1\().8b, \()\xreg2\().8b ++ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 ++ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b ++ vzip \()\yreg1\().8b, \()\yreg3\().8b ++ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b ++ vzip \()\yreg2\().8b, \()\yreg4\().8b ++ umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b ++ vzip \()\yreg3\().8b, \()\yreg4\().8b ++ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b ++ vzip \()\yreg1\().8b, \()\yreg2\().8b ++ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b ++ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b ++ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b ++ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b + .endm + + .macro bilinear_store_8888 numpix, tmp1, tmp2 +-.if numpix == 4 ++.if \numpix == 4 + st1 {v0.2s, v1.2s}, [OUT], #16 +-.elseif numpix == 2 ++.elseif \numpix == 2 + st1 {v0.2s}, [OUT], #8 +-.elseif numpix == 1 ++.elseif \numpix == 1 + st1 {v0.s}[0], [OUT], #4 + .else +- .error bilinear_store_8888 numpix is unsupported ++ .error bilinear_store_8888 \numpix is unsupported + .endif + .endm + + .macro bilinear_store_0565 
numpix, tmp1, tmp2 + vuzp v0.8b, v1.8b + vuzp v2.8b, v3.8b + vuzp v1.8b, v3.8b + vuzp v0.8b, v2.8b +- convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 +-.if numpix == 4 ++ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2 ++.if \numpix == 4 + st1 {v1.4h}, [OUT], #8 +-.elseif numpix == 2 ++.elseif \numpix == 2 + st1 {v1.s}[0], [OUT], #4 +-.elseif numpix == 1 ++.elseif \numpix == 1 + st1 {v1.h}[0], [OUT], #2 + .else +- .error bilinear_store_0565 numpix is unsupported ++ .error bilinear_store_0565 \numpix is unsupported + .endif + .endm + + .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt +- bilinear_load_&src_fmt v0, v1, v2 ++ bilinear_load_\()\src_fmt v0, v1, v2 + umull v2.8h, v0.8b, v28.8b + umlal v2.8h, v1.8b, v29.8b + /* 5 cycles bubble */ + ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v2.4h, v15.h[0] + umlal2 v0.4s, v2.8h, v15.h[0] + /* 5 cycles bubble */ + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + /* 3 cycles bubble */ + xtn v0.8b, v0.8h + /* 1 cycle bubble */ +- bilinear_store_&dst_fmt 1, v3, v4 ++ bilinear_store_\()\dst_fmt 1, v3, v4 + .endm + + .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt +- bilinear_load_and_vertical_interpolate_two_&src_fmt \ ++ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ + v1, v11, v2, v3, v20, v21, v22, v23 + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v10.4s, v11.4h, v15.h[4] + umlal2 v10.4s, v11.8h, v15.h[4] + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + xtn v0.8b, v0.8h +- bilinear_store_&dst_fmt 2, v3, v4 ++ bilinear_store_\()\dst_fmt 2, v3, v4 + .endm + + .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt +- bilinear_load_and_vertical_interpolate_four_&src_fmt \ +- v1, v11, v14, v20, v16, v17, v22, v23 \ ++ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ ++ v1, v11, v14, v20, v16, v17, v22, v23, \ + v3, v9, v24, v25, v26, v27, v18, v19 + prfm PREFETCH_MODE, [TMP1, PF_OFFS] + sub TMP1, TMP1, STRIDE + ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v0.4s, v1.4h, v15.h[0] + umlal2 v0.4s, v1.8h, v15.h[0] + ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS + umlsl v10.4s, v11.4h, v15.h[4] +@@ -3333,64 +3333,64 @@ generate_composite_function_nearest_scan + shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + xtn v0.8b, v0.8h + xtn v1.8b, v2.8h + add v12.8h, v12.8h, v13.8h +- bilinear_store_&dst_fmt 4, v3, v4 ++ bilinear_store_\()\dst_fmt 4, v3, v4 + .endm + + .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head ++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head + .else +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +- 
bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail ++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail + .endif + .endm + + .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head ++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head + .else +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head ++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head + .else +- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail ++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail + .else +- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt + .endif + .endm + + .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt +-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt +- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head ++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt ++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head + .else +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt + .endif + .endm + + .set BILINEAR_FLAG_UNROLL_4, 0 + .set BILINEAR_FLAG_UNROLL_8, 1 + .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 + + /* +@@ -3405,17 +3405,17 @@ generate_composite_function_nearest_scan + * prefetch_distance - prefetch in the source image by that many + * pixels ahead + */ + + .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ + src_bpp_shift, dst_bpp_shift, \ + prefetch_distance, flags + +-pixman_asm_function fname ++pixman_asm_function \fname + OUT .req x0 + TOP .req x1 + BOTTOM .req x2 + WT .req x3 + WB .req x4 + X .req x5 + UX .req x6 + WIDTH .req x7 +@@ -3437,17 +3437,17 @@ pixman_asm_function fname + sub sp, sp, 112 /* push all registers */ + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 + stp x8, x9, [x29, -80] + stp x10, x11, [x29, -96] + stp x12, x13, [x29, -112] + +- mov PF_OFFS, #prefetch_distance ++ mov PF_OFFS, #\prefetch_distance + mul PF_OFFS, PF_OFFS, UX + + subs STRIDE, BOTTOM, TOP + .unreq BOTTOM + + cmp WIDTH, #0 + ble 300f + +@@ -3458,85 +3458,85 @@ pixman_asm_function fname + mov v25.d[0], 
v12.d[1] + mov v26.d[0], v13.d[0] + add v25.4h, v25.4h, v26.4h + mov v12.d[1], v25.d[0] + + /* ensure good destination alignment */ + cmp WIDTH, #1 + blt 100f +- tst OUT, #(1 << dst_bpp_shift) ++ tst OUT, #(1 << \dst_bpp_shift) + beq 100f + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h +- bilinear_interpolate_last_pixel src_fmt, dst_fmt ++ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt + sub WIDTH, WIDTH, #1 + 100: + add v13.8h, v13.8h, v13.8h + ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) + add v12.8h, v12.8h, v13.8h + + cmp WIDTH, #2 + blt 100f +- tst OUT, #(1 << (dst_bpp_shift + 1)) ++ tst OUT, #(1 << (\dst_bpp_shift + 1)) + beq 100f +- bilinear_interpolate_two_pixels src_fmt, dst_fmt ++ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt + sub WIDTH, WIDTH, #2 + 100: +-.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 ++.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0 + /*********** 8 pixels per iteration *****************/ + cmp WIDTH, #4 + blt 100f +- tst OUT, #(1 << (dst_bpp_shift + 2)) ++ tst OUT, #(1 << (\dst_bpp_shift + 2)) + beq 100f +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + sub WIDTH, WIDTH, #4 + 100: + subs WIDTH, WIDTH, #8 + blt 100f +- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) +- bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt ++ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) ++ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #8 + blt 500f + 1000: +- bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #8 + bge 1000b + 500: +- bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt ++ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt + 100: + tst WIDTH, #4 + beq 200f +- bilinear_interpolate_four_pixels src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt + 200: + .else + /*********** 4 pixels per iteration *****************/ + subs WIDTH, WIDTH, #4 + blt 100f +- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) +- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt ++ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) ++ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #4 + blt 500f + 1000: +- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt + subs WIDTH, WIDTH, #4 + bge 1000b + 500: +- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt ++ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt + 100: + /****************************************************/ + .endif + /* handle the remaining trailing pixels */ + tst WIDTH, #2 + beq 200f +- bilinear_interpolate_two_pixels src_fmt, dst_fmt ++ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt + 200: + tst WIDTH, #1 + beq 300f +- bilinear_interpolate_last_pixel src_fmt, dst_fmt ++ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt + 300: + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -104] + mov sp, x29 +@@ -3551,17 +3551,17 @@ 300: + .unreq UX + .unreq WIDTH + .unreq TMP1 + .unreq TMP2 + .unreq PF_OFFS + .unreq TMP3 + .unreq TMP4 + .unreq STRIDE +-.endfunc ++pixman_end_asm_function + + .endm + + /*****************************************************************************/ + + .set 
have_bilinear_interpolate_four_pixels_8888_8888, 1 + + .macro bilinear_interpolate_four_pixels_8888_8888_head +diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h +--- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h ++++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h +@@ -75,340 +75,340 @@ + #define PREFETCH_MODE pldl1keep + + /* + * Definitions of supplementary pixld/pixst macros (for partial load/store of + * pixel data). + */ + + .macro pixldst1 op, elem_size, reg1, mem_operand, abits +- op {v®1&.&elem_size}, [&mem_operand&], #8 ++ \op {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8 + .endm + + .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits +- op {v®1&.&elem_size, v®2&.&elem_size}, [&mem_operand&], #16 ++ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size}, [\()\mem_operand\()], #16 + .endm + + .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits +- op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size, v®4&.&elem_size}, [&mem_operand&], #32 ++ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size, v\()\reg4\().\()\elem_size}, [\()\mem_operand\()], #32 + .endm + + .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes +- op {v®1&.&elem_size}[idx], [&mem_operand&], #&bytes& ++ \op {v\()\reg1\().\()\elem_size}[\idx], [\()\mem_operand\()], #\()\bytes\() + .endm + + .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand +- op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}, [&mem_operand&], #24 ++ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}, [\()\mem_operand\()], #24 + .endm + + .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand +- op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}[idx], [&mem_operand&], #3 ++ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}[\idx], [\()\mem_operand\()], #3 + .endm + + .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits +-.if numbytes == 32 +- .if elem_size==32 +- pixldst4 op, 2s, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits +- .elseif elem_size==16 +- pixldst4 op, 4h, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits ++.if \numbytes == 32 ++ .if \elem_size==32 ++ pixldst4 \op, 2s, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits ++ .elseif \elem_size==16 ++ pixldst4 \op, 4h, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits + .else +- pixldst4 op, 8b, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits ++ pixldst4 \op, 8b, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits + .endif +-.elseif numbytes == 16 +- .if elem_size==32 +- pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits +- .elseif elem_size==16 +- pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits ++.elseif \numbytes == 16 ++ .if \elem_size==32 ++ pixldst2 \op, 2s, %(\basereg+2), %(\basereg+3), \mem_operand, \abits ++ .elseif \elem_size==16 ++ pixldst2 \op, 4h, %(\basereg+2), %(\basereg+3), \mem_operand, \abits + .else +- pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits ++ pixldst2 \op, 8b, %(\basereg+2), %(\basereg+3), \mem_operand, \abits + .endif +-.elseif numbytes == 8 +- .if elem_size==32 +- pixldst1 op, 2s, %(basereg+1), mem_operand, abits +- .elseif 
elem_size==16 +- pixldst1 op, 4h, %(basereg+1), mem_operand, abits ++.elseif \numbytes == 8 ++ .if \elem_size==32 ++ pixldst1 \op, 2s, %(\basereg+1), \mem_operand, \abits ++ .elseif \elem_size==16 ++ pixldst1 \op, 4h, %(\basereg+1), \mem_operand, \abits + .else +- pixldst1 op, 8b, %(basereg+1), mem_operand, abits ++ pixldst1 \op, 8b, %(\basereg+1), \mem_operand, \abits + .endif +-.elseif numbytes == 4 +- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) +- pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4 +- .elseif elem_size == 16 +- pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2 +- pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2 ++.elseif \numbytes == 4 ++ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32) ++ pixldst0 \op, s, %(\basereg+0), 1, \mem_operand, \abits, 4 ++ .elseif \elem_size == 16 ++ pixldst0 \op, h, %(\basereg+0), 2, \mem_operand, \abits, 2 ++ pixldst0 \op, h, %(\basereg+0), 3, \mem_operand, \abits, 2 + .else +- pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1 +- pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1 +- pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1 +- pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 4, \mem_operand, \abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 5, \mem_operand, \abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 6, \mem_operand, \abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 7, \mem_operand, \abits, 1 + .endif +-.elseif numbytes == 2 +- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) +- pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2 ++.elseif \numbytes == 2 ++ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16) ++ pixldst0 \op, h, %(\basereg+0), 1, \mem_operand, \abits, 2 + .else +- pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1 +- pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 2, \mem_operand, \abits, 1 ++ pixldst0 \op, b, %(\basereg+0), 3, \mem_operand, \abits, 1 + .endif +-.elseif numbytes == 1 +- pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1 ++.elseif \numbytes == 1 ++ pixldst0 \op, b, %(\basereg+0), 1, \mem_operand, \abits, 1 + .else +- .error "unsupported size: numbytes" ++ .error "unsupported size: \numbytes" + .endif + .endm + + .macro pixld numpix, bpp, basereg, mem_operand, abits=0 +-.if bpp > 0 +-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits +-.elseif (bpp == 24) && (numpix == 8) +- pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +-.elseif (bpp == 24) && (numpix == 4) +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +-.elseif (bpp == 24) && (numpix == 2) +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +-.elseif (bpp == 24) && (numpix == 1) +- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand ++.if \bpp > 0 ++.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ pixldst4 ld4, 8b, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits ++.elseif (\bpp == 24) && (\numpix == 8) ++ pixldst3 ld3, 8b, 
%(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand ++.elseif (\bpp == 24) && (\numpix == 4) ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 2) ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 1) ++ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand + .else +- pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits ++ pixldst %(\numpix * \bpp / 8), ld1, %(\bpp), \basereg, \mem_operand, \abits + .endif + .endif + .endm + + .macro pixst numpix, bpp, basereg, mem_operand, abits=0 +-.if bpp > 0 +-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \ +- %(basereg+6), %(basereg+7), mem_operand, abits +-.elseif (bpp == 24) && (numpix == 8) +- pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +-.elseif (bpp == 24) && (numpix == 4) +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +-.elseif (bpp == 24) && (numpix == 2) +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +-.elseif (bpp == 24) && (numpix == 1) +- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +-.elseif numpix * bpp == 32 && abits == 32 +- pixldst 4, st1, 32, basereg, mem_operand, abits +-.elseif numpix * bpp == 16 && abits == 16 +- pixldst 2, st1, 16, basereg, mem_operand, abits ++.if \bpp > 0 ++.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ pixldst4 st4, 8b, %(\basereg+4), %(\basereg+5), \ ++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits ++.elseif (\bpp == 24) && (\numpix == 8) ++ pixldst3 st3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand ++.elseif (\bpp == 24) && (\numpix == 4) ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 2) ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand ++.elseif (\bpp == 24) && (\numpix == 1) ++ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand ++.elseif \numpix * \bpp == 32 && \abits == 32 ++ pixldst 4, st1, 32, \basereg, \mem_operand, \abits ++.elseif \numpix * \bpp == 16 && \abits == 16 ++ pixldst 2, st1, 16, \basereg, \mem_operand, \abits + .else +- pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits ++ pixldst %(\numpix * \bpp / 8), st1, %(\bpp), \basereg, \mem_operand, \abits + .endif + .endif + 
.endm + + .macro pixld_a numpix, bpp, basereg, mem_operand +-.if (bpp * numpix) <= 128 +- pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) ++.if (\bpp * \numpix) <= 128 ++ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) + .else +- pixld numpix, bpp, basereg, mem_operand, 128 ++ pixld \numpix, \bpp, \basereg, \mem_operand, 128 + .endif + .endm + + .macro pixst_a numpix, bpp, basereg, mem_operand +-.if (bpp * numpix) <= 128 +- pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) ++.if (\bpp * \numpix) <= 128 ++ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) + .else +- pixst numpix, bpp, basereg, mem_operand, 128 ++ pixst \numpix, \bpp, \basereg, \mem_operand, 128 + .endif + .endm + + /* + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register + * aliases to be defined) + */ + .macro pixld1_s elem_size, reg1, mem_operand +-.if elem_size == 16 ++.if \elem_size == 16 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP1, mem_operand, TMP1, lsl #1 ++ add TMP1, \mem_operand, TMP1, lsl #1 + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP2, mem_operand, TMP2, lsl #1 +- ld1 {v®1&.h}[0], [TMP1] ++ add TMP2, \mem_operand, TMP2, lsl #1 ++ ld1 {v\()\reg1\().h}[0], [TMP1] + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP1, mem_operand, TMP1, lsl #1 +- ld1 {v®1&.h}[1], [TMP2] ++ add TMP1, \mem_operand, TMP1, lsl #1 ++ ld1 {v\()\reg1\().h}[1], [TMP2] + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP2, mem_operand, TMP2, lsl #1 +- ld1 {v®1&.h}[2], [TMP1] +- ld1 {v®1&.h}[3], [TMP2] +-.elseif elem_size == 32 ++ add TMP2, \mem_operand, TMP2, lsl #1 ++ ld1 {v\()\reg1\().h}[2], [TMP1] ++ ld1 {v\()\reg1\().h}[3], [TMP2] ++.elseif \elem_size == 32 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP1, mem_operand, TMP1, lsl #2 ++ add TMP1, \mem_operand, TMP1, lsl #2 + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP2, mem_operand, TMP2, lsl #2 +- ld1 {v®1&.s}[0], [TMP1] +- ld1 {v®1&.s}[1], [TMP2] ++ add TMP2, \mem_operand, TMP2, lsl #2 ++ ld1 {v\()\reg1\().s}[0], [TMP1] ++ ld1 {v\()\reg1\().s}[1], [TMP2] + .else + .error "unsupported" + .endif + .endm + + .macro pixld2_s elem_size, reg1, reg2, mem_operand +-.if 0 /* elem_size == 32 */ ++.if 0 /* \elem_size == 32 */ + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 +- add TMP1, mem_operand, TMP1, asl #2 ++ add TMP1, \mem_operand, TMP1, asl #2 + mov TMP2, VX, asr #16 + sub VX, VX, UNIT_X +- add TMP2, mem_operand, TMP2, asl #2 +- ld1 {v®1&.s}[0], [TMP1] ++ add TMP2, \mem_operand, TMP2, asl #2 ++ ld1 {v\()\reg1\().s}[0], [TMP1] + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 +- add TMP1, mem_operand, TMP1, asl #2 +- ld1 {v®2&.s}[0], [TMP2, :32] ++ add TMP1, \mem_operand, TMP1, asl #2 ++ ld1 {v\()\reg2\().s}[0], [TMP2, :32] + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X +- add TMP2, mem_operand, TMP2, asl #2 +- ld1 {v®1&.s}[1], [TMP1] +- ld1 {v®2&.s}[1], [TMP2] ++ add TMP2, \mem_operand, TMP2, asl #2 ++ ld1 {v\()\reg1\().s}[1], [TMP1] ++ ld1 {v\()\reg2\().s}[1], [TMP2] + .else +- pixld1_s elem_size, reg1, mem_operand +- pixld1_s elem_size, reg2, mem_operand ++ pixld1_s \elem_size, \reg1, \mem_operand ++ pixld1_s \elem_size, \reg2, \mem_operand 
+ .endif + .endm + + .macro pixld0_s elem_size, reg1, idx, mem_operand +-.if elem_size == 16 ++.if \elem_size == 16 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP1, mem_operand, TMP1, lsl #1 +- ld1 {v®1&.h}[idx], [TMP1] +-.elseif elem_size == 32 ++ add TMP1, \mem_operand, TMP1, lsl #1 ++ ld1 {v\()\reg1\().h}[\idx], [TMP1] ++.elseif \elem_size == 32 + asr DUMMY, VX, #16 + mov TMP1, DUMMY + adds VX, VX, UNIT_X + bmi 55f + 5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b + 55: +- add TMP1, mem_operand, TMP1, lsl #2 +- ld1 {v®1&.s}[idx], [TMP1] ++ add TMP1, \mem_operand, TMP1, lsl #2 ++ ld1 {v\()\reg1\().s}[\idx], [TMP1] + .endif + .endm + + .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand +-.if numbytes == 32 +- pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand +- pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand +- pixdeinterleave elem_size, %(basereg+4) +-.elseif numbytes == 16 +- pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand +-.elseif numbytes == 8 +- pixld1_s elem_size, %(basereg+1), mem_operand +-.elseif numbytes == 4 +- .if elem_size == 32 +- pixld0_s elem_size, %(basereg+0), 1, mem_operand +- .elseif elem_size == 16 +- pixld0_s elem_size, %(basereg+0), 2, mem_operand +- pixld0_s elem_size, %(basereg+0), 3, mem_operand ++.if \numbytes == 32 ++ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand ++ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand ++ pixdeinterleave \elem_size, %(\basereg+4) ++.elseif \numbytes == 16 ++ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand ++.elseif \numbytes == 8 ++ pixld1_s \elem_size, %(\basereg+1), \mem_operand ++.elseif \numbytes == 4 ++ .if \elem_size == 32 ++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand ++ .elseif \elem_size == 16 ++ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand + .else +- pixld0_s elem_size, %(basereg+0), 4, mem_operand +- pixld0_s elem_size, %(basereg+0), 5, mem_operand +- pixld0_s elem_size, %(basereg+0), 6, mem_operand +- pixld0_s elem_size, %(basereg+0), 7, mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand + .endif +-.elseif numbytes == 2 +- .if elem_size == 16 +- pixld0_s elem_size, %(basereg+0), 1, mem_operand ++.elseif \numbytes == 2 ++ .if \elem_size == 16 ++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand + .else +- pixld0_s elem_size, %(basereg+0), 2, mem_operand +- pixld0_s elem_size, %(basereg+0), 3, mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand ++ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand + .endif +-.elseif numbytes == 1 +- pixld0_s elem_size, %(basereg+0), 1, mem_operand ++.elseif \numbytes == 1 ++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand + .else +- .error "unsupported size: numbytes" ++ .error "unsupported size: \numbytes" + .endif + .endm + + .macro pixld_s numpix, bpp, basereg, mem_operand +-.if bpp > 0 +- pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand ++.if \bpp > 0 ++ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand + .endif + .endm + + .macro vuzp8 reg1, reg2 + umov DUMMY, v16.d[0] +- uzp1 v16.8b, v®1&.8b, v®2&.8b +- uzp2 v®2&.8b, v®1&.8b, v®2&.8b +- mov v®1&.8b, v16.8b ++ uzp1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b ++ uzp2 
v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b ++ mov v\()\reg1\().8b, v16.8b + mov v16.d[0], DUMMY + .endm + + .macro vzip8 reg1, reg2 + umov DUMMY, v16.d[0] +- zip1 v16.8b, v®1&.8b, v®2&.8b +- zip2 v®2&.8b, v®1&.8b, v®2&.8b +- mov v®1&.8b, v16.8b ++ zip1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b ++ zip2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b ++ mov v\()\reg1\().8b, v16.8b + mov v16.d[0], DUMMY + .endm + + /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ + .macro pixdeinterleave bpp, basereg +-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- vuzp8 %(basereg+0), %(basereg+1) +- vuzp8 %(basereg+2), %(basereg+3) +- vuzp8 %(basereg+1), %(basereg+3) +- vuzp8 %(basereg+0), %(basereg+2) ++.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ vuzp8 %(\basereg+0), %(\basereg+1) ++ vuzp8 %(\basereg+2), %(\basereg+3) ++ vuzp8 %(\basereg+1), %(\basereg+3) ++ vuzp8 %(\basereg+0), %(\basereg+2) + .endif + .endm + + /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ + .macro pixinterleave bpp, basereg +-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) +- vzip8 %(basereg+0), %(basereg+2) +- vzip8 %(basereg+1), %(basereg+3) +- vzip8 %(basereg+2), %(basereg+3) +- vzip8 %(basereg+0), %(basereg+1) ++.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ vzip8 %(\basereg+0), %(\basereg+2) ++ vzip8 %(\basereg+1), %(\basereg+3) ++ vzip8 %(\basereg+2), %(\basereg+3) ++ vzip8 %(\basereg+0), %(\basereg+1) + .endif + .endm + + /* + * This is a macro for implementing cache preload. The main idea is that + * cache preload logic is mostly independent from the rest of pixels + * processing code. It starts at the top left pixel and moves forward + * across pixels and can jump across scanlines. Prefetch distance is +@@ -432,62 +432,62 @@ 55: + * for almost zero cost! + * + * (*) The overhead of the prefetcher is visible when running some trivial + * pixels processing like simple copy. Anyway, having prefetch is a must + * when working with the graphics data. 
+ */ + .macro PF a, x:vararg + .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) +- a x ++ \a \x + .endif + .endm + + .macro cache_preload std_increment, boost_increment + .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) +-.if std_increment != 0 +- PF add PF_X, PF_X, #std_increment ++.if \std_increment != 0 ++ PF add, PF_X, PF_X, #\std_increment + .endif +- PF tst PF_CTL, #0xF +- PF beq 71f +- PF add PF_X, PF_X, #boost_increment +- PF sub PF_CTL, PF_CTL, #1 ++ PF tst, PF_CTL, #0xF ++ PF beq, 71f ++ PF add, PF_X, PF_X, #\boost_increment ++ PF sub, PF_CTL, PF_CTL, #1 + 71: +- PF cmp PF_X, ORIG_W ++ PF cmp, PF_X, ORIG_W + .if src_bpp_shift >= 0 +- PF lsl DUMMY, PF_X, #src_bpp_shift +- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl, DUMMY, PF_X, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] + .endif + .if dst_r_bpp != 0 +- PF lsl DUMMY, PF_X, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF lsl, DUMMY, PF_X, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY] + .endif + .if mask_bpp_shift >= 0 +- PF lsl DUMMY, PF_X, #mask_bpp_shift +- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++ PF lsl, DUMMY, PF_X, #mask_bpp_shift ++ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY] + .endif +- PF ble 71f +- PF sub PF_X, PF_X, ORIG_W +- PF subs PF_CTL, PF_CTL, #0x10 ++ PF ble, 71f ++ PF sub, PF_X, PF_X, ORIG_W ++ PF subs, PF_CTL, PF_CTL, #0x10 + 71: +- PF ble 72f ++ PF ble, 72f + .if src_bpp_shift >= 0 +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF ldrsb DUMMY, [PF_SRC, DUMMY] +- PF add PF_SRC, PF_SRC, #1 ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb, DUMMY, [PF_SRC, DUMMY] ++ PF add, PF_SRC, PF_SRC, #1 + .endif + .if dst_r_bpp != 0 +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF ldrsb DUMMY, [PF_DST, DUMMY] +- PF add PF_DST, PF_DST, #1 ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb, DUMMY, [PF_DST, DUMMY] ++ PF add, PF_DST, PF_DST, #1 + .endif + .if mask_bpp_shift >= 0 +- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift +- PF ldrsb DUMMY, [PF_MASK, DUMMY] +- PF add PF_MASK, PF_MASK, #1 ++ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb, DUMMY, [PF_MASK, DUMMY] ++ PF add, PF_MASK, PF_MASK, #1 + .endif + 72: + .endif + .endm + + .macro cache_preload_simple + .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) + .if src_bpp > 0 +@@ -516,56 +516,56 @@ 72: + process_pixblock_tail, \ + process_pixblock_tail_head + .if dst_w_bpp != 24 + tst DST_R, #0xF + beq 52f + + .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 + .irp lowbit, 1, 2, 4, 8, 16 +-local skip1 +-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +-.if lowbit < 16 /* we don't need more than 16-byte alignment */ +- tst DST_R, #lowbit ++ ++.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) ++.if \lowbit < 16 /* we don't need more than 16-byte alignment */ ++ tst DST_R, #\lowbit + beq 51f + .endif +- pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC +- pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK ++ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC ++ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK + .if dst_r_bpp > 0 +- pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R ++ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R + .else +- add DST_R, DST_R, #lowbit ++ add DST_R, DST_R, #\lowbit + .endif +- PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) +- sub W, W, #(lowbit * 8 / dst_w_bpp) ++ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp) ++ 
sub W, W, #(\lowbit * 8 / dst_w_bpp) + 51: + .endif + .endr + .endif + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + +- process_pixblock_head ++ \process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple +- process_pixblock_tail ++ \process_pixblock_tail + + pixinterleave dst_w_bpp, dst_w_basereg + + .irp lowbit, 1, 2, 4, 8, 16 +-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +-.if lowbit < 16 /* we don't need more than 16-byte alignment */ +- tst DST_W, #lowbit ++.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) ++.if \lowbit < 16 /* we don't need more than 16-byte alignment */ ++ tst DST_W, #\lowbit + beq 51f + .endif + .if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0 +- sub W, W, #(lowbit * 8 / dst_w_bpp) ++ sub W, W, #(\lowbit * 8 / dst_w_bpp) + .endif +- pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W ++ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W + 51: + .endif + .endr + .endif + 52: + .endm + + /* +@@ -587,52 +587,52 @@ 52: + dst_aligned_flag, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + tst W, #(pixblock_size - 1) + beq 52f + .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 + .irp chunk_size, 16, 8, 4, 2, 1 +-.if pixblock_size > chunk_size +- tst W, #chunk_size ++.if pixblock_size > \chunk_size ++ tst W, #\chunk_size + beq 51f +- pixld_src chunk_size, src_bpp, src_basereg, SRC +- pixld chunk_size, mask_bpp, mask_basereg, MASK +-.if dst_aligned_flag != 0 +- pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R ++ pixld_src \chunk_size, src_bpp, src_basereg, SRC ++ pixld \chunk_size, mask_bpp, mask_basereg, MASK ++.if \dst_aligned_flag != 0 ++ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R + .else +- pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R ++ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R + .endif +-.if cache_preload_flag != 0 +- PF add PF_X, PF_X, #chunk_size ++.if \cache_preload_flag != 0 ++ PF add, PF_X, PF_X, #\chunk_size + .endif + 51: + .endif + .endr + .endif + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + +- process_pixblock_head +-.if cache_preload_flag != 0 ++ \process_pixblock_head ++.if \cache_preload_flag != 0 + cache_preload 0, pixblock_size + cache_preload_simple + .endif +- process_pixblock_tail ++ \process_pixblock_tail + pixinterleave dst_w_bpp, dst_w_basereg + .irp chunk_size, 16, 8, 4, 2, 1 +-.if pixblock_size > chunk_size +- tst W, #chunk_size ++.if pixblock_size > \chunk_size ++ tst W, #\chunk_size + beq 51f +-.if dst_aligned_flag != 0 +- pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W ++.if \dst_aligned_flag != 0 ++ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W + .else +- pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W ++ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W + .endif + 51: + .endif + .endr + 52: + .endm + + /* +@@ -655,17 +655,17 @@ 52: + .if (src_bpp != 24) && (src_bpp != 0) + sub SRC, SRC, W, lsl #src_bpp_shift + .endif + .if (mask_bpp != 24) && (mask_bpp != 0) + sub MASK, MASK, W, lsl #mask_bpp_shift + .endif + subs H, H, #1 + mov DST_R, DST_W +- bge start_of_loop_label ++ bge \start_of_loop_label + .endm + + /* + * Registers are allocated in the following way by default: + * v0, v1, v2, v3 - reserved for loading source pixel data + * v4, v5, v6, v7 - reserved for loading destination pixel data 
+ * v24, v25, v26, v27 - reserved for loading mask pixel data + * v28, v29, v30, v31 - final destination pixel data for writeback to memory +@@ -682,17 +682,17 @@ 52: + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + +- pixman_asm_function fname ++ pixman_asm_function \fname + stp x29, x30, [sp, -16]! + mov x29, sp + sub sp, sp, 232 /* push all registers */ + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 + stp x8, x9, [x29, -80] + stp x10, x11, [x29, -96] +@@ -707,38 +707,38 @@ 52: + str x28, [x29, -232] + + /* + * Select prefetch type for this function. If prefetch distance is + * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch + * has to be used instead of ADVANCED. + */ + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT +-.if prefetch_distance == 0 ++.if \prefetch_distance == 0 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ +- ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) ++ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24)) + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE + .endif + + /* + * Make some macro arguments globally visible and accessible + * from other macros + */ +- .set src_bpp, src_bpp_ +- .set mask_bpp, mask_bpp_ +- .set dst_w_bpp, dst_w_bpp_ +- .set pixblock_size, pixblock_size_ +- .set dst_w_basereg, dst_w_basereg_ +- .set dst_r_basereg, dst_r_basereg_ +- .set src_basereg, src_basereg_ +- .set mask_basereg, mask_basereg_ ++ .set src_bpp, \src_bpp_ ++ .set mask_bpp, \mask_bpp_ ++ .set dst_w_bpp, \dst_w_bpp_ ++ .set pixblock_size, \pixblock_size_ ++ .set dst_w_basereg, \dst_w_basereg_ ++ .set dst_r_basereg, \dst_r_basereg_ ++ .set src_basereg, \src_basereg_ ++ .set mask_basereg, \mask_basereg_ + + .macro pixld_src x:vararg +- pixld x ++ pixld \x + .endm + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm + /* + * Assign symbolic names to registers + */ +@@ -805,32 +805,32 @@ 52: + .elseif dst_w_bpp == 16 + .set dst_bpp_shift, 1 + .elseif dst_w_bpp == 8 + .set dst_bpp_shift, 0 + .else + .error "requested dst bpp (dst_w_bpp) is not supported" + .endif + +-.if (((flags) & FLAG_DST_READWRITE) != 0) ++.if (((\flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp + .else + .set dst_r_bpp, 0 + .endif +-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) ++.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 + .else + .set DEINTERLEAVE_32BPP_ENABLED, 0 + .endif + +-.if prefetch_distance < 0 || prefetch_distance > 15 +- .error "invalid prefetch distance (prefetch_distance)" ++.if \prefetch_distance < 0 || \prefetch_distance > 15 ++ .error "invalid prefetch distance (\prefetch_distance)" + .endif + +- PF mov PF_X, #0 ++ PF mov, PF_X, #0 + mov DST_R, DST_W + + .if src_bpp == 24 + sub SRC_STRIDE, SRC_STRIDE, W + sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 + .endif + .if mask_bpp == 24 + sub MASK_STRIDE, MASK_STRIDE, W +@@ -839,71 +839,71 @@ 52: + .if dst_w_bpp == 24 + sub DST_STRIDE, DST_STRIDE, W + sub DST_STRIDE, DST_STRIDE, W, lsl #1 + .endif + + /* + * Setup advanced prefetcher initial state + */ +- PF mov PF_SRC, SRC +- PF mov PF_DST, DST_R +- PF mov PF_MASK, MASK +- /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ +- PF lsl DUMMY, H, #4 +- PF mov PF_CTL, DUMMY +- PF 
add PF_CTL, PF_CTL, #(prefetch_distance - 0x10) ++ PF mov, PF_SRC, SRC ++ PF mov, PF_DST, DST_R ++ PF mov, PF_MASK, MASK ++ /* PF_CTL = \prefetch_distance | ((h - 1) << 4) */ ++ PF lsl, DUMMY, H, #4 ++ PF mov, PF_CTL, DUMMY ++ PF add, PF_CTL, PF_CTL, #(\prefetch_distance - 0x10) + +- init ++ \init + subs H, H, #1 + mov ORIG_W, W + blt 9f + cmp W, #(pixblock_size * 2) + blt 800f + /* + * This is the start of the pipelined loop, which if optimized for + * long scanlines + */ + 0: +- ensure_destination_ptr_alignment process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ ensure_destination_ptr_alignment \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +- PF add PF_X, PF_X, #pixblock_size +- process_pixblock_head ++ PF add, PF_X, PF_X, #pixblock_size ++ \process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple + subs W, W, #(pixblock_size * 2) + blt 200f + + 100: +- process_pixblock_tail_head ++ \process_pixblock_tail_head + cache_preload_simple + subs W, W, #pixblock_size + bge 100b + + 200: +- process_pixblock_tail ++ \process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 1, 1, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + advance_to_next_scanline 0b + +- cleanup ++ \cleanup + 1000: + /* pop all registers */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -112] +@@ -920,48 +920,48 @@ 1000: + ret /* exit */ + /* + * This is the start of the loop, designed to process images with small width + * (less than pixblock_size * 2 pixels). In this case neither pipelining + * nor prefetch are used. 
+ */ + 800: + .if src_bpp_shift >= 0 +- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift +- PF prfm PREFETCH_MODE, [SRC, DUMMY] ++ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF prfm, PREFETCH_MODE, [SRC, DUMMY] + .endif + .if dst_r_bpp != 0 +- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift +- PF prfm PREFETCH_MODE, [DST_R, DUMMY] ++ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF prfm, PREFETCH_MODE, [DST_R, DUMMY] + .endif + .if mask_bpp_shift >= 0 +- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift +- PF prfm PREFETCH_MODE, [MASK, DUMMY] ++ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF prfm, PREFETCH_MODE, [MASK, DUMMY] + .endif + /* Process exactly pixblock_size pixels if needed */ + tst W, #pixblock_size + beq 100f + pixld pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +- process_pixblock_head +- process_pixblock_tail ++ \process_pixblock_head ++ \process_pixblock_tail + pixst pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + 100: + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 0, 0, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + advance_to_next_scanline 800b + 9: +- cleanup ++ \cleanup + /* pop all registers */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -112] + ldp x14, x15, [x29, -128] +@@ -990,17 +990,17 @@ 9: + .unreq DST_STRIDE + .unreq MASK_STRIDE + .unreq PF_CTL + .unreq PF_X + .unreq PF_SRC + .unreq PF_DST + .unreq PF_MASK + .unreq DUMMY +- .endfunc ++ pixman_end_asm_function + .endm + + /* + * A simplified variant of function generation template for a single + * scanline processing (for implementing pixman combine functions) + */ + .macro generate_composite_function_scanline use_nearest_scaling, \ + fname, \ +@@ -1014,50 +1014,50 @@ 9: + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + +- pixman_asm_function fname ++ pixman_asm_function \fname + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + + /* + * Make some macro arguments globally visible and accessible + * from other macros + */ +- .set src_bpp, src_bpp_ +- .set mask_bpp, mask_bpp_ +- .set dst_w_bpp, dst_w_bpp_ +- .set pixblock_size, pixblock_size_ +- .set dst_w_basereg, dst_w_basereg_ +- .set dst_r_basereg, dst_r_basereg_ +- .set src_basereg, src_basereg_ +- .set mask_basereg, mask_basereg_ ++ .set src_bpp, \src_bpp_ ++ .set mask_bpp, \mask_bpp_ ++ .set dst_w_bpp, \dst_w_bpp_ ++ .set pixblock_size, \pixblock_size_ ++ .set dst_w_basereg, \dst_w_basereg_ ++ .set dst_r_basereg, \dst_r_basereg_ ++ .set src_basereg, \src_basereg_ ++ .set mask_basereg, \mask_basereg_ + +-.if use_nearest_scaling != 0 ++.if \use_nearest_scaling != 0 + /* + * Assign symbolic names to registers for nearest scaling + */ + W .req x0 + DST_W .req x1 + SRC .req x2 + VX .req x3 + UNIT_X .req x4 + SRC_WIDTH_FIXED .req x5 + MASK .req x6 + TMP1 .req x8 + TMP2 .req x9 + DST_R .req x10 + DUMMY .req x30 + + .macro pixld_src x:vararg +- pixld_s x ++ pixld_s \x + .endm + + sxtw x0, w0 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 + + stp x29, x30, 
[sp, -16]! +@@ -1075,84 +1075,84 @@ 9: + W .req x0 /* width (is updated during processing) */ + DST_W .req x1 /* destination buffer pointer for writes */ + SRC .req x2 /* source buffer pointer */ + MASK .req x3 /* mask pointer */ + DST_R .req x4 /* destination buffer pointer for reads */ + DUMMY .req x30 + + .macro pixld_src x:vararg +- pixld x ++ pixld \x + .endm + + sxtw x0, w0 + + stp x29, x30, [sp, -16]! + mov x29, sp + sub sp, sp, 64 + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + .endif + +-.if (((flags) & FLAG_DST_READWRITE) != 0) ++.if (((\flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp + .else + .set dst_r_bpp, 0 + .endif +-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) ++.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 + .else + .set DEINTERLEAVE_32BPP_ENABLED, 0 + .endif + + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm + +- init ++ \init + mov DST_R, DST_W + + cmp W, #pixblock_size + blt 800f + +- ensure_destination_ptr_alignment process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ ensure_destination_ptr_alignment \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + + subs W, W, #pixblock_size + blt 700f + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +- process_pixblock_head ++ \process_pixblock_head + subs W, W, #pixblock_size + blt 200f + 100: +- process_pixblock_tail_head ++ \process_pixblock_tail_head + subs W, W, #pixblock_size + bge 100b + 200: +- process_pixblock_tail ++ \process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + 700: + /* Process the remaining trailing pixels in the scanline (dst aligned) */ + process_trailing_pixels 0, 1, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + +- cleanup +-.if use_nearest_scaling != 0 ++ \cleanup ++.if \use_nearest_scaling != 0 + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldr x10, [x29, -96] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ +@@ -1162,22 +1162,22 @@ 700: + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ + .endif + 800: + /* Process the remaining trailing pixels in the scanline (dst unaligned) */ + process_trailing_pixels 0, 0, \ +- process_pixblock_head, \ +- process_pixblock_tail, \ +- process_pixblock_tail_head ++ \process_pixblock_head, \ ++ \process_pixblock_tail, \ ++ \process_pixblock_tail_head + +- cleanup +-.if use_nearest_scaling != 0 ++ \cleanup ++.if \use_nearest_scaling != 0 + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldr x10, [x29, -88] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ +@@ -1208,25 +1208,25 @@ 800: + .unreq DST_R + .unreq DST_W + .unreq W + .endif + + .purgem fetch_src_pixblock + .purgem pixld_src + +- .endfunc ++ pixman_end_asm_function + 
.endm + + .macro generate_composite_function_single_scanline x:vararg +- generate_composite_function_scanline 0, x ++ generate_composite_function_scanline 0, \x + .endm + + .macro generate_composite_function_nearest_scanline x:vararg +- generate_composite_function_scanline 1, x ++ generate_composite_function_scanline 1, \x + .endm + + /* Default prologue/epilogue, nothing special needs to be done */ + + .macro default_init + .endm + + .macro default_cleanup +@@ -1250,61 +1250,61 @@ 800: + * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in) + * into a planar a8r8g8b8 format (with a, r, g, b color components + * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). + * + * Warning: the conversion is destructive and the original + * value (in) is lost. + */ + .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b +- shrn &out_r&.8b, &in&.8h, #8 +- shrn &out_g&.8b, &in&.8h, #3 +- sli &in&.8h, &in&.8h, #5 +- movi &out_a&.8b, #255 +- sri &out_r&.8b, &out_r&.8b, #5 +- sri &out_g&.8b, &out_g&.8b, #6 +- shrn &out_b&.8b, &in&.8h, #2 ++ shrn \()\out_r\().8b, \()\in\().8h, #8 ++ shrn \()\out_g\().8b, \()\in\().8h, #3 ++ sli \()\in\().8h, \()\in\().8h, #5 ++ movi \()\out_a\().8b, #255 ++ sri \()\out_r\().8b, \()\out_r\().8b, #5 ++ sri \()\out_g\().8b, \()\out_g\().8b, #6 ++ shrn \()\out_b\().8b, \()\in\().8h, #2 + .endm + + .macro convert_0565_to_x888 in, out_r, out_g, out_b +- shrn &out_r&.8b, &in&.8h, #8 +- shrn &out_g&.8b, &in&.8h, #3 +- sli &in&.8h, &in&.8h, #5 +- sri &out_r&.8b, &out_r&.8b, #5 +- sri &out_g&.8b, &out_g&.8b, #6 +- shrn &out_b&.8b, &in&.8h, #2 ++ shrn \()\out_r\().8b, \()\in\().8h, #8 ++ shrn \()\out_g\().8b, \()\in\().8h, #3 ++ sli \()\in\().8h, \()\in\().8h, #5 ++ sri \()\out_r\().8b, \()\out_r\().8b, #5 ++ sri \()\out_g\().8b, \()\out_g\().8b, #6 ++ shrn \()\out_b\().8b, \()\in\().8h, #2 + .endm + + /* + * Conversion from planar a8r8g8b8 format (with a, r, g, b color components + * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6 + * pixels packed in 128-bit register (out). Requires two temporary 128-bit + * registers (tmp1, tmp2) + */ + .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 +- ushll &tmp1&.8h, &in_g&.8b, #7 +- shl &tmp1&.8h, &tmp1&.8h, #1 +- ushll &out&.8h, &in_r&.8b, #7 +- shl &out&.8h, &out&.8h, #1 +- ushll &tmp2&.8h, &in_b&.8b, #7 +- shl &tmp2&.8h, &tmp2&.8h, #1 +- sri &out&.8h, &tmp1&.8h, #5 +- sri &out&.8h, &tmp2&.8h, #11 ++ ushll \()\tmp1\().8h, \()\in_g\().8b, #7 ++ shl \()\tmp1\().8h, \()\tmp1\().8h, #1 ++ ushll \()\out\().8h, \()\in_r\().8b, #7 ++ shl \()\out\().8h, \()\out\().8h, #1 ++ ushll \()\tmp2\().8h, \()\in_b\().8b, #7 ++ shl \()\tmp2\().8h, \()\tmp2\().8h, #1 ++ sri \()\out\().8h, \()\tmp1\().8h, #5 ++ sri \()\out\().8h, \()\tmp2\().8h, #11 + .endm + + /* + * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels + * returned in (out0, out1) registers pair. Requires one temporary + * 64-bit register (tmp). 
'out1' and 'in' may overlap, the original
+ value from 'in' is lost
+ */
+ .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+- shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */
+- shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */
+- sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */
+- sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */
+- sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */
+- ushr &out1&.4h, &in&.4h, #8 /* R is in place */
+- sri &out0&.4h, &tmp&.4h, #8 /* G & B is in place */
+- zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */
+- zip2 &out1&.4h, &out0&.4h, &out1&.4h
+- mov &out0&.d[0], &tmp&.d[0]
++ shl \()\out0\().4h, \()\in\().4h, #5 /* G top 6 bits */
++ shl \()\tmp\().4h, \()\in\().4h, #11 /* B top 5 bits */
++ sri \()\in\().4h, \()\in\().4h, #5 /* R is ready in top bits */
++ sri \()\out0\().4h, \()\out0\().4h, #6 /* G is ready in top bits */
++ sri \()\tmp\().4h, \()\tmp\().4h, #5 /* B is ready in top bits */
++ ushr \()\out1\().4h, \()\in\().4h, #8 /* R is in place */
++ sri \()\out0\().4h, \()\tmp\().4h, #8 /* G & B is in place */
++ zip1 \()\tmp\().4h, \()\out0\().4h, \()\out1\().4h /* everything is in place */
++ zip2 \()\out1\().4h, \()\out0\().4h, \()\out1\().4h
++ mov \()\out0\().d[0], \()\tmp\().d[0]
+ .endm |
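A short usage sketch of the r5g6b5 helpers above, with register choices invented for illustration rather than taken from the patch: invocations like this one do not change, since only references to macro parameters inside macro bodies need the backslash spelling.

    ld1 {v0.8h}, [x2] /* eight r5g6b5 pixels */
    convert_0565_to_8888 v0, v4, v3, v2, v1 /* a -> v4, r -> v3, g -> v2, b -> v1 */
    /* the first line of the macro, "shrn \()\out_r\().8b, \()\in\().8h, #8",
     * expands here to "shrn v3.8b, v0.8h, #8", exactly what the old
     * "&out_r&"/"&in&" spelling produced under binutils gas; v0 (the "in"
     * argument) is clobbered, as the warning above convert_0565_to_8888 notes. */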