Diffstat (limited to 'gfx/cairo/pixman-arm32-clang.patch')
-rw-r--r-- | gfx/cairo/pixman-arm32-clang.patch | 5205 |
1 file changed, 0 insertions, 5205 deletions
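
The deleted patch below (tracked upstream at https://gitlab.freedesktop.org/pixman/pixman/-/issues/74) adapted pixman's hand-written ARM NEON assembly to clang's integrated assembler. Nearly every hunk applies the same mechanical rewrite inside GNU as macros: parameters gain an explicit backslash prefix (\reg1 instead of the bare reg1 that binutils tolerates), and token pasting switches from the legacy & syntax to the \() separator. A minimal sketch of that rewrite, using hypothetical macros rather than code taken from the patch:

    /* Pre-patch style: GNU as (binutils) substitutes bare macro
     * parameter names and accepts '&' for token pasting; clang's
     * integrated assembler rejects both.  Hypothetical macro, not
     * copied from the patch. */
    .macro load_two_old reg1, reg2
        vld1.32     {reg1}, [r0]!
        vld1.32     {reg2}, [r0]!
    .endm

    /* Post-patch style, as applied throughout the file below: every
     * parameter reference carries a backslash, and \() separates a
     * parameter from adjacent text when names must be concatenated. */
    .macro load_two reg1, reg2
        vld1.32     {\reg1}, [r0]!
        vld1.32     {\reg2}, [r0]!
    .endm

    /* store_one 4 expands to: vst1.32 {d4}, [r1]!
     * (pre-patch spelling of the same operand was d&regnum) */
    .macro store_one regnum
        vst1.32     {d\()\regnum}, [r1]!
    .endm

Both spellings assemble identically under binutils gas; only the second form is accepted by clang, which is why the patch could touch thousands of lines while leaving the generated instructions unchanged.
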
diff --git a/gfx/cairo/pixman-arm32-clang.patch b/gfx/cairo/pixman-arm32-clang.patch deleted file mode 100644 index cd9d61e470..0000000000 --- a/gfx/cairo/pixman-arm32-clang.patch +++ /dev/null @@ -1,5205 +0,0 @@ -https://gitlab.freedesktop.org/pixman/pixman/-/issues/74 - -diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S ---- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S -+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S -@@ -77,206 +77,206 @@ - * format conversion, and interpolation as separate macros which can be used - * as the basic building blocks for constructing bilinear scanline functions. - */ - - .macro bilinear_load_8888 reg1, reg2, tmp - mov TMP1, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #2 -- vld1.32 {reg1}, [TMP1], STRIDE -- vld1.32 {reg2}, [TMP1] -+ vld1.32 {\reg1}, [TMP1], STRIDE -+ vld1.32 {\reg2}, [TMP1] - .endm - - .macro bilinear_load_0565 reg1, reg2, tmp - mov TMP1, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #1 -- vld1.32 {reg2[0]}, [TMP1], STRIDE -- vld1.32 {reg2[1]}, [TMP1] -- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp -+ vld1.32 {\reg2[0]}, [TMP1], STRIDE -+ vld1.32 {\reg2[1]}, [TMP1] -+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp - .endm - - .macro bilinear_load_and_vertical_interpolate_two_8888 \ - acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 - -- bilinear_load_8888 reg1, reg2, tmp1 -- vmull.u8 acc1, reg1, d28 -- vmlal.u8 acc1, reg2, d29 -- bilinear_load_8888 reg3, reg4, tmp2 -- vmull.u8 acc2, reg3, d28 -- vmlal.u8 acc2, reg4, d29 -+ bilinear_load_8888 \reg1, \reg2, \tmp1 -+ vmull.u8 \acc1, \reg1, d28 -+ vmlal.u8 \acc1, \reg2, d29 -+ bilinear_load_8888 \reg3, \reg4, \tmp2 -+ vmull.u8 \acc2, \reg3, d28 -+ vmlal.u8 \acc2, \reg4, d29 - .endm - - .macro bilinear_load_and_vertical_interpolate_four_8888 \ - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi - - bilinear_load_and_vertical_interpolate_two_8888 \ -- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi -+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi - bilinear_load_and_vertical_interpolate_two_8888 \ -- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi -+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi - .endm - - .macro bilinear_load_and_vertical_interpolate_two_0565 \ - acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi - - mov TMP1, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #1 - mov TMP2, X, asr #16 - add X, X, UX - add TMP2, TOP, TMP2, asl #1 -- vld1.32 {acc2lo[0]}, [TMP1], STRIDE -- vld1.32 {acc2hi[0]}, [TMP2], STRIDE -- vld1.32 {acc2lo[1]}, [TMP1] -- vld1.32 {acc2hi[1]}, [TMP2] -- convert_0565_to_x888 acc2, reg3, reg2, reg1 -- vzip.u8 reg1, reg3 -- vzip.u8 reg2, reg4 -- vzip.u8 reg3, reg4 -- vzip.u8 reg1, reg2 -- vmull.u8 acc1, reg1, d28 -- vmlal.u8 acc1, reg2, d29 -- vmull.u8 acc2, reg3, d28 -- vmlal.u8 acc2, reg4, d29 -+ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE -+ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE -+ vld1.32 {\acc2lo[1]}, [TMP1] -+ vld1.32 {\acc2hi[1]}, [TMP2] -+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 -+ vzip.u8 \reg1, \reg3 -+ vzip.u8 \reg2, \reg4 -+ vzip.u8 \reg3, \reg4 -+ vzip.u8 \reg1, \reg2 -+ vmull.u8 \acc1, \reg1, d28 -+ vmlal.u8 \acc1, \reg2, d29 -+ vmull.u8 \acc2, \reg3, d28 -+ vmlal.u8 \acc2, \reg4, d29 - .endm - - .macro bilinear_load_and_vertical_interpolate_four_0565 \ - xacc1, xacc2, xreg1, 
xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi - - mov TMP1, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #1 - mov TMP2, X, asr #16 - add X, X, UX - add TMP2, TOP, TMP2, asl #1 -- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE -- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE -- vld1.32 {xacc2lo[1]}, [TMP1] -- vld1.32 {xacc2hi[1]}, [TMP2] -- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 -+ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE -+ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE -+ vld1.32 {\xacc2lo[1]}, [TMP1] -+ vld1.32 {\xacc2hi[1]}, [TMP2] -+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 - mov TMP1, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #1 - mov TMP2, X, asr #16 - add X, X, UX - add TMP2, TOP, TMP2, asl #1 -- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE -- vzip.u8 xreg1, xreg3 -- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE -- vzip.u8 xreg2, xreg4 -- vld1.32 {yacc2lo[1]}, [TMP1] -- vzip.u8 xreg3, xreg4 -- vld1.32 {yacc2hi[1]}, [TMP2] -- vzip.u8 xreg1, xreg2 -- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 -- vmull.u8 xacc1, xreg1, d28 -- vzip.u8 yreg1, yreg3 -- vmlal.u8 xacc1, xreg2, d29 -- vzip.u8 yreg2, yreg4 -- vmull.u8 xacc2, xreg3, d28 -- vzip.u8 yreg3, yreg4 -- vmlal.u8 xacc2, xreg4, d29 -- vzip.u8 yreg1, yreg2 -- vmull.u8 yacc1, yreg1, d28 -- vmlal.u8 yacc1, yreg2, d29 -- vmull.u8 yacc2, yreg3, d28 -- vmlal.u8 yacc2, yreg4, d29 -+ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE -+ vzip.u8 \xreg1, \xreg3 -+ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE -+ vzip.u8 \xreg2, \xreg4 -+ vld1.32 {\yacc2lo[1]}, [TMP1] -+ vzip.u8 \xreg3, \xreg4 -+ vld1.32 {\yacc2hi[1]}, [TMP2] -+ vzip.u8 \xreg1, \xreg2 -+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 -+ vmull.u8 \xacc1, \xreg1, d28 -+ vzip.u8 \yreg1, \yreg3 -+ vmlal.u8 \xacc1, \xreg2, d29 -+ vzip.u8 \yreg2, \yreg4 -+ vmull.u8 \xacc2, \xreg3, d28 -+ vzip.u8 \yreg3, \yreg4 -+ vmlal.u8 \xacc2, \xreg4, d29 -+ vzip.u8 \yreg1, \yreg2 -+ vmull.u8 \yacc1, \yreg1, d28 -+ vmlal.u8 \yacc1, \yreg2, d29 -+ vmull.u8 \yacc2, \yreg3, d28 -+ vmlal.u8 \yacc2, \yreg4, d29 - .endm - - .macro bilinear_store_8888 numpix, tmp1, tmp2 --.if numpix == 4 -+.if \numpix == 4 - vst1.32 {d0, d1}, [OUT]! --.elseif numpix == 2 -+.elseif \numpix == 2 - vst1.32 {d0}, [OUT]! --.elseif numpix == 1 -+.elseif \numpix == 1 - vst1.32 {d0[0]}, [OUT, :32]! - .else - .error bilinear_store_8888 numpix is unsupported - .endif - .endm - - .macro bilinear_store_0565 numpix, tmp1, tmp2 - vuzp.u8 d0, d1 - vuzp.u8 d2, d3 - vuzp.u8 d1, d3 - vuzp.u8 d0, d2 -- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 --.if numpix == 4 -+ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2 -+.if \numpix == 4 - vst1.16 {d2}, [OUT]! --.elseif numpix == 2 -+.elseif \numpix == 2 - vst1.32 {d2[0]}, [OUT]! --.elseif numpix == 1 -+.elseif \numpix == 1 - vst1.16 {d2[0]}, [OUT]! - .else - .error bilinear_store_0565 numpix is unsupported - .endif - .endm - - - /* - * Macros for loading mask pixels into register 'mask'. - * vdup must be done in somewhere else. - */ - .macro bilinear_load_mask_x numpix, mask - .endm - - .macro bilinear_load_mask_8 numpix, mask --.if numpix == 4 -- vld1.32 {mask[0]}, [MASK]! --.elseif numpix == 2 -- vld1.16 {mask[0]}, [MASK]! --.elseif numpix == 1 -- vld1.8 {mask[0]}, [MASK]! -+.if \numpix == 4 -+ vld1.32 {\mask[0]}, [MASK]! -+.elseif \numpix == 2 -+ vld1.16 {\mask[0]}, [MASK]! -+.elseif \numpix == 1 -+ vld1.8 {\mask[0]}, [MASK]! 
- .else -- .error bilinear_load_mask_8 numpix is unsupported -+ .error bilinear_load_mask_8 \numpix is unsupported - .endif - pld [MASK, #prefetch_offset] - .endm - - .macro bilinear_load_mask mask_fmt, numpix, mask -- bilinear_load_mask_&mask_fmt numpix, mask -+ bilinear_load_mask_\()\mask_fmt \numpix, \mask - .endm - - - /* - * Macros for loading destination pixels into register 'dst0' and 'dst1'. - * Interleave should be done somewhere else. - */ - .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01 - .endm - - .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01 - .endm - - .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 --.if numpix == 4 -- vld1.32 {dst0, dst1}, [OUT] --.elseif numpix == 2 -- vld1.32 {dst0}, [OUT] --.elseif numpix == 1 -- vld1.32 {dst0[0]}, [OUT] -+.if \numpix == 4 -+ vld1.32 {\dst0, \dst1}, [OUT] -+.elseif \numpix == 2 -+ vld1.32 {\dst0}, [OUT] -+.elseif \numpix == 1 -+ vld1.32 {\dst0[0]}, [OUT] - .else -- .error bilinear_load_dst_8888 numpix is unsupported -+ .error bilinear_load_dst_8888 \numpix is unsupported - .endif - pld [OUT, #(prefetch_offset * 4)] - .endm - - .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 -- bilinear_load_dst_8888 numpix, dst0, dst1, dst01 -+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01 -- bilinear_load_dst_8888 numpix, dst0, dst1, dst01 -+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01 -- bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01 -+ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 - .endm - - /* - * Macros for duplicating partially loaded mask to fill entire register. - * We will apply mask to interleaved source pixels, that is - * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3) - * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3) - * So, we need to duplicate loaded mask into whole register. -@@ -285,79 +285,79 @@ - * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) - * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) - * We can do some optimizations for this including last pixel cases. - */ - .macro bilinear_duplicate_mask_x numpix, mask - .endm - - .macro bilinear_duplicate_mask_8 numpix, mask --.if numpix == 4 -- vdup.32 mask, mask[0] --.elseif numpix == 2 -- vdup.16 mask, mask[0] --.elseif numpix == 1 -- vdup.8 mask, mask[0] -+.if \numpix == 4 -+ vdup.32 \mask, \mask[0] -+.elseif \numpix == 2 -+ vdup.16 \mask, \mask[0] -+.elseif \numpix == 1 -+ vdup.8 \mask, \mask[0] - .else - .error bilinear_duplicate_mask_8 is unsupported - .endif - .endm - - .macro bilinear_duplicate_mask mask_fmt, numpix, mask -- bilinear_duplicate_mask_&mask_fmt numpix, mask -+ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask - .endm - - /* - * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form. - * Interleave should be done when maks is enabled or operator is 'over'. 
- */ - .macro bilinear_interleave src0, src1, dst0, dst1 -- vuzp.8 src0, src1 -- vuzp.8 dst0, dst1 -- vuzp.8 src0, src1 -- vuzp.8 dst0, dst1 -+ vuzp.8 \src0, \src1 -+ vuzp.8 \dst0, \dst1 -+ vuzp.8 \src0, \src1 -+ vuzp.8 \dst0, \dst1 - .endm - - .macro bilinear_interleave_src_dst_x_src \ - numpix, src0, src1, src01, dst0, dst1, dst01 - .endm - - .macro bilinear_interleave_src_dst_x_over \ - numpix, src0, src1, src01, dst0, dst1, dst01 - -- bilinear_interleave src0, src1, dst0, dst1 -+ bilinear_interleave \src0, \src1, \dst0, \dst1 - .endm - - .macro bilinear_interleave_src_dst_x_add \ - numpix, src0, src1, src01, dst0, dst1, dst01 - .endm - - .macro bilinear_interleave_src_dst_8_src \ - numpix, src0, src1, src01, dst0, dst1, dst01 - -- bilinear_interleave src0, src1, dst0, dst1 -+ bilinear_interleave \src0, \src1, \dst0, \dst1 - .endm - - .macro bilinear_interleave_src_dst_8_over \ - numpix, src0, src1, src01, dst0, dst1, dst01 - -- bilinear_interleave src0, src1, dst0, dst1 -+ bilinear_interleave \src0, \src1, \dst0, \dst1 - .endm - - .macro bilinear_interleave_src_dst_8_add \ - numpix, src0, src1, src01, dst0, dst1, dst01 - -- bilinear_interleave src0, src1, dst0, dst1 -+ bilinear_interleave \src0, \src1, \dst0, \dst1 - .endm - - .macro bilinear_interleave_src_dst \ - mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01 - -- bilinear_interleave_src_dst_&mask_fmt&_&op \ -- numpix, src0, src1, src01, dst0, dst1, dst01 -+ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \ -+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01 - .endm - - - /* - * Macros for applying masks to src pixels. (see combine_mask_u() function) - * src, dst should be in interleaved form. - * mask register should be in form (m0, m1, m2, m3). - */ -@@ -365,217 +365,217 @@ - numpix, src0, src1, src01, mask, \ - tmp01, tmp23, tmp45, tmp67 - .endm - - .macro bilinear_apply_mask_to_src_8 \ - numpix, src0, src1, src01, mask, \ - tmp01, tmp23, tmp45, tmp67 - -- vmull.u8 tmp01, src0, mask -- vmull.u8 tmp23, src1, mask -+ vmull.u8 \tmp01, \src0, \mask -+ vmull.u8 \tmp23, \src1, \mask - /* bubbles */ -- vrshr.u16 tmp45, tmp01, #8 -- vrshr.u16 tmp67, tmp23, #8 -+ vrshr.u16 \tmp45, \tmp01, #8 -+ vrshr.u16 \tmp67, \tmp23, #8 - /* bubbles */ -- vraddhn.u16 src0, tmp45, tmp01 -- vraddhn.u16 src1, tmp67, tmp23 -+ vraddhn.u16 \src0, \tmp45, \tmp01 -+ vraddhn.u16 \src1, \tmp67, \tmp23 - .endm - - .macro bilinear_apply_mask_to_src \ - mask_fmt, numpix, src0, src1, src01, mask, \ - tmp01, tmp23, tmp45, tmp67 - -- bilinear_apply_mask_to_src_&mask_fmt \ -- numpix, src0, src1, src01, mask, \ -- tmp01, tmp23, tmp45, tmp67 -+ bilinear_apply_mask_to_src_\()\mask_fmt \ -+ \numpix, \src0, \src1, \src01, \mask, \ -+ \tmp01, \tmp23, \tmp45, \tmp67 - .endm - - - /* - * Macros for combining src and destination pixels. - * Interleave or not is depending on operator 'op'. 
- */ - .macro bilinear_combine_src \ - numpix, src0, src1, src01, dst0, dst1, dst01, \ - tmp01, tmp23, tmp45, tmp67, tmp8 - .endm - - .macro bilinear_combine_over \ - numpix, src0, src1, src01, dst0, dst1, dst01, \ - tmp01, tmp23, tmp45, tmp67, tmp8 - -- vdup.32 tmp8, src1[1] -+ vdup.32 \tmp8, \src1[1] - /* bubbles */ -- vmvn.8 tmp8, tmp8 -+ vmvn.8 \tmp8, \tmp8 - /* bubbles */ -- vmull.u8 tmp01, dst0, tmp8 -+ vmull.u8 \tmp01, \dst0, \tmp8 - /* bubbles */ -- vmull.u8 tmp23, dst1, tmp8 -+ vmull.u8 \tmp23, \dst1, \tmp8 - /* bubbles */ -- vrshr.u16 tmp45, tmp01, #8 -- vrshr.u16 tmp67, tmp23, #8 -+ vrshr.u16 \tmp45, \tmp01, #8 -+ vrshr.u16 \tmp67, \tmp23, #8 - /* bubbles */ -- vraddhn.u16 dst0, tmp45, tmp01 -- vraddhn.u16 dst1, tmp67, tmp23 -+ vraddhn.u16 \dst0, \tmp45, \tmp01 -+ vraddhn.u16 \dst1, \tmp67, \tmp23 - /* bubbles */ -- vqadd.u8 src01, dst01, src01 -+ vqadd.u8 \src01, \dst01, \src01 - .endm - - .macro bilinear_combine_add \ - numpix, src0, src1, src01, dst0, dst1, dst01, \ - tmp01, tmp23, tmp45, tmp67, tmp8 - -- vqadd.u8 src01, dst01, src01 -+ vqadd.u8 \src01, \dst01, \src01 - .endm - - .macro bilinear_combine \ - op, numpix, src0, src1, src01, dst0, dst1, dst01, \ - tmp01, tmp23, tmp45, tmp67, tmp8 - -- bilinear_combine_&op \ -- numpix, src0, src1, src01, dst0, dst1, dst01, \ -- tmp01, tmp23, tmp45, tmp67, tmp8 -+ bilinear_combine_\()\op \ -+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \ -+ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8 - .endm - - /* - * Macros for final deinterleaving of destination pixels if needed. - */ - .macro bilinear_deinterleave numpix, dst0, dst1, dst01 -- vuzp.8 dst0, dst1 -+ vuzp.8 \dst0, \dst1 - /* bubbles */ -- vuzp.8 dst0, dst1 -+ vuzp.8 \dst0, \dst1 - .endm - - .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 - .endm - - .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01 -- bilinear_deinterleave numpix, dst0, dst1, dst01 -+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 - .endm - - .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 -- bilinear_deinterleave numpix, dst0, dst1, dst01 -+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01 -- bilinear_deinterleave numpix, dst0, dst1, dst01 -+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01 -- bilinear_deinterleave numpix, dst0, dst1, dst01 -+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 - .endm - - .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01 -- bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01 -+ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 - .endm - - - .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op -- bilinear_load_&src_fmt d0, d1, d2 -- bilinear_load_mask mask_fmt, 1, d4 -- bilinear_load_dst dst_fmt, op, 1, d18, d19, q9 -+ bilinear_load_\()\src_fmt d0, d1, d2 -+ bilinear_load_mask \mask_fmt, 1, d4 -+ bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9 - vmull.u8 q1, d0, d28 - vmlal.u8 q1, d1, d29 - /* 5 cycles bubble */ - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - /* 5 cycles bubble */ -- bilinear_duplicate_mask mask_fmt, 1, d4 -+ bilinear_duplicate_mask \mask_fmt, 1, d4 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - /* 3 cycles bubble */ - vmovn.u16 
d0, q0 - /* 1 cycle bubble */ - bilinear_interleave_src_dst \ -- mask_fmt, op, 1, d0, d1, q0, d18, d19, q9 -+ \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9 - bilinear_apply_mask_to_src \ -- mask_fmt, 1, d0, d1, q0, d4, \ -+ \mask_fmt, 1, d0, d1, q0, d4, \ - q3, q8, q10, q11 - bilinear_combine \ -- op, 1, d0, d1, q0, d18, d19, q9, \ -+ \op, 1, d0, d1, q0, d18, d19, q9, \ - q3, q8, q10, q11, d5 -- bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0 -- bilinear_store_&dst_fmt 1, q2, q3 -+ bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0 -+ bilinear_store_\()\dst_fmt 1, q2, q3 - .endm - - .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op -- bilinear_load_and_vertical_interpolate_two_&src_fmt \ -+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ - q1, q11, d0, d1, d20, d21, d22, d23 -- bilinear_load_mask mask_fmt, 2, d4 -- bilinear_load_dst dst_fmt, op, 2, d18, d19, q9 -+ bilinear_load_mask \mask_fmt, 2, d4 -+ bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9 - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q10, d22, d31 - vmlal.u16 q10, d23, d31 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) -- bilinear_duplicate_mask mask_fmt, 2, d4 -+ bilinear_duplicate_mask \mask_fmt, 2, d4 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - vmovn.u16 d0, q0 - bilinear_interleave_src_dst \ -- mask_fmt, op, 2, d0, d1, q0, d18, d19, q9 -+ \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9 - bilinear_apply_mask_to_src \ -- mask_fmt, 2, d0, d1, q0, d4, \ -+ \mask_fmt, 2, d0, d1, q0, d4, \ - q3, q8, q10, q11 - bilinear_combine \ -- op, 2, d0, d1, q0, d18, d19, q9, \ -+ \op, 2, d0, d1, q0, d18, d19, q9, \ - q3, q8, q10, q11, d5 -- bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0 -- bilinear_store_&dst_fmt 2, q2, q3 -+ bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0 -+ bilinear_store_\()\dst_fmt 2, q2, q3 - .endm - - .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op -- bilinear_load_and_vertical_interpolate_four_&src_fmt \ -+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ - q1, q11, d0, d1, d20, d21, d22, d23 \ - q3, q9, d4, d5, d16, d17, d18, d19 - pld [TMP1, PF_OFFS] - sub TMP1, TMP1, STRIDE - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q10, d22, d31 - vmlal.u16 q10, d23, d31 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q2, d6, d30 - vmlal.u16 q2, d7, d30 - vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS -- bilinear_load_mask mask_fmt, 4, d22 -- bilinear_load_dst dst_fmt, op, 4, d2, d3, q1 -+ bilinear_load_mask \mask_fmt, 4, d22 -+ bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1 - pld [TMP1, PF_OFFS] - vmlsl.u16 q8, d18, d31 - vmlal.u16 q8, d19, d31 - vadd.u16 q12, q12, q13 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) -- bilinear_duplicate_mask mask_fmt, 4, d22 -+ bilinear_duplicate_mask \mask_fmt, 4, d22 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vmovn.u16 d0, q0 - vmovn.u16 d1, q2 - vadd.u16 q12, q12, q13 - bilinear_interleave_src_dst \ -- mask_fmt, op, 4, d0, d1, 
q0, d2, d3, q1 -+ \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1 - bilinear_apply_mask_to_src \ -- mask_fmt, 4, d0, d1, q0, d22, \ -+ \mask_fmt, 4, d0, d1, q0, d22, \ - q3, q8, q9, q10 - bilinear_combine \ -- op, 4, d0, d1, q0, d2, d3, q1, \ -+ \op, 4, d0, d1, q0, d2, d3, q1, \ - q3, q8, q9, q10, d23 -- bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0 -- bilinear_store_&dst_fmt 4, q2, q3 -+ bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0 -+ bilinear_store_\()\dst_fmt 4, q2, q3 - .endm - - .set BILINEAR_FLAG_USE_MASK, 1 - .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 - - /* - * Main template macro for generating NEON optimized bilinear scanline functions. - * -@@ -605,24 +605,24 @@ - bilinear_process_four_pixels, \ - bilinear_process_pixblock_head, \ - bilinear_process_pixblock_tail, \ - bilinear_process_pixblock_tail_head, \ - pixblock_size, \ - prefetch_distance, \ - flags - --pixman_asm_function fname --.if pixblock_size == 8 --.elseif pixblock_size == 4 -+pixman_asm_function \fname -+.if \pixblock_size == 8 -+.elseif \pixblock_size == 4 - .else - .error unsupported pixblock size - .endif - --.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 -+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0 - OUT .req r0 - TOP .req r1 - BOTTOM .req r2 - WT .req r3 - WB .req r4 - X .req r5 - UX .req r6 - WIDTH .req ip -@@ -630,17 +630,17 @@ pixman_asm_function fname - TMP2 .req r4 - PF_OFFS .req r7 - TMP3 .req r8 - TMP4 .req r9 - STRIDE .req r2 - - mov ip, sp - push {r4, r5, r6, r7, r8, r9} -- mov PF_OFFS, #prefetch_distance -+ mov PF_OFFS, #\prefetch_distance - ldmia ip, {WB, X, UX, WIDTH} - .else - OUT .req r0 - MASK .req r1 - TOP .req r2 - BOTTOM .req r3 - WT .req r4 - WB .req r5 -@@ -649,27 +649,27 @@ pixman_asm_function fname - WIDTH .req ip - TMP1 .req r4 - TMP2 .req r5 - PF_OFFS .req r8 - TMP3 .req r9 - TMP4 .req r10 - STRIDE .req r3 - -- .set prefetch_offset, prefetch_distance -+ .set prefetch_offset, \prefetch_distance - - mov ip, sp - push {r4, r5, r6, r7, r8, r9, r10, ip} -- mov PF_OFFS, #prefetch_distance -+ mov PF_OFFS, #\prefetch_distance - ldmia ip, {WT, WB, X, UX, WIDTH} - .endif - - mul PF_OFFS, PF_OFFS, UX - --.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 -+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 - vpush {d8-d15} - .endif - - sub STRIDE, BOTTOM, TOP - .unreq BOTTOM - - cmp WIDTH, #0 - ble 3f -@@ -678,76 +678,76 @@ pixman_asm_function fname - vdup.u16 q13, UX - vdup.u8 d28, WT - vdup.u8 d29, WB - vadd.u16 d25, d25, d26 - - /* ensure good destination alignment */ - cmp WIDTH, #1 - blt 0f -- tst OUT, #(1 << dst_bpp_shift) -+ tst OUT, #(1 << \dst_bpp_shift) - beq 0f - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 -- bilinear_process_last_pixel -+ \bilinear_process_last_pixel - sub WIDTH, WIDTH, #1 - 0: - vadd.u16 q13, q13, q13 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - - cmp WIDTH, #2 - blt 0f -- tst OUT, #(1 << (dst_bpp_shift + 1)) -+ tst OUT, #(1 << (\dst_bpp_shift + 1)) - beq 0f -- bilinear_process_two_pixels -+ \bilinear_process_two_pixels - sub WIDTH, WIDTH, #2 - 0: --.if pixblock_size == 8 -+.if \pixblock_size == 8 - cmp WIDTH, #4 - blt 0f -- tst OUT, #(1 << (dst_bpp_shift + 2)) -+ tst OUT, #(1 << (\dst_bpp_shift + 2)) - beq 0f -- bilinear_process_four_pixels -+ \bilinear_process_four_pixels - sub WIDTH, WIDTH, #4 - 0: - .endif -- subs WIDTH, WIDTH, #pixblock_size -+ subs WIDTH, WIDTH, #\pixblock_size - blt 1f -- mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) -- bilinear_process_pixblock_head -- 
subs WIDTH, WIDTH, #pixblock_size -+ mov PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift) -+ \bilinear_process_pixblock_head -+ subs WIDTH, WIDTH, #\pixblock_size - blt 5f - 0: -- bilinear_process_pixblock_tail_head -- subs WIDTH, WIDTH, #pixblock_size -+ \bilinear_process_pixblock_tail_head -+ subs WIDTH, WIDTH, #\pixblock_size - bge 0b - 5: -- bilinear_process_pixblock_tail -+ \bilinear_process_pixblock_tail - 1: --.if pixblock_size == 8 -+.if \pixblock_size == 8 - tst WIDTH, #4 - beq 2f -- bilinear_process_four_pixels -+ \bilinear_process_four_pixels - 2: - .endif - /* handle the remaining trailing pixels */ - tst WIDTH, #2 - beq 2f -- bilinear_process_two_pixels -+ \bilinear_process_two_pixels - 2: - tst WIDTH, #1 - beq 3f -- bilinear_process_last_pixel -+ \bilinear_process_last_pixel - 3: --.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 -+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 - vpop {d8-d15} - .endif - --.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 -+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0 - pop {r4, r5, r6, r7, r8, r9} - .else - pop {r4, r5, r6, r7, r8, r9, r10, ip} - .endif - bx lr - - .unreq OUT - .unreq TOP -@@ -757,21 +757,21 @@ 3: - .unreq UX - .unreq WIDTH - .unreq TMP1 - .unreq TMP2 - .unreq PF_OFFS - .unreq TMP3 - .unreq TMP4 - .unreq STRIDE --.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0 -+.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0 - .unreq MASK - .endif - --.endfunc -+pixman_end_asm_function - - .endm - - /* src_8888_8_8888 */ - .macro bilinear_src_8888_8_8888_process_last_pixel - bilinear_interpolate_last_pixel 8888, 8, 8888, src - .endm - -diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S ---- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S -+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S -@@ -29,16 +29,22 @@ - * (those which are exposing some new or interesting features) are - * extensively commented and can be used as examples. - * - * You may want to have a look at the comments for following functions: - * - pixman_composite_over_8888_0565_asm_neon - * - pixman_composite_over_n_8_0565_asm_neon - */ - -+#ifdef __clang__ -+#define ldrgeb ldrbge -+#define subges subsge -+#define subpls subspl -+#endif -+ - /* Prevent the stack from becoming executable for no reason... */ - #if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits - #endif - - .text - .fpu neon - .arch armv7a -@@ -255,43 +261,43 @@ - vqadd.u8 d16, d2, d20 - vld1.16 {d4, d5}, [DST_R, :128]! - vqadd.u8 q9, q0, q11 - vshrn.u16 d6, q2, #8 - fetch_src_pixblock - vshrn.u16 d7, q2, #3 - vsli.u16 q2, q2, #5 - vshll.u8 q14, d16, #8 -- PF add PF_X, PF_X, #8 -+ PF add, PF_X, PF_X, #8 - vshll.u8 q8, d19, #8 -- PF tst PF_CTL, #0xF -+ PF tst, PF_CTL, #0xF - vsri.u8 d6, d6, #5 -- PF addne PF_X, PF_X, #8 -+ PF addne, PF_X, PF_X, #8 - vmvn.8 d3, d3 -- PF subne PF_CTL, PF_CTL, #1 -+ PF subne, PF_CTL, PF_CTL, #1 - vsri.u8 d7, d7, #6 - vshrn.u16 d30, q2, #2 - vmull.u8 q10, d3, d6 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmull.u8 q11, d3, d7 - vmull.u8 q12, d3, d30 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vsri.u16 q14, q8, #5 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - vshll.u8 q9, d18, #8 - vrshr.u16 q13, q10, #8 -- PF subge PF_X, PF_X, ORIG_W -+ PF subge, PF_X, PF_X, ORIG_W - vrshr.u16 q3, q11, #8 - vrshr.u16 q15, q12, #8 -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subges, PF_CTL, PF_CTL, #0x10 - vsri.u16 q14, q9, #11 -- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
-+ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vraddhn.u16 d20, q10, q13 - vraddhn.u16 d23, q11, q3 -- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vraddhn.u16 d22, q12, q15 - vst1.16 {d28, d29}, [DST_W, :128]! - .endm - - #else - - /* If we did not care much about the performance, we would just use this... */ - .macro pixman_composite_over_8888_0565_process_pixblock_tail_head -@@ -429,30 +435,30 @@ generate_composite_function \ - - .macro pixman_composite_src_8888_0565_process_pixblock_tail - vsri.u16 q14, q8, #5 - vsri.u16 q14, q9, #11 - .endm - - .macro pixman_composite_src_8888_0565_process_pixblock_tail_head - vsri.u16 q14, q8, #5 -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF - fetch_src_pixblock -- PF addne PF_X, PF_X, #8 -- PF subne PF_CTL, PF_CTL, #1 -+ PF addne, PF_X, PF_X, #8 -+ PF subne, PF_CTL, PF_CTL, #1 - vsri.u16 q14, q9, #11 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vshll.u8 q8, d1, #8 - vst1.16 {d28, d29}, [DST_W, :128]! -- PF subge PF_X, PF_X, ORIG_W -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subge, PF_X, PF_X, ORIG_W -+ PF subges, PF_CTL, PF_CTL, #0x10 - vshll.u8 q14, d2, #8 -- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vshll.u8 q9, d0, #8 - .endm - - generate_composite_function \ - pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ -@@ -504,30 +510,30 @@ generate_composite_function \ - vqadd.u8 q15, q1, q3 - .endm - - .macro pixman_composite_add_8_8_process_pixblock_tail - .endm - - .macro pixman_composite_add_8_8_process_pixblock_tail_head - fetch_src_pixblock -- PF add PF_X, PF_X, #32 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #32 -+ PF tst, PF_CTL, #0xF - vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! -- PF addne PF_X, PF_X, #32 -- PF subne PF_CTL, PF_CTL, #1 -+ PF addne, PF_X, PF_X, #32 -+ PF subne, PF_CTL, PF_CTL, #1 - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] -- PF subge PF_X, PF_X, ORIG_W -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subge, PF_X, PF_X, ORIG_W -+ PF subges, PF_CTL, PF_CTL, #0x10 - vqadd.u8 q14, q0, q2 -- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q15, q1, q3 - .endm - - generate_composite_function \ - pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ - FLAG_DST_READWRITE, \ - 32, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ -@@ -536,30 +542,30 @@ generate_composite_function \ - pixman_composite_add_8_8_process_pixblock_head, \ - pixman_composite_add_8_8_process_pixblock_tail, \ - pixman_composite_add_8_8_process_pixblock_tail_head - - /******************************************************************************/ - - .macro pixman_composite_add_8888_8888_process_pixblock_tail_head - fetch_src_pixblock -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF - vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! 
-- PF addne PF_X, PF_X, #8 -- PF subne PF_CTL, PF_CTL, #1 -+ PF addne, PF_X, PF_X, #8 -+ PF subne, PF_CTL, PF_CTL, #1 - vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] -- PF subge PF_X, PF_X, ORIG_W -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subge, PF_X, PF_X, ORIG_W -+ PF subges, PF_CTL, PF_CTL, #0x10 - vqadd.u8 q14, q0, q2 -- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q15, q1, q3 - .endm - - generate_composite_function \ - pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ -@@ -599,40 +605,40 @@ generate_composite_function_single_scanl - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - .endm - - .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vrshr.u16 q14, q8, #8 -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 -- PF addne PF_X, PF_X, #8 -- PF subne PF_CTL, PF_CTL, #1 -+ PF addne, PF_X, PF_X, #8 -+ PF subne, PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - fetch_src_pixblock - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -- PF subge PF_X, PF_X, ORIG_W -+ PF subge, PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subsge, PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 -- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vmull.u8 q10, d22, d6 -- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vmull.u8 q11, d22, d7 - .endm - - generate_composite_function_single_scanline \ - pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - default_init, \ -@@ -651,42 +657,42 @@ generate_composite_function_single_scanl - pixman_composite_out_reverse_8888_8888_process_pixblock_tail - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - .endm - - .macro pixman_composite_over_8888_8888_process_pixblock_tail_head - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
- vrshr.u16 q14, q8, #8 -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 -- PF addne PF_X, PF_X, #8 -- PF subne PF_CTL, PF_CTL, #1 -+ PF addne, PF_X, PF_X, #8 -+ PF subne, PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - fetch_src_pixblock - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -- PF subge PF_X, PF_X, ORIG_W -+ PF subge, PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subges, PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 -- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vmull.u8 q10, d22, d6 -- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vmull.u8 q11, d22, d7 - .endm - - generate_composite_function \ - pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ -@@ -737,30 +743,30 @@ generate_composite_function_single_scanl - vrshr.u16 q2, q10, #8 - vrshr.u16 q3, q11, #8 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 - vraddhn.u16 d30, q2, q10 - vraddhn.u16 d31, q3, q11 - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! - vqadd.u8 q14, q0, q14 -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0x0F -- PF addne PF_X, PF_X, #8 -- PF subne PF_CTL, PF_CTL, #1 -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0x0F -+ PF addne, PF_X, PF_X, #8 -+ PF subne, PF_CTL, PF_CTL, #1 - vqadd.u8 q15, q1, q15 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - vmull.u8 q8, d24, d4 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vmull.u8 q9, d24, d5 -- PF subge PF_X, PF_X, ORIG_W -+ PF subge, PF_X, PF_X, ORIG_W - vmull.u8 q10, d24, d6 -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subges, PF_CTL, PF_CTL, #0x10 - vmull.u8 q11, d24, d7 -- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - .endm - - .macro pixman_composite_over_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d3[0]}, [DUMMY] - vdup.8 d0, d3[0] - vdup.8 d1, d3[1] -@@ -779,40 +785,40 @@ generate_composite_function \ - pixman_composite_over_8888_8888_process_pixblock_head, \ - pixman_composite_over_8888_8888_process_pixblock_tail, \ - pixman_composite_over_n_8888_process_pixblock_tail_head - - /******************************************************************************/ - - .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF - vrshr.u16 q15, q9, #8 - vrshr.u16 q12, q10, #8 - vrshr.u16 q13, q11, #8 -- PF addne PF_X, PF_X, #8 -- PF subne PF_CTL, PF_CTL, #1 -+ PF addne, PF_X, PF_X, #8 -+ PF subne, PF_CTL, PF_CTL, #1 - vraddhn.u16 d28, q14, q8 - vraddhn.u16 d29, q15, q9 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - vraddhn.u16 d30, q12, q10 - vraddhn.u16 d31, q13, q11 - vqadd.u8 q14, q0, q14 - vqadd.u8 q15, q1, q15 - vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! 
- vmvn.8 d22, d3 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -- PF subge PF_X, PF_X, ORIG_W -+ PF subge, PF_X, PF_X, ORIG_W - vmull.u8 q8, d22, d4 -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subges, PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d22, d5 - vmull.u8 q10, d22, d6 -- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vmull.u8 q11, d22, d7 - .endm - - .macro pixman_composite_over_reverse_n_8888_init - add DUMMY, sp, #ARGS_STACK_OFFSET - vld1.32 {d7[0]}, [DUMMY] - vdup.8 d4, d7[0] - vdup.8 d5, d7[1] -@@ -1240,33 +1246,33 @@ generate_composite_function \ - vrshrn.u16 d28, q8, #8 - vrshrn.u16 d29, q9, #8 - vrshrn.u16 d30, q10, #8 - vrshrn.u16 d31, q11, #8 - .endm - - .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head - fetch_mask_pixblock -- PF add PF_X, PF_X, #8 -+ PF add, PF_X, PF_X, #8 - vrshrn.u16 d28, q8, #8 -- PF tst PF_CTL, #0x0F -+ PF tst, PF_CTL, #0x0F - vrshrn.u16 d29, q9, #8 -- PF addne PF_X, PF_X, #8 -+ PF addne, PF_X, PF_X, #8 - vrshrn.u16 d30, q10, #8 -- PF subne PF_CTL, PF_CTL, #1 -+ PF subne, PF_CTL, PF_CTL, #1 - vrshrn.u16 d31, q11, #8 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - vmull.u8 q8, d24, d0 - PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] - vmull.u8 q9, d24, d1 -- PF subge PF_X, PF_X, ORIG_W -+ PF subge, PF_X, PF_X, ORIG_W - vmull.u8 q10, d24, d2 -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subges, PF_CTL, PF_CTL, #0x10 - vmull.u8 q11, d24, d3 -- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - vrsra.u16 q8, q8, #8 - vrsra.u16 q9, q9, #8 - vrsra.u16 q10, q10, #8 - vrsra.u16 q11, q11, #8 - .endm - - .macro pixman_composite_src_n_8_8888_init -@@ -1309,33 +1315,33 @@ generate_composite_function \ - vrshrn.u16 d28, q0, #8 - vrshrn.u16 d29, q1, #8 - vrshrn.u16 d30, q2, #8 - vrshrn.u16 d31, q3, #8 - .endm - - .macro pixman_composite_src_n_8_8_process_pixblock_tail_head - fetch_mask_pixblock -- PF add PF_X, PF_X, #8 -+ PF add, PF_X, PF_X, #8 - vrshrn.u16 d28, q0, #8 -- PF tst PF_CTL, #0x0F -+ PF tst, PF_CTL, #0x0F - vrshrn.u16 d29, q1, #8 -- PF addne PF_X, PF_X, #8 -+ PF addne, PF_X, PF_X, #8 - vrshrn.u16 d30, q2, #8 -- PF subne PF_CTL, PF_CTL, #1 -+ PF subne, PF_CTL, PF_CTL, #1 - vrshrn.u16 d31, q3, #8 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - vmull.u8 q0, d24, d16 - PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] - vmull.u8 q1, d25, d16 -- PF subge PF_X, PF_X, ORIG_W -+ PF subge, PF_X, PF_X, ORIG_W - vmull.u8 q2, d26, d16 -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subges, PF_CTL, PF_CTL, #0x10 - vmull.u8 q3, d27, d16 -- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! - vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - vrsra.u16 q0, q0, #8 - vrsra.u16 q1, q1, #8 - vrsra.u16 q2, q2, #8 - vrsra.u16 q3, q3, #8 - .endm - - .macro pixman_composite_src_n_8_8_init -@@ -1403,37 +1409,37 @@ generate_composite_function \ - .endm - - .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head - vrshr.u16 q14, q8, #8 - vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
- vrshr.u16 q15, q9, #8 - fetch_mask_pixblock - vrshr.u16 q6, q10, #8 -- PF add PF_X, PF_X, #8 -+ PF add, PF_X, PF_X, #8 - vrshr.u16 q7, q11, #8 -- PF tst PF_CTL, #0x0F -+ PF tst, PF_CTL, #0x0F - vraddhn.u16 d28, q14, q8 -- PF addne PF_X, PF_X, #8 -+ PF addne, PF_X, PF_X, #8 - vraddhn.u16 d29, q15, q9 -- PF subne PF_CTL, PF_CTL, #1 -+ PF subne, PF_CTL, PF_CTL, #1 - vraddhn.u16 d30, q6, q10 -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - vraddhn.u16 d31, q7, q11 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vmull.u8 q6, d24, d8 - PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] - vmull.u8 q7, d24, d9 -- PF subge PF_X, PF_X, ORIG_W -+ PF subge, PF_X, PF_X, ORIG_W - vmull.u8 q8, d24, d10 -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subges, PF_CTL, PF_CTL, #0x10 - vmull.u8 q9, d24, d11 -- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vqadd.u8 q14, q0, q14 -- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! - vqadd.u8 q15, q1, q15 - vrshr.u16 q10, q6, #8 - vrshr.u16 q11, q7, #8 - vrshr.u16 q12, q8, #8 - vrshr.u16 q13, q9, #8 - vraddhn.u16 d0, q6, q10 - vraddhn.u16 d1, q7, q11 - vraddhn.u16 d2, q8, q12 -@@ -2420,31 +2426,31 @@ generate_composite_function \ - - .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - fetch_src_pixblock - vraddhn.u16 d30, q11, q8 -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -- PF addne PF_X, PF_X, #8 -- PF subne PF_CTL, PF_CTL, #1 -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF -+ PF addne, PF_X, PF_X, #8 -+ PF subne, PF_CTL, PF_CTL, #1 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d28, q13, q10 - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] -- PF subge PF_X, PF_X, ORIG_W -- PF subges PF_CTL, PF_CTL, #0x10 -- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF subge, PF_X, PF_X, ORIG_W -+ PF subges, PF_CTL, PF_CTL, #0x10 -+ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - .endm - - generate_composite_function \ - pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ -@@ -2477,31 +2483,31 @@ generate_composite_function \ - - .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head - vrshr.u16 q11, q8, #8 - vswp d3, d31 - vrshr.u16 q12, q9, #8 - vrshr.u16 q13, q10, #8 - fetch_src_pixblock - vraddhn.u16 d28, q11, q8 -- PF add PF_X, PF_X, #8 -- PF tst PF_CTL, #0xF -- PF addne PF_X, PF_X, #8 -- PF subne PF_CTL, PF_CTL, #1 -+ PF add, PF_X, PF_X, #8 -+ PF tst, PF_CTL, #0xF -+ PF addne, PF_X, PF_X, #8 -+ PF subne, PF_CTL, PF_CTL, #1 - vraddhn.u16 d29, q12, q9 - vraddhn.u16 d30, q13, q10 - vmull.u8 q8, d3, d0 - vmull.u8 q9, d3, d1 - vmull.u8 q10, d3, d2 - vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! -- PF cmp PF_X, ORIG_W -+ PF cmp, PF_X, ORIG_W - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] -- PF subge PF_X, PF_X, ORIG_W -- PF subges PF_CTL, PF_CTL, #0x10 -- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF subge, PF_X, PF_X, ORIG_W -+ PF subges, PF_CTL, PF_CTL, #0x10 -+ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
- .endm - - generate_composite_function \ - pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 10, /* prefetch distance */ \ - default_init, \ -@@ -2836,182 +2842,182 @@ generate_composite_function_nearest_scan - * format conversion, and interpolation as separate macros which can be used - * as the basic building blocks for constructing bilinear scanline functions. - */ - - .macro bilinear_load_8888 reg1, reg2, tmp - mov TMP1, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #2 -- vld1.32 {reg1}, [TMP1], STRIDE -- vld1.32 {reg2}, [TMP1] -+ vld1.32 {\reg1}, [TMP1], STRIDE -+ vld1.32 {\reg2}, [TMP1] - .endm - - .macro bilinear_load_0565 reg1, reg2, tmp - mov TMP1, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #1 -- vld1.32 {reg2[0]}, [TMP1], STRIDE -- vld1.32 {reg2[1]}, [TMP1] -- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp -+ vld1.32 {\reg2[0]}, [TMP1], STRIDE -+ vld1.32 {\reg2[1]}, [TMP1] -+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp - .endm - - .macro bilinear_load_and_vertical_interpolate_two_8888 \ - acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 - -- bilinear_load_8888 reg1, reg2, tmp1 -- vmull.u8 acc1, reg1, d28 -- vmlal.u8 acc1, reg2, d29 -- bilinear_load_8888 reg3, reg4, tmp2 -- vmull.u8 acc2, reg3, d28 -- vmlal.u8 acc2, reg4, d29 -+ bilinear_load_8888 \reg1, \reg2, \tmp1 -+ vmull.u8 \acc1, \reg1, d28 -+ vmlal.u8 \acc1, \reg2, d29 -+ bilinear_load_8888 \reg3, \reg4, \tmp2 -+ vmull.u8 \acc2, \reg3, d28 -+ vmlal.u8 \acc2, \reg4, d29 - .endm - - .macro bilinear_load_and_vertical_interpolate_four_8888 \ - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi - - bilinear_load_and_vertical_interpolate_two_8888 \ -- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi -+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi - bilinear_load_and_vertical_interpolate_two_8888 \ -- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi -+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi - .endm - - .macro bilinear_load_and_vertical_interpolate_two_0565 \ - acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi - - mov TMP1, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #1 - mov TMP2, X, asr #16 - add X, X, UX - add TMP2, TOP, TMP2, asl #1 -- vld1.32 {acc2lo[0]}, [TMP1], STRIDE -- vld1.32 {acc2hi[0]}, [TMP2], STRIDE -- vld1.32 {acc2lo[1]}, [TMP1] -- vld1.32 {acc2hi[1]}, [TMP2] -- convert_0565_to_x888 acc2, reg3, reg2, reg1 -- vzip.u8 reg1, reg3 -- vzip.u8 reg2, reg4 -- vzip.u8 reg3, reg4 -- vzip.u8 reg1, reg2 -- vmull.u8 acc1, reg1, d28 -- vmlal.u8 acc1, reg2, d29 -- vmull.u8 acc2, reg3, d28 -- vmlal.u8 acc2, reg4, d29 -+ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE -+ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE -+ vld1.32 {\acc2lo[1]}, [TMP1] -+ vld1.32 {\acc2hi[1]}, [TMP2] -+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 -+ vzip.u8 \reg1, \reg3 -+ vzip.u8 \reg2, \reg4 -+ vzip.u8 \reg3, \reg4 -+ vzip.u8 \reg1, \reg2 -+ vmull.u8 \acc1, \reg1, d28 -+ vmlal.u8 \acc1, \reg2, d29 -+ vmull.u8 \acc2, \reg3, d28 -+ vmlal.u8 \acc2, \reg4, d29 - .endm - - .macro bilinear_load_and_vertical_interpolate_four_0565 \ - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi - - mov TMP1, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #1 - mov TMP2, X, asr #16 - add X, X, UX - add TMP2, 
TOP, TMP2, asl #1 -- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE -- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE -- vld1.32 {xacc2lo[1]}, [TMP1] -- vld1.32 {xacc2hi[1]}, [TMP2] -- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 -+ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE -+ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE -+ vld1.32 {\xacc2lo[1]}, [TMP1] -+ vld1.32 {\xacc2hi[1]}, [TMP2] -+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 - mov TMP1, X, asr #16 - add X, X, UX - add TMP1, TOP, TMP1, asl #1 - mov TMP2, X, asr #16 - add X, X, UX - add TMP2, TOP, TMP2, asl #1 -- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE -- vzip.u8 xreg1, xreg3 -- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE -- vzip.u8 xreg2, xreg4 -- vld1.32 {yacc2lo[1]}, [TMP1] -- vzip.u8 xreg3, xreg4 -- vld1.32 {yacc2hi[1]}, [TMP2] -- vzip.u8 xreg1, xreg2 -- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 -- vmull.u8 xacc1, xreg1, d28 -- vzip.u8 yreg1, yreg3 -- vmlal.u8 xacc1, xreg2, d29 -- vzip.u8 yreg2, yreg4 -- vmull.u8 xacc2, xreg3, d28 -- vzip.u8 yreg3, yreg4 -- vmlal.u8 xacc2, xreg4, d29 -- vzip.u8 yreg1, yreg2 -- vmull.u8 yacc1, yreg1, d28 -- vmlal.u8 yacc1, yreg2, d29 -- vmull.u8 yacc2, yreg3, d28 -- vmlal.u8 yacc2, yreg4, d29 -+ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE -+ vzip.u8 \xreg1, \xreg3 -+ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE -+ vzip.u8 \xreg2, \xreg4 -+ vld1.32 {\yacc2lo[1]}, [TMP1] -+ vzip.u8 \xreg3, \xreg4 -+ vld1.32 {\yacc2hi[1]}, [TMP2] -+ vzip.u8 \xreg1, \xreg2 -+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 -+ vmull.u8 \xacc1, \xreg1, d28 -+ vzip.u8 \yreg1, \yreg3 -+ vmlal.u8 \xacc1, \xreg2, d29 -+ vzip.u8 \yreg2, \yreg4 -+ vmull.u8 \xacc2, \xreg3, d28 -+ vzip.u8 \yreg3, \yreg4 -+ vmlal.u8 \xacc2, \xreg4, d29 -+ vzip.u8 \yreg1, \yreg2 -+ vmull.u8 \yacc1, \yreg1, d28 -+ vmlal.u8 \yacc1, \yreg2, d29 -+ vmull.u8 \yacc2, \yreg3, d28 -+ vmlal.u8 \yacc2, \yreg4, d29 - .endm - - .macro bilinear_store_8888 numpix, tmp1, tmp2 --.if numpix == 4 -+.if \numpix == 4 - vst1.32 {d0, d1}, [OUT, :128]! --.elseif numpix == 2 -+.elseif \numpix == 2 - vst1.32 {d0}, [OUT, :64]! --.elseif numpix == 1 -+.elseif \numpix == 1 - vst1.32 {d0[0]}, [OUT, :32]! - .else -- .error bilinear_store_8888 numpix is unsupported -+ .error bilinear_store_8888 \numpix is unsupported - .endif - .endm - - .macro bilinear_store_0565 numpix, tmp1, tmp2 - vuzp.u8 d0, d1 - vuzp.u8 d2, d3 - vuzp.u8 d1, d3 - vuzp.u8 d0, d2 -- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 --.if numpix == 4 -+ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2 -+.if \numpix == 4 - vst1.16 {d2}, [OUT, :64]! --.elseif numpix == 2 -+.elseif \numpix == 2 - vst1.32 {d2[0]}, [OUT, :32]! --.elseif numpix == 1 -+.elseif \numpix == 1 - vst1.16 {d2[0]}, [OUT, :16]! 
- .else -- .error bilinear_store_0565 numpix is unsupported -+ .error bilinear_store_0565 \numpix is unsupported - .endif - .endm - - .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt -- bilinear_load_&src_fmt d0, d1, d2 -+ bilinear_load_\()\src_fmt d0, d1, d2 - vmull.u8 q1, d0, d28 - vmlal.u8 q1, d1, d29 - /* 5 cycles bubble */ - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - /* 5 cycles bubble */ - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - /* 3 cycles bubble */ - vmovn.u16 d0, q0 - /* 1 cycle bubble */ -- bilinear_store_&dst_fmt 1, q2, q3 -+ bilinear_store_\()\dst_fmt 1, q2, q3 - .endm - - .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt -- bilinear_load_and_vertical_interpolate_two_&src_fmt \ -+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ - q1, q11, d0, d1, d20, d21, d22, d23 - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q10, d22, d31 - vmlal.u16 q10, d23, d31 - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - vmovn.u16 d0, q0 -- bilinear_store_&dst_fmt 2, q2, q3 -+ bilinear_store_\()\dst_fmt 2, q2, q3 - .endm - - .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt -- bilinear_load_and_vertical_interpolate_four_&src_fmt \ -+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ - q1, q11, d0, d1, d20, d21, d22, d23 \ - q3, q9, d4, d5, d16, d17, d18, d19 - pld [TMP1, PF_OFFS] - sub TMP1, TMP1, STRIDE - vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS - vmlsl.u16 q0, d2, d30 - vmlal.u16 q0, d3, d30 - vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS -@@ -3029,64 +3035,64 @@ generate_composite_function_nearest_scan - vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) - vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vmovn.u16 d0, q0 - vmovn.u16 d1, q2 - vadd.u16 q12, q12, q13 -- bilinear_store_&dst_fmt 4, q2, q3 -+ bilinear_store_\()\dst_fmt 4, q2, q3 - .endm - - .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt -- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head -+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head - .else -- bilinear_interpolate_four_pixels src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt - .endif - .endm - - .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt -- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail -+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail - .endif - .endm - - .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt -- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head -+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head - .else -- bilinear_interpolate_four_pixels src_fmt, 
dst_fmt -+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt - .endif - .endm - - .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt -- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head -+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head - .else -- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt -- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt -+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt - .endif - .endm - - .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt -- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail -+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail - .else -- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt - .endif - .endm - - .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt --.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt -- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head -+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt -+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head - .else -- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt -- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt -+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt - .endif - .endm - - .set BILINEAR_FLAG_UNROLL_4, 0 - .set BILINEAR_FLAG_UNROLL_8, 1 - .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 - - /* -@@ -3101,17 +3107,17 @@ generate_composite_function_nearest_scan - * prefetch_distance - prefetch in the source image by that many - * pixels ahead - */ - - .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ - src_bpp_shift, dst_bpp_shift, \ - prefetch_distance, flags - --pixman_asm_function fname -+pixman_asm_function \fname - OUT .req r0 - TOP .req r1 - BOTTOM .req r2 - WT .req r3 - WB .req r4 - X .req r5 - UX .req r6 - WIDTH .req ip -@@ -3119,21 +3125,21 @@ pixman_asm_function fname - TMP2 .req r4 - PF_OFFS .req r7 - TMP3 .req r8 - TMP4 .req r9 - STRIDE .req r2 - - mov ip, sp - push {r4, r5, r6, r7, r8, r9} -- mov PF_OFFS, #prefetch_distance -+ mov PF_OFFS, #\prefetch_distance - ldmia ip, {WB, X, UX, WIDTH} - mul PF_OFFS, PF_OFFS, UX - --.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 -+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 - vpush {d8-d15} - .endif - - sub STRIDE, BOTTOM, TOP - .unreq BOTTOM - - cmp WIDTH, #0 - ble 3f -@@ -3146,83 +3152,83 @@ pixman_asm_function fname - - /* ensure good destination alignment */ - cmp WIDTH, #1 - blt 0f - tst OUT, #(1 << dst_bpp_shift) - beq 0f - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 -- bilinear_interpolate_last_pixel src_fmt, dst_fmt -+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt - sub WIDTH, WIDTH, #1 - 0: - vadd.u16 q13, q13, q13 - vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) - vadd.u16 q12, q12, q13 - - cmp WIDTH, #2 - blt 0f - tst OUT, #(1 << (dst_bpp_shift + 1)) - beq 0f -- bilinear_interpolate_two_pixels src_fmt, dst_fmt -+ 
bilinear_interpolate_two_pixels \src_fmt, \dst_fmt - sub WIDTH, WIDTH, #2 - 0: --.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 -+.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0 - /*********** 8 pixels per iteration *****************/ - cmp WIDTH, #4 - blt 0f - tst OUT, #(1 << (dst_bpp_shift + 2)) - beq 0f -- bilinear_interpolate_four_pixels src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt - sub WIDTH, WIDTH, #4 - 0: - subs WIDTH, WIDTH, #8 - blt 1f - mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) -- bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt -+ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt - subs WIDTH, WIDTH, #8 - blt 5f - 0: -- bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt -+ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt - subs WIDTH, WIDTH, #8 - bge 0b - 5: -- bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt -+ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt - 1: - tst WIDTH, #4 - beq 2f -- bilinear_interpolate_four_pixels src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt - 2: - .else - /*********** 4 pixels per iteration *****************/ - subs WIDTH, WIDTH, #4 - blt 1f - mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) -- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt - subs WIDTH, WIDTH, #4 - blt 5f - 0: -- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt - subs WIDTH, WIDTH, #4 - bge 0b - 5: -- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt -+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt - 1: - /****************************************************/ - .endif - /* handle the remaining trailing pixels */ - tst WIDTH, #2 - beq 2f -- bilinear_interpolate_two_pixels src_fmt, dst_fmt -+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt - 2: - tst WIDTH, #1 - beq 3f -- bilinear_interpolate_last_pixel src_fmt, dst_fmt -+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt - 3: --.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 -+.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 - vpop {d8-d15} - .endif - pop {r4, r5, r6, r7, r8, r9} - bx lr - - .unreq OUT - .unreq TOP - .unreq WT -@@ -3231,17 +3237,17 @@ 3: - .unreq UX - .unreq WIDTH - .unreq TMP1 - .unreq TMP2 - .unreq PF_OFFS - .unreq TMP3 - .unreq TMP4 - .unreq STRIDE --.endfunc -+pixman_end_asm_function - - .endm - - /*****************************************************************************/ - - .set have_bilinear_interpolate_four_pixels_8888_8888, 1 - - .macro bilinear_interpolate_four_pixels_8888_8888_head -diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h ---- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h -+++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h -@@ -69,303 +69,303 @@ - .set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ - - /* - * Definitions of supplementary pixld/pixst macros (for partial load/store of - * pixel data). - */ - - .macro pixldst1 op, elem_size, reg1, mem_operand, abits --.if abits > 0 -- op&.&elem_size {d®1}, [&mem_operand&, :&abits&]! -+.if \abits > 0 -+ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]! - .else -- op&.&elem_size {d®1}, [&mem_operand&]! -+ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]! 
- .endif
- .endm
- 
- .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
--.if abits > 0
-- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
-+.if \abits > 0
-+ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]!
- .else
-- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
-+ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]!
- .endif
- .endm
- 
- .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
--.if abits > 0
-- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
-+.if \abits > 0
-+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]!
- .else
-- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
-+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]!
- .endif
- .endm
- 
- .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
-- op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
-+ \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]!
- .endm
- 
- .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
-- op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
-+ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]!
- .endm
- 
- .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
-- op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
-+ \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]!
- .endm
- 
- .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
--.if numbytes == 32
-- pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
-- %(basereg+6), %(basereg+7), mem_operand, abits
--.elseif numbytes == 16
-- pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
--.elseif numbytes == 8
-- pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
--.elseif numbytes == 4
-- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
-- pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
-- .elseif elem_size == 16
-- pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
-- pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
-+.if \numbytes == 32
-+ pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \
-+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
-+.elseif \numbytes == 16
-+ pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
-+.elseif \numbytes == 8
-+ pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits
-+.elseif \numbytes == 4
-+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
-+ pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits
-+ .elseif \elem_size == 16
-+ pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits
-+ pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits
- .else
-- pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
-- pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
-- pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
-- pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
-+ pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits
-+ pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits
-+ pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits
-+ pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits
- .endif
--.elseif numbytes == 2
-- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
-- pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
-+.elseif \numbytes == 2
-+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
-+ pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits
- .else
-- pixldst0 op, 8, 
%(basereg+0), 2, mem_operand, abits -- pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits -+ pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits -+ pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits - .endif --.elseif numbytes == 1 -- pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits -+.elseif \numbytes == 1 -+ pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits - .else -- .error "unsupported size: numbytes" -+ .error "unsupported size: \numbytes" - .endif - .endm - - .macro pixld numpix, bpp, basereg, mem_operand, abits=0 --.if bpp > 0 --.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) -- pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ -- %(basereg+6), %(basereg+7), mem_operand, abits --.elseif (bpp == 24) && (numpix == 8) -- pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand --.elseif (bpp == 24) && (numpix == 4) -- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand -- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand -- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand -- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand --.elseif (bpp == 24) && (numpix == 2) -- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand -- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand --.elseif (bpp == 24) && (numpix == 1) -- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand -+.if \bpp > 0 -+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) -+ pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \ -+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits -+.elseif (\bpp == 24) && (\numpix == 8) -+ pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand -+.elseif (\bpp == 24) && (\numpix == 4) -+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand -+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand -+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand -+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand -+.elseif (\bpp == 24) && (\numpix == 2) -+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand -+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand -+.elseif (\bpp == 24) && (\numpix == 1) -+ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand - .else -- pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits -+ pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits - .endif - .endif - .endm - - .macro pixst numpix, bpp, basereg, mem_operand, abits=0 --.if bpp > 0 --.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) -- pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ -- %(basereg+6), %(basereg+7), mem_operand, abits --.elseif (bpp == 24) && (numpix == 8) -- pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand --.elseif (bpp == 24) && (numpix == 4) -- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand -- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand -- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand -- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand --.elseif (bpp == 24) && (numpix == 2) -- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, 
mem_operand
-- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
--.elseif (bpp == 24) && (numpix == 1)
-- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
-+.if \bpp > 0
-+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-+ pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \
-+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
-+.elseif (\bpp == 24) && (\numpix == 8)
-+ pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
-+.elseif (\bpp == 24) && (\numpix == 4)
-+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
-+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
-+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
-+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
-+.elseif (\bpp == 24) && (\numpix == 2)
-+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
-+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
-+.elseif (\bpp == 24) && (\numpix == 1)
-+ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
- .else
-- pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
-+ pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits
- .endif
- .endif
- .endm
- 
- .macro pixld_a numpix, bpp, basereg, mem_operand
--.if (bpp * numpix) <= 128
-- pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
-+.if (\bpp * \numpix) <= 128
-+ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
- .else
-- pixld numpix, bpp, basereg, mem_operand, 128
-+ pixld \numpix, \bpp, \basereg, \mem_operand, 128
- .endif
- .endm
- 
- .macro pixst_a numpix, bpp, basereg, mem_operand
--.if (bpp * numpix) <= 128
-- pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
-+.if (\bpp * \numpix) <= 128
-+ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
- .else
-- pixst numpix, bpp, basereg, mem_operand, 128
-+ pixst \numpix, \bpp, \basereg, \mem_operand, 128
- .endif
- .endm
- 
- /*
- * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
- * aliases to be defined)
- */
- .macro pixld1_s elem_size, reg1, mem_operand
--.if elem_size == 16
-+.if \elem_size == 16
- mov TMP1, VX, asr #16
- adds VX, VX, UNIT_X
- 5: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 5b
-- add TMP1, mem_operand, TMP1, asl #1
-+ add TMP1, \mem_operand, TMP1, asl #1
- mov TMP2, VX, asr #16
- adds VX, VX, UNIT_X
- 5: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 5b
-- add TMP2, mem_operand, TMP2, asl #1
-- vld1.16 {d&reg1&[0]}, [TMP1, :16]
-+ add TMP2, \mem_operand, TMP2, asl #1
-+ vld1.16 {d\()\reg1\()[0]}, [TMP1, :16]
- mov TMP1, VX, asr #16
- adds VX, VX, UNIT_X
- 5: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 5b
-- add TMP1, mem_operand, TMP1, asl #1
-- vld1.16 {d&reg1&[1]}, [TMP2, :16]
-+ add TMP1, \mem_operand, TMP1, asl #1
-+ vld1.16 {d\()\reg1\()[1]}, [TMP2, :16]
- mov TMP2, VX, asr #16
- adds VX, VX, UNIT_X
- 5: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 5b
-- add TMP2, mem_operand, TMP2, asl #1
-- vld1.16 {d&reg1&[2]}, [TMP1, :16]
-- vld1.16 {d&reg1&[3]}, [TMP2, :16]
--.elseif elem_size == 32
-+ add TMP2, \mem_operand, TMP2, asl #1
-+ vld1.16 {d\()\reg1\()[2]}, [TMP1, :16]
-+ vld1.16 {d\()\reg1\()[3]}, [TMP2, :16]
-+.elseif \elem_size == 32
- mov TMP1, VX, asr #16
- adds VX, VX, UNIT_X
- 5: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 5b
-- add TMP1, mem_operand, TMP1, asl #2
-+ add TMP1, \mem_operand, TMP1, asl #2
- mov TMP2, VX, asr #16
- adds VX, VX, UNIT_X
- 5: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 5b
-- add TMP2, mem_operand, TMP2, asl #2
-- vld1.32 {d&reg1&[0]}, [TMP1, :32]
-- vld1.32 {d&reg1&[1]}, [TMP2, :32]
-+ add TMP2, \mem_operand, TMP2, asl #2
-+ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
-+ vld1.32 {d\()\reg1\()[1]}, [TMP2, :32]
- .else
- .error "unsupported"
- .endif
- .endm
- 
- .macro pixld2_s elem_size, reg1, reg2, mem_operand
- .if 0 /* elem_size == 32 */
- mov TMP1, VX, asr #16
- add VX, VX, UNIT_X, asl #1
-- add TMP1, mem_operand, TMP1, asl #2
-+ add TMP1, \mem_operand, TMP1, asl #2
- mov TMP2, VX, asr #16
- sub VX, VX, UNIT_X
-- add TMP2, mem_operand, TMP2, asl #2
-- vld1.32 {d&reg1&[0]}, [TMP1, :32]
-+ add TMP2, \mem_operand, TMP2, asl #2
-+ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
- mov TMP1, VX, asr #16
- add VX, VX, UNIT_X, asl #1
-- add TMP1, mem_operand, TMP1, asl #2
-- vld1.32 {d&reg2&[0]}, [TMP2, :32]
-+ add TMP1, \mem_operand, TMP1, asl #2
-+ vld1.32 {d\()\reg2\()[0]}, [TMP2, :32]
- mov TMP2, VX, asr #16
- add VX, VX, UNIT_X
-- add TMP2, mem_operand, TMP2, asl #2
-- vld1.32 {d&reg1&[1]}, [TMP1, :32]
-- vld1.32 {d&reg2&[1]}, [TMP2, :32]
-+ add TMP2, \mem_operand, TMP2, asl #2
-+ vld1.32 {d\()\reg1\()[1]}, [TMP1, :32]
-+ vld1.32 {d\()\reg2\()[1]}, [TMP2, :32]
- .else
-- pixld1_s elem_size, reg1, mem_operand
-- pixld1_s elem_size, reg2, mem_operand
-+ pixld1_s \elem_size, \reg1, \mem_operand
-+ pixld1_s \elem_size, \reg2, \mem_operand
- .endif
- .endm
- 
- .macro pixld0_s elem_size, reg1, idx, mem_operand
--.if elem_size == 16
-+.if \elem_size == 16
- mov TMP1, VX, asr #16
- adds VX, VX, UNIT_X
- 5: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 5b
-- add TMP1, mem_operand, TMP1, asl #1
-- vld1.16 {d&reg1&[idx]}, [TMP1, :16]
--.elseif elem_size == 32
-+ add TMP1, \mem_operand, TMP1, asl #1
-+ vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16]
-+.elseif \elem_size == 32
- mov TMP1, VX, asr #16
- adds VX, VX, UNIT_X
- 5: subpls VX, VX, SRC_WIDTH_FIXED
- bpl 5b
-- add TMP1, mem_operand, TMP1, asl #2
-- vld1.32 {d&reg1&[idx]}, [TMP1, :32]
-+ add TMP1, \mem_operand, TMP1, asl #2
-+ vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32]
- .endif
- .endm
- 
- .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
--.if numbytes == 32
-- pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
-- pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
-- pixdeinterleave elem_size, %(basereg+4)
--.elseif numbytes == 16
-- pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
--.elseif numbytes == 8
-- pixld1_s elem_size, %(basereg+1), mem_operand
--.elseif numbytes == 4
-- .if elem_size == 32
-- pixld0_s elem_size, %(basereg+0), 1, mem_operand
-- .elseif elem_size == 16
-- pixld0_s elem_size, %(basereg+0), 2, mem_operand
-- pixld0_s elem_size, %(basereg+0), 3, mem_operand
-+.if \numbytes == 32
-+ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
-+ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
-+ pixdeinterleave \elem_size, %(\basereg+4)
-+.elseif \numbytes == 16
-+ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
-+.elseif \numbytes == 8
-+ pixld1_s \elem_size, %(\basereg+1), \mem_operand
-+.elseif \numbytes == 4
-+ .if \elem_size == 32
-+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
-+ .elseif \elem_size == 16
-+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
- .else
-- pixld0_s elem_size, %(basereg+0), 4, mem_operand
-- pixld0_s elem_size, %(basereg+0), 5, mem_operand
-- pixld0_s elem_size, %(basereg+0), 6, mem_operand
-- pixld0_s elem_size, %(basereg+0), 7, mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
- .endif
--.elseif numbytes == 2
-- .if elem_size == 16
-- pixld0_s elem_size, %(basereg+0), 1, mem_operand
-+.elseif \numbytes == 2
-+ .if \elem_size == 16
-+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
- .else
-- pixld0_s elem_size, %(basereg+0), 2, mem_operand
-- pixld0_s elem_size, %(basereg+0), 3, mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
- .endif
--.elseif numbytes == 1
-- pixld0_s elem_size, %(basereg+0), 1, mem_operand
-+.elseif \numbytes == 1
-+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
- .else
-- .error "unsupported size: numbytes"
-+ .error "unsupported size: \numbytes"
- .endif
- .endm
- 
- .macro pixld_s numpix, bpp, basereg, mem_operand
--.if bpp > 0
-- pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
-+.if \bpp > 0
-+ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
- .endif
- .endm
- 
- .macro vuzp8 reg1, reg2
-- vuzp.8 d&reg1, d&reg2
-+ vuzp.8 d\()\reg1, d\()\reg2
- .endm
- 
- .macro vzip8 reg1, reg2
-- vzip.8 d&reg1, d&reg2
-+ vzip.8 d\()\reg1, d\()\reg2
- .endm
- 
- /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
- .macro pixdeinterleave bpp, basereg
--.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-- vuzp8 %(basereg+0), %(basereg+1)
-- vuzp8 %(basereg+2), %(basereg+3)
-- vuzp8 %(basereg+1), %(basereg+3)
-- vuzp8 %(basereg+0), %(basereg+2)
-+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-+ vuzp8 %(\basereg+0), %(\basereg+1)
-+ vuzp8 %(\basereg+2), %(\basereg+3)
-+ vuzp8 %(\basereg+1), %(\basereg+3)
-+ vuzp8 %(\basereg+0), %(\basereg+2)
- .endif
- .endm
- 
- /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
- .macro pixinterleave bpp, basereg
--.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-- vzip8 %(basereg+0), %(basereg+2)
-- vzip8 %(basereg+1), %(basereg+3)
-- vzip8 %(basereg+2), %(basereg+3)
-- vzip8 %(basereg+0), %(basereg+1)
-+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-+ vzip8 %(\basereg+0), %(\basereg+2)
-+ vzip8 %(\basereg+1), %(\basereg+3)
-+ vzip8 %(\basereg+2), %(\basereg+3)
-+ vzip8 %(\basereg+0), %(\basereg+1)
- .endif
- .endm
- 
- /*
- * This is a macro for implementing cache preload. The main idea is that
- * cache preload logic is mostly independent from the rest of pixels
- * processing code. It starts at the top left pixel and moves forward
- * across pixels and can jump across scanlines. Prefetch distance is
-@@ -389,51 +389,51 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED
- * for almost zero cost!
- *
- * (*) The overhead of the prefetcher is visible when running some trivial
- * pixels processing like simple copy. Anyway, having prefetch is a must
- * when working with the graphics data. 
- */ - .macro PF a, x:vararg - .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) -- a x -+ \a \x - .endif - .endm - - .macro cache_preload std_increment, boost_increment - .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) - .if regs_shortage -- PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ -+ PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ - .endif --.if std_increment != 0 -- PF add PF_X, PF_X, #std_increment -+.if \std_increment != 0 -+ PF add, PF_X, PF_X, #\std_increment - .endif -- PF tst PF_CTL, #0xF -- PF addne PF_X, PF_X, #boost_increment -- PF subne PF_CTL, PF_CTL, #1 -- PF cmp PF_X, ORIG_W -+ PF tst, PF_CTL, #0xF -+ PF addne, PF_X, PF_X, #\boost_increment -+ PF subne, PF_CTL, PF_CTL, #1 -+ PF cmp, PF_X, ORIG_W - .if src_bpp_shift >= 0 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - .endif - .if dst_r_bpp != 0 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - .endif - .if mask_bpp_shift >= 0 - PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] - .endif -- PF subge PF_X, PF_X, ORIG_W -- PF subges PF_CTL, PF_CTL, #0x10 -+ PF subge, PF_X, PF_X, ORIG_W -+ PF subges, PF_CTL, PF_CTL, #0x10 - .if src_bpp_shift >= 0 -- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - .endif - .if dst_r_bpp != 0 -- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - .endif - .if mask_bpp_shift >= 0 -- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! -+ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! - .endif - .endif - .endm - - .macro cache_preload_simple - .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) - .if src_bpp > 0 - pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] -@@ -460,51 +460,53 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED - .macro ensure_destination_ptr_alignment process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head - .if dst_w_bpp != 24 - tst DST_R, #0xF - beq 2f - - .irp lowbit, 1, 2, 4, 8, 16 -+#ifndef __clang__ - local skip1 --.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) --.if lowbit < 16 /* we don't need more than 16-byte alignment */ -- tst DST_R, #lowbit -+#endif -+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) -+.if \lowbit < 16 /* we don't need more than 16-byte alignment */ -+ tst DST_R, #\lowbit - beq 1f - .endif -- pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC -- pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK -+ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC -+ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK - .if dst_r_bpp > 0 -- pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R -+ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R - .else -- add DST_R, DST_R, #lowbit -+ add DST_R, DST_R, #\lowbit - .endif -- PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) -- sub W, W, #(lowbit * 8 / dst_w_bpp) -+ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp) -+ sub W, W, #(\lowbit * 8 / dst_w_bpp) - 1: - .endif - .endr - pixdeinterleave src_bpp, src_basereg - pixdeinterleave mask_bpp, mask_basereg - pixdeinterleave dst_r_bpp, dst_r_basereg - -- process_pixblock_head -+ \process_pixblock_head - cache_preload 0, pixblock_size - cache_preload_simple -- process_pixblock_tail -+ \process_pixblock_tail - - pixinterleave dst_w_bpp, dst_w_basereg - .irp lowbit, 1, 2, 4, 8, 16 
--.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) --.if lowbit < 16 /* we don't need more than 16-byte alignment */ -- tst DST_W, #lowbit -+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) -+.if \lowbit < 16 /* we don't need more than 16-byte alignment */ -+ tst DST_W, #\lowbit - beq 1f - .endif -- pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W -+ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W - 1: - .endif - .endr - .endif - 2: - .endm - - /* -@@ -525,51 +527,51 @@ 2: - .macro process_trailing_pixels cache_preload_flag, \ - dst_aligned_flag, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head - tst W, #(pixblock_size - 1) - beq 2f - .irp chunk_size, 16, 8, 4, 2, 1 --.if pixblock_size > chunk_size -- tst W, #chunk_size -+.if pixblock_size > \chunk_size -+ tst W, #\chunk_size - beq 1f -- pixld_src chunk_size, src_bpp, src_basereg, SRC -- pixld chunk_size, mask_bpp, mask_basereg, MASK --.if dst_aligned_flag != 0 -- pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R -+ pixld_src \chunk_size, src_bpp, src_basereg, SRC -+ pixld \chunk_size, mask_bpp, mask_basereg, MASK -+.if \dst_aligned_flag != 0 -+ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R - .else -- pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R -+ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R - .endif --.if cache_preload_flag != 0 -- PF add PF_X, PF_X, #chunk_size -+.if \cache_preload_flag != 0 -+ PF add, PF_X, PF_X, #\chunk_size - .endif - 1: - .endif - .endr - pixdeinterleave src_bpp, src_basereg - pixdeinterleave mask_bpp, mask_basereg - pixdeinterleave dst_r_bpp, dst_r_basereg - -- process_pixblock_head --.if cache_preload_flag != 0 -+ \process_pixblock_head -+.if \cache_preload_flag != 0 - cache_preload 0, pixblock_size - cache_preload_simple - .endif -- process_pixblock_tail -+ \process_pixblock_tail - pixinterleave dst_w_bpp, dst_w_basereg - .irp chunk_size, 16, 8, 4, 2, 1 --.if pixblock_size > chunk_size -- tst W, #chunk_size -+.if pixblock_size > \chunk_size -+ tst W, #\chunk_size - beq 1f --.if dst_aligned_flag != 0 -- pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W -+.if \dst_aligned_flag != 0 -+ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W - .else -- pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W -+ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W - .endif - 1: - .endif - .endr - 2: - .endm - - /* -@@ -599,17 +601,17 @@ 2: - .if (mask_bpp != 24) && (mask_bpp != 0) - sub MASK, MASK, W, lsl #mask_bpp_shift - .endif - subs H, H, #1 - mov DST_R, DST_W - .if regs_shortage - str H, [sp, #4] /* save updated height to stack */ - .endif -- bge start_of_loop_label -+ bge \start_of_loop_label - .endm - - /* - * Registers are allocated in the following way by default: - * d0, d1, d2, d3 - reserved for loading source pixel data - * d4, d5, d6, d7 - reserved for loading destination pixel data - * d24, d25, d26, d27 - reserved for loading mask pixel data - * d28, d29, d30, d31 - final destination pixel data for writeback to memory -@@ -626,48 +628,48 @@ 2: - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head, \ - dst_w_basereg_ = 28, \ - dst_r_basereg_ = 4, \ - src_basereg_ = 0, \ - mask_basereg_ = 24 - -- pixman_asm_function fname -+ pixman_asm_function \fname - - push {r4-r12, lr} /* save all registers */ - - /* - * Select prefetch type for this function. 
If prefetch distance is - * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch - * has to be used instead of ADVANCED. - */ - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT --.if prefetch_distance == 0 -+.if \prefetch_distance == 0 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE - .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ -- ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) -+ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24)) - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE - .endif - - /* - * Make some macro arguments globally visible and accessible - * from other macros - */ -- .set src_bpp, src_bpp_ -- .set mask_bpp, mask_bpp_ -- .set dst_w_bpp, dst_w_bpp_ -- .set pixblock_size, pixblock_size_ -- .set dst_w_basereg, dst_w_basereg_ -- .set dst_r_basereg, dst_r_basereg_ -- .set src_basereg, src_basereg_ -- .set mask_basereg, mask_basereg_ -+ .set src_bpp, \src_bpp_ -+ .set mask_bpp, \mask_bpp_ -+ .set dst_w_bpp, \dst_w_bpp_ -+ .set pixblock_size, \pixblock_size_ -+ .set dst_w_basereg, \dst_w_basereg_ -+ .set dst_r_basereg, \dst_r_basereg_ -+ .set src_basereg, \src_basereg_ -+ .set mask_basereg, \mask_basereg_ - - .macro pixld_src x:vararg -- pixld x -+ pixld \x - .endm - .macro fetch_src_pixblock - pixld_src pixblock_size, src_bpp, \ - (src_basereg - pixblock_size * src_bpp / 64), SRC - .endm - /* - * Assign symbolic names to registers - */ -@@ -750,38 +752,38 @@ 2: - .elseif dst_w_bpp == 16 - .set dst_bpp_shift, 1 - .elseif dst_w_bpp == 8 - .set dst_bpp_shift, 0 - .else - .error "requested dst bpp (dst_w_bpp) is not supported" - .endif - --.if (((flags) & FLAG_DST_READWRITE) != 0) -+.if (((\flags) & FLAG_DST_READWRITE) != 0) - .set dst_r_bpp, dst_w_bpp - .else - .set dst_r_bpp, 0 - .endif --.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) -+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) - .set DEINTERLEAVE_32BPP_ENABLED, 1 - .else - .set DEINTERLEAVE_32BPP_ENABLED, 0 - .endif - --.if prefetch_distance < 0 || prefetch_distance > 15 -- .error "invalid prefetch distance (prefetch_distance)" -+.if \prefetch_distance < 0 || \prefetch_distance > 15 -+ .error "invalid prefetch distance (\prefetch_distance)" - .endif - - .if src_bpp > 0 - ldr SRC, [sp, #40] - .endif - .if mask_bpp > 0 - ldr MASK, [sp, #48] - .endif -- PF mov PF_X, #0 -+ PF mov, PF_X, #0 - .if src_bpp > 0 - ldr SRC_STRIDE, [sp, #44] - .endif - .if mask_bpp > 0 - ldr MASK_STRIDE, [sp, #52] - .endif - mov DST_R, DST_W - -@@ -796,24 +798,24 @@ 2: - .if dst_w_bpp == 24 - sub DST_STRIDE, DST_STRIDE, W - sub DST_STRIDE, DST_STRIDE, W, lsl #1 - .endif - - /* - * Setup advanced prefetcher initial state - */ -- PF mov PF_SRC, SRC -- PF mov PF_DST, DST_R -- PF mov PF_MASK, MASK -+ PF mov, PF_SRC, SRC -+ PF mov, PF_DST, DST_R -+ PF mov, PF_MASK, MASK - /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ -- PF mov PF_CTL, H, lsl #4 -- PF add PF_CTL, #(prefetch_distance - 0x10) -+ PF mov, PF_CTL, H, lsl #4 -+ PF add, PF_CTL, #(\prefetch_distance - 0x10) - -- init -+ \init - .if regs_shortage - push {r0, r1} - .endif - subs H, H, #1 - .if regs_shortage - str H, [sp, #4] /* save updated height to stack */ - .else - mov ORIG_W, W -@@ -821,84 +823,84 @@ 2: - blt 9f - cmp W, #(pixblock_size * 2) - blt 8f - /* - * This is the start of the pipelined loop, which if optimized for - * long scanlines - */ - 0: -- ensure_destination_ptr_alignment process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ ensure_destination_ptr_alignment \process_pixblock_head, \ -+ 
\process_pixblock_tail, \ -+ \process_pixblock_tail_head - - /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ - pixld_a pixblock_size, dst_r_bpp, \ - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R - fetch_src_pixblock - pixld pixblock_size, mask_bpp, \ - (mask_basereg - pixblock_size * mask_bpp / 64), MASK -- PF add PF_X, PF_X, #pixblock_size -- process_pixblock_head -+ PF add, PF_X, PF_X, #pixblock_size -+ \process_pixblock_head - cache_preload 0, pixblock_size - cache_preload_simple - subs W, W, #(pixblock_size * 2) - blt 2f - 1: -- process_pixblock_tail_head -+ \process_pixblock_tail_head - cache_preload_simple - subs W, W, #pixblock_size - bge 1b - 2: -- process_pixblock_tail -+ \process_pixblock_tail - pixst_a pixblock_size, dst_w_bpp, \ - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W - - /* Process the remaining trailing pixels in the scanline */ - process_trailing_pixels 1, 1, \ -- process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ \process_pixblock_head, \ -+ \process_pixblock_tail, \ -+ \process_pixblock_tail_head - advance_to_next_scanline 0b - - .if regs_shortage - pop {r0, r1} - .endif -- cleanup -+ \cleanup - pop {r4-r12, pc} /* exit */ - /* - * This is the start of the loop, designed to process images with small width - * (less than pixblock_size * 2 pixels). In this case neither pipelining - * nor prefetch are used. - */ - 8: - /* Process exactly pixblock_size pixels if needed */ - tst W, #pixblock_size - beq 1f - pixld pixblock_size, dst_r_bpp, \ - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R - fetch_src_pixblock - pixld pixblock_size, mask_bpp, \ - (mask_basereg - pixblock_size * mask_bpp / 64), MASK -- process_pixblock_head -- process_pixblock_tail -+ \process_pixblock_head -+ \process_pixblock_tail - pixst pixblock_size, dst_w_bpp, \ - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W - 1: - /* Process the remaining trailing pixels in the scanline */ - process_trailing_pixels 0, 0, \ -- process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ \process_pixblock_head, \ -+ \process_pixblock_tail, \ -+ \process_pixblock_tail_head - advance_to_next_scanline 8b - 9: - .if regs_shortage - pop {r0, r1} - .endif -- cleanup -+ \cleanup - pop {r4-r12, pc} /* exit */ - - .purgem fetch_src_pixblock - .purgem pixld_src - - .unreq SRC - .unreq MASK - .unreq DST_R -@@ -910,17 +912,17 @@ 9: - .unreq DST_STRIDE - .unreq MASK_STRIDE - .unreq PF_CTL - .unreq PF_X - .unreq PF_SRC - .unreq PF_DST - .unreq PF_MASK - .unreq DUMMY -- .endfunc -+ pixman_end_asm_function - .endm - - /* - * A simplified variant of function generation template for a single - * scanline processing (for implementing pixman combine functions) - */ - .macro generate_composite_function_scanline use_nearest_scaling, \ - fname, \ -@@ -934,49 +936,49 @@ 9: - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head, \ - dst_w_basereg_ = 28, \ - dst_r_basereg_ = 4, \ - src_basereg_ = 0, \ - mask_basereg_ = 24 - -- pixman_asm_function fname -+ pixman_asm_function \fname - - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE - /* - * Make some macro arguments globally visible and accessible - * from other macros - */ -- .set src_bpp, src_bpp_ -- .set mask_bpp, mask_bpp_ -- .set dst_w_bpp, dst_w_bpp_ -- .set pixblock_size, pixblock_size_ -- .set dst_w_basereg, dst_w_basereg_ -- .set dst_r_basereg, dst_r_basereg_ -- .set src_basereg, src_basereg_ -- .set mask_basereg, mask_basereg_ -+ .set 
src_bpp, \src_bpp_ -+ .set mask_bpp, \mask_bpp_ -+ .set dst_w_bpp, \dst_w_bpp_ -+ .set pixblock_size, \pixblock_size_ -+ .set dst_w_basereg, \dst_w_basereg_ -+ .set dst_r_basereg, \dst_r_basereg_ -+ .set src_basereg, \src_basereg_ -+ .set mask_basereg, \mask_basereg_ - --.if use_nearest_scaling != 0 -+.if \use_nearest_scaling != 0 - /* - * Assign symbolic names to registers for nearest scaling - */ - W .req r0 - DST_W .req r1 - SRC .req r2 - VX .req r3 - UNIT_X .req ip - MASK .req lr - TMP1 .req r4 - TMP2 .req r5 - DST_R .req r6 - SRC_WIDTH_FIXED .req r7 - - .macro pixld_src x:vararg -- pixld_s x -+ pixld_s \x - .endm - - ldr UNIT_X, [sp] - push {r4-r8, lr} - ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] - .if mask_bpp != 0 - ldr MASK, [sp, #(24 + 8)] - .endif -@@ -986,89 +988,89 @@ 9: - */ - W .req r0 /* width (is updated during processing) */ - DST_W .req r1 /* destination buffer pointer for writes */ - SRC .req r2 /* source buffer pointer */ - DST_R .req ip /* destination buffer pointer for reads */ - MASK .req r3 /* mask pointer */ - - .macro pixld_src x:vararg -- pixld x -+ pixld \x - .endm - .endif - --.if (((flags) & FLAG_DST_READWRITE) != 0) -+.if (((\flags) & FLAG_DST_READWRITE) != 0) - .set dst_r_bpp, dst_w_bpp - .else - .set dst_r_bpp, 0 - .endif --.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) -+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) - .set DEINTERLEAVE_32BPP_ENABLED, 1 - .else - .set DEINTERLEAVE_32BPP_ENABLED, 0 - .endif - - .macro fetch_src_pixblock - pixld_src pixblock_size, src_bpp, \ - (src_basereg - pixblock_size * src_bpp / 64), SRC - .endm - -- init -+ \init - mov DST_R, DST_W - - cmp W, #pixblock_size - blt 8f - -- ensure_destination_ptr_alignment process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ ensure_destination_ptr_alignment \process_pixblock_head, \ -+ \process_pixblock_tail, \ -+ \process_pixblock_tail_head - - subs W, W, #pixblock_size - blt 7f - - /* Implement "head (tail_head) ... 
(tail_head) tail" loop pattern */ - pixld_a pixblock_size, dst_r_bpp, \ - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R - fetch_src_pixblock - pixld pixblock_size, mask_bpp, \ - (mask_basereg - pixblock_size * mask_bpp / 64), MASK -- process_pixblock_head -+ \process_pixblock_head - subs W, W, #pixblock_size - blt 2f - 1: -- process_pixblock_tail_head -+ \process_pixblock_tail_head - subs W, W, #pixblock_size - bge 1b - 2: -- process_pixblock_tail -+ \process_pixblock_tail - pixst_a pixblock_size, dst_w_bpp, \ - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W - 7: - /* Process the remaining trailing pixels in the scanline (dst aligned) */ - process_trailing_pixels 0, 1, \ -- process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ \process_pixblock_head, \ -+ \process_pixblock_tail, \ -+ \process_pixblock_tail_head - -- cleanup --.if use_nearest_scaling != 0 -+ \cleanup -+.if \use_nearest_scaling != 0 - pop {r4-r8, pc} /* exit */ - .else - bx lr /* exit */ - .endif - 8: - /* Process the remaining trailing pixels in the scanline (dst unaligned) */ - process_trailing_pixels 0, 0, \ -- process_pixblock_head, \ -- process_pixblock_tail, \ -- process_pixblock_tail_head -+ \process_pixblock_head, \ -+ \process_pixblock_tail, \ -+ \process_pixblock_tail_head - -- cleanup -+ \cleanup - --.if use_nearest_scaling != 0 -+.if \use_nearest_scaling != 0 - pop {r4-r8, pc} /* exit */ - - .unreq DST_R - .unreq SRC - .unreq W - .unreq VX - .unreq UNIT_X - .unreq TMP1 -@@ -1085,25 +1087,25 @@ 8: - .unreq DST_R - .unreq DST_W - .unreq W - .endif - - .purgem fetch_src_pixblock - .purgem pixld_src - -- .endfunc -+ pixman_end_asm_function - .endm - - .macro generate_composite_function_single_scanline x:vararg -- generate_composite_function_scanline 0, x -+ generate_composite_function_scanline 0, \x - .endm - - .macro generate_composite_function_nearest_scanline x:vararg -- generate_composite_function_scanline 1, x -+ generate_composite_function_scanline 1, \x - .endm - - /* Default prologue/epilogue, nothing special needs to be done */ - - .macro default_init - .endm - - .macro default_cleanup -@@ -1129,56 +1131,56 @@ 8: - * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in) - * into a planar a8r8g8b8 format (with a, r, g, b color components - * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). - * - * Warning: the conversion is destructive and the original - * value (in) is lost. - */ - .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b -- vshrn.u16 out_r, in, #8 -- vshrn.u16 out_g, in, #3 -- vsli.u16 in, in, #5 -- vmov.u8 out_a, #255 -- vsri.u8 out_r, out_r, #5 -- vsri.u8 out_g, out_g, #6 -- vshrn.u16 out_b, in, #2 -+ vshrn.u16 \out_r, \in, #8 -+ vshrn.u16 \out_g, \in, #3 -+ vsli.u16 \in, \in, #5 -+ vmov.u8 \out_a, #255 -+ vsri.u8 \out_r, \out_r, #5 -+ vsri.u8 \out_g, \out_g, #6 -+ vshrn.u16 \out_b, \in, #2 - .endm - - .macro convert_0565_to_x888 in, out_r, out_g, out_b -- vshrn.u16 out_r, in, #8 -- vshrn.u16 out_g, in, #3 -- vsli.u16 in, in, #5 -- vsri.u8 out_r, out_r, #5 -- vsri.u8 out_g, out_g, #6 -- vshrn.u16 out_b, in, #2 -+ vshrn.u16 \out_r, \in, #8 -+ vshrn.u16 \out_g, \in, #3 -+ vsli.u16 \in, \in, #5 -+ vsri.u8 \out_r, \out_r, #5 -+ vsri.u8 \out_g, \out_g, #6 -+ vshrn.u16 \out_b, \in, #2 - .endm - - /* - * Conversion from planar a8r8g8b8 format (with a, r, g, b color components - * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6 - * pixels packed in 128-bit register (out). 
Requires two temporary 128-bit - * registers (tmp1, tmp2) - */ - .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 -- vshll.u8 tmp1, in_g, #8 -- vshll.u8 out, in_r, #8 -- vshll.u8 tmp2, in_b, #8 -- vsri.u16 out, tmp1, #5 -- vsri.u16 out, tmp2, #11 -+ vshll.u8 \tmp1, \in_g, #8 -+ vshll.u8 \out, \in_r, #8 -+ vshll.u8 \tmp2, \in_b, #8 -+ vsri.u16 \out, \tmp1, #5 -+ vsri.u16 \out, \tmp2, #11 - .endm - - /* - * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels - * returned in (out0, out1) registers pair. Requires one temporary - * 64-bit register (tmp). 'out1' and 'in' may overlap, the original - * value from 'in' is lost - */ - .macro convert_four_0565_to_x888_packed in, out0, out1, tmp -- vshl.u16 out0, in, #5 /* G top 6 bits */ -- vshl.u16 tmp, in, #11 /* B top 5 bits */ -- vsri.u16 in, in, #5 /* R is ready in top bits */ -- vsri.u16 out0, out0, #6 /* G is ready in top bits */ -- vsri.u16 tmp, tmp, #5 /* B is ready in top bits */ -- vshr.u16 out1, in, #8 /* R is in place */ -- vsri.u16 out0, tmp, #8 /* G & B is in place */ -- vzip.u16 out0, out1 /* everything is in place */ -+ vshl.u16 \out0, \in, #5 /* G top 6 bits */ -+ vshl.u16 \tmp, \in, #11 /* B top 5 bits */ -+ vsri.u16 \in, \in, #5 /* R is ready in top bits */ -+ vsri.u16 \out0, \out0, #6 /* G is ready in top bits */ -+ vsri.u16 \tmp, \tmp, #5 /* B is ready in top bits */ -+ vshr.u16 \out1, \in, #8 /* R is in place */ -+ vsri.u16 \out0, \tmp, #8 /* G & B is in place */ -+ vzip.u16 \out0, \out1 /* everything is in place */ - .endm -diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S ---- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S -+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S -@@ -20,16 +20,20 @@ - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. 
- * - * Author: Jeff Muizelaar (jeff@infidigm.net) - * - */ - -+#ifdef __clang__ -+#define subpls subspl -+#endif -+ - /* Prevent the stack from becoming executable */ - #if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits - #endif - - .text - .arch armv6 - .object_arch armv4 -@@ -57,100 +61,105 @@ - * prefetch_braking_distance - stop prefetching when that many pixels are - * remaining before the end of scanline - */ - - .macro generate_nearest_scanline_func fname, bpp_shift, t, \ - prefetch_distance, \ - prefetch_braking_distance - --pixman_asm_function fname -+pixman_asm_function \fname - W .req r0 - DST .req r1 - SRC .req r2 - VX .req r3 - UNIT_X .req ip - TMP1 .req r4 - TMP2 .req r5 - VXMASK .req r6 - PF_OFFS .req r7 - SRC_WIDTH_FIXED .req r8 - - ldr UNIT_X, [sp] - push {r4, r5, r6, r7, r8, r10} -- mvn VXMASK, #((1 << bpp_shift) - 1) -+ mvn VXMASK, #((1 << \bpp_shift) - 1) - ldr SRC_WIDTH_FIXED, [sp, #28] - - /* define helper macro */ - .macro scale_2_pixels -- ldr&t TMP1, [SRC, TMP1] -- and TMP2, VXMASK, VX, asr #(16 - bpp_shift) -+ ldr\()\t TMP1, [SRC, TMP1] -+ and TMP2, VXMASK, VX, asr #(16 - \bpp_shift) - adds VX, VX, UNIT_X -- str&t TMP1, [DST], #(1 << bpp_shift) -+ str\()\t TMP1, [DST], #(1 << \bpp_shift) - 9: subpls VX, VX, SRC_WIDTH_FIXED - bpl 9b - -- ldr&t TMP2, [SRC, TMP2] -- and TMP1, VXMASK, VX, asr #(16 - bpp_shift) -+ ldr\()\t TMP2, [SRC, TMP2] -+ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift) - adds VX, VX, UNIT_X -- str&t TMP2, [DST], #(1 << bpp_shift) -+ str\()\t TMP2, [DST], #(1 << \bpp_shift) - 9: subpls VX, VX, SRC_WIDTH_FIXED - bpl 9b - .endm - - /* now do the scaling */ -- and TMP1, VXMASK, VX, asr #(16 - bpp_shift) -+ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift) - adds VX, VX, UNIT_X - 9: subpls VX, VX, SRC_WIDTH_FIXED - bpl 9b -- subs W, W, #(8 + prefetch_braking_distance) -+ subs W, W, #(8 + \prefetch_braking_distance) - blt 2f - /* calculate prefetch offset */ -- mov PF_OFFS, #prefetch_distance -+ mov PF_OFFS, #\prefetch_distance - mla PF_OFFS, UNIT_X, PF_OFFS, VX - 1: /* main loop, process 8 pixels per iteration with prefetch */ -- pld [SRC, PF_OFFS, asr #(16 - bpp_shift)] -+ pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)] - add PF_OFFS, UNIT_X, lsl #3 - scale_2_pixels - scale_2_pixels - scale_2_pixels - scale_2_pixels - subs W, W, #8 - bge 1b - 2: -- subs W, W, #(4 - 8 - prefetch_braking_distance) -+ subs W, W, #(4 - 8 - \prefetch_braking_distance) - blt 2f - 1: /* process the remaining pixels */ - scale_2_pixels - scale_2_pixels - subs W, W, #4 - bge 1b - 2: - tst W, #2 - beq 2f - scale_2_pixels - 2: - tst W, #1 -- ldrne&t TMP1, [SRC, TMP1] -- strne&t TMP1, [DST] -+#ifdef __clang__ -+ ldr\()\t\()ne TMP1, [SRC, TMP1] -+ str\()\t\()ne TMP1, [DST] -+#else -+ ldrne\()\t TMP1, [SRC, TMP1] -+ strne\()\t TMP1, [DST] -+#endif - /* cleanup helper macro */ - .purgem scale_2_pixels - .unreq DST - .unreq SRC - .unreq W - .unreq VX - .unreq UNIT_X - .unreq TMP1 - .unreq TMP2 - .unreq VXMASK - .unreq PF_OFFS - .unreq SRC_WIDTH_FIXED - /* return */ - pop {r4, r5, r6, r7, r8, r10} - bx lr --.endfunc -+pixman_end_asm_function - .endm - - generate_nearest_scanline_func \ - pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 - - generate_nearest_scanline_func \ - pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32 -diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S ---- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S -+++ 
b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
-@@ -20,16 +20,21 @@
- * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author: Ben Avison (bavison@riscosopen.org)
- *
- */
- 
-+#ifdef __clang__
-+#define adceqs adcseq
-+#define ldmnedb ldmdbne
-+#endif
-+
- /* Prevent the stack from becoming executable */
- #if defined(__linux__) && defined(__ELF__)
- .section .note.GNU-stack,"",%progbits
- #endif
- 
- .text
- .arch armv6
- .object_arch armv4
-@@ -52,26 +57,26 @@
- * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
- */
- 
- .macro blit_init
- line_saved_regs STRIDE_D, STRIDE_S
- .endm
- 
- .macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-- pixld cond, numbytes, firstreg, SRC, unaligned_src
-+ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
- .endm
- 
- .macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
- WK4 .req STRIDE_D
- WK5 .req STRIDE_S
- WK6 .req MASK
- WK7 .req STRIDE_M
--110: pixld , 16, 0, SRC, unaligned_src
-- pixld , 16, 4, SRC, unaligned_src
-+110: pixld , 16, 0, SRC, \unaligned_src
-+ pixld , 16, 4, SRC, \unaligned_src
- pld [SRC, SCRATCH]
- pixst , 16, 0, DST
- pixst , 16, 4, DST
- subs X, X, #32*8/src_bpp
- bhs 110b
- .unreq WK4
- .unreq WK5
- .unreq WK6
-@@ -137,17 +142,17 @@ generate_composite_function \
- mov STRIDE_M, SRC
- .endm
- 
- .macro fill_process_tail cond, numbytes, firstreg
- WK4 .req SRC
- WK5 .req STRIDE_S
- WK6 .req MASK
- WK7 .req STRIDE_M
-- pixst cond, numbytes, 4, DST
-+ pixst \cond, \numbytes, 4, DST
- .unreq WK4
- .unreq WK5
- .unreq WK6
- .unreq WK7
- .endm
- 
- generate_composite_function \
- pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
-@@ -177,30 +182,30 @@ generate_composite_function \
- nop_macro, /* newline */ \
- nop_macro /* cleanup */ \
- nop_macro /* process head */ \
- fill_process_tail
- 
- /******************************************************************************/
- 
- .macro src_x888_8888_pixel, cond, reg
-- orr&cond WK&reg, WK&reg, #0xFF000000
-+ orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
- .endm
- 
- .macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-- pixld cond, numbytes, firstreg, SRC, unaligned_src
-+ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
- .endm
- 
- .macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
-- src_x888_8888_pixel cond, %(firstreg+0)
-- .if numbytes >= 8
-- src_x888_8888_pixel cond, %(firstreg+1)
-- .if numbytes == 16
-- src_x888_8888_pixel cond, %(firstreg+2)
-- src_x888_8888_pixel cond, %(firstreg+3)
-+ src_x888_8888_pixel \cond, %(\firstreg+0)
-+ .if \numbytes >= 8
-+ src_x888_8888_pixel \cond, %(\firstreg+1)
-+ .if \numbytes == 16
-+ src_x888_8888_pixel \cond, %(\firstreg+2)
-+ src_x888_8888_pixel \cond, %(\firstreg+3)
- .endif
- .endif
- .endm
- 
- generate_composite_function \
- pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
- 3, /* prefetch distance */ \
-@@ -217,83 +222,83 @@ generate_composite_function \
- ldr MASK, =0x07E007E0
- mov STRIDE_M, #0xFF000000
- /* Set GE[3:0] to 1010 so SEL instructions do what we want */
- ldr SCRATCH, =0x80008000
- uadd8 SCRATCH, SCRATCH, SCRATCH
- .endm
- 
- .macro src_0565_8888_2pixels, reg1, reg2
-- and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
-- bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
-- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
-- mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
-- mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
-- bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
-- orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
-- orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
-- pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
-- sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
-- mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
-- pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
-- sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
-- orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
-- orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
-+ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
-+ bic WK\()\reg2, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
-+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
-+ mov WK\()\reg1, WK\()\reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
-+ mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
-+ bic WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
-+ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
-+ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
-+ pkhtb WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
-+ sel WK\()\reg1, WK\()\reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
-+ mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
-+ pkhtb WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
-+ sel WK\()\reg2, WK\()\reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
-+ orr WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
-+ orr WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
- .endm
- 
- /* This version doesn't need STRIDE_M, but is one instruction longer.
- It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
-- and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
-- bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
-- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
-- mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
-- mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
-- bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
-- mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
-- mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
-- orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
-- orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
-- pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
-- pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
-- sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
-- sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
-- orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
-- orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
-+ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
-+ bic WK\()\reg1, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
-+ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
-+ mov WK\()\reg2, WK\()\reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
-+ mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
-+ bic WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
-+ mov WK\()\reg2, WK\()\reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
-+ mov WK\()\reg1, WK\()\reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
-+ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
-+ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
-+ pkhbt WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
-+ pkhbt WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
-+ sel WK\()\reg2, SCRATCH, WK\()\reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
-+ sel WK\()\reg1, SCRATCH, WK\()\reg1 @ --------rrrrrrrrggggggggbbbbbbbb
-+ orr WK\()\reg2, WK\()\reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
-+ orr WK\()\reg1, WK\()\reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
- */
- 
- .macro src_0565_8888_1pixel, reg
-- bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
-- and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
-- mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
-- mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
-- orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
-- orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
-- pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
-- sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
-- orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
-+ bic SCRATCH, WK\()\reg, MASK @ 0000000000000000rrrrr000000bbbbb
-+ and WK\()\reg, WK\()\reg, MASK @ 000000000000000000000gggggg00000
-+ mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
-+ mov WK\()\reg, WK\()\reg, lsl #5 @ 0000000000000000gggggg0000000000
-+ orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
-+ orr WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000
-+ pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
-+ sel WK\()\reg, WK\()\reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
-+ orr WK\()\reg, WK\()\reg, #0xFF000000 @ 
11111111rrrrrrrrggggggggbbbbbbbb - .endm - - .macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload -- .if numbytes == 16 -- pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src -- .elseif numbytes == 8 -- pixld , 4, firstreg, SRC, unaligned_src -- .elseif numbytes == 4 -- pixld , 2, firstreg, SRC, unaligned_src -+ .if \numbytes == 16 -+ pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src -+ .elseif \numbytes == 8 -+ pixld , 4, \firstreg, SRC, \unaligned_src -+ .elseif \numbytes == 4 -+ pixld , 2, \firstreg, SRC, \unaligned_src - .endif - .endm - - .macro src_0565_8888_process_tail cond, numbytes, firstreg -- .if numbytes == 16 -- src_0565_8888_2pixels firstreg, %(firstreg+1) -- src_0565_8888_2pixels %(firstreg+2), %(firstreg+3) -- .elseif numbytes == 8 -- src_0565_8888_2pixels firstreg, %(firstreg+1) -+ .if \numbytes == 16 -+ src_0565_8888_2pixels \firstreg, %(\firstreg+1) -+ src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3) -+ .elseif \numbytes == 8 -+ src_0565_8888_2pixels \firstreg, %(\firstreg+1) - .else -- src_0565_8888_1pixel firstreg -+ src_0565_8888_1pixel \firstreg - .endif - .endm - - generate_composite_function \ - pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ - 3, /* prefetch distance */ \ - src_0565_8888_init, \ -@@ -306,67 +311,67 @@ generate_composite_function \ - - .macro src_x888_0565_init - /* Hold loop invariant in MASK */ - ldr MASK, =0x001F001F - line_saved_regs STRIDE_S, ORIG_W - .endm - - .macro src_x888_0565_1pixel s, d -- and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb -- and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000 -- orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb -- orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb -+ and WK\()\d, MASK, WK\()\s, lsr #3 @ 00000000000rrrrr00000000000bbbbb -+ and STRIDE_S, WK\()\s, #0xFC00 @ 0000000000000000gggggg0000000000 -+ orr WK\()\d, WK\()\d, WK\()\d, lsr #5 @ 00000000000-----rrrrr000000bbbbb -+ orr WK\()\d, WK\()\d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb - /* Top 16 bits are discarded during the following STRH */ - .endm - - .macro src_x888_0565_2pixels slo, shi, d, tmp -- and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000 -- and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB -- and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb -- orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB -- orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB -- and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000 -- orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb -- orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb -- pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb -+ and SCRATCH, WK\()\shi, #0xFC00 @ 0000000000000000GGGGGG0000000000 -+ and WK\()\tmp, MASK, WK\()\shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB -+ and WK\()\shi, MASK, WK\()\slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb -+ orr WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB -+ orr WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB -+ and SCRATCH, WK\()\slo, #0xFC00 @ 0000000000000000gggggg0000000000 -+ orr WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb -+ orr WK\()\shi, WK\()\shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb -+ pkhbt WK\()\d, 
WK\()\shi, WK\()\tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb - .endm - - .macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - WK4 .req STRIDE_S - WK5 .req STRIDE_M - WK6 .req WK3 - WK7 .req ORIG_W -- .if numbytes == 16 -+ .if \numbytes == 16 - pixld , 16, 4, SRC, 0 - src_x888_0565_2pixels 4, 5, 0, 0 - pixld , 8, 4, SRC, 0 - src_x888_0565_2pixels 6, 7, 1, 1 - pixld , 8, 6, SRC, 0 - .else -- pixld , numbytes*2, 4, SRC, 0 -+ pixld , \numbytes*2, 4, SRC, 0 - .endif - .endm - - .macro src_x888_0565_process_tail cond, numbytes, firstreg -- .if numbytes == 16 -+ .if \numbytes == 16 - src_x888_0565_2pixels 4, 5, 2, 2 - src_x888_0565_2pixels 6, 7, 3, 4 -- .elseif numbytes == 8 -+ .elseif \numbytes == 8 - src_x888_0565_2pixels 4, 5, 1, 1 - src_x888_0565_2pixels 6, 7, 2, 2 -- .elseif numbytes == 4 -+ .elseif \numbytes == 4 - src_x888_0565_2pixels 4, 5, 1, 1 - .else - src_x888_0565_1pixel 4, 1 - .endif -- .if numbytes == 16 -- pixst , numbytes, 0, DST -+ .if \numbytes == 16 -+ pixst , \numbytes, 0, DST - .else -- pixst , numbytes, 1, DST -+ pixst , \numbytes, 1, DST - .endif - .unreq WK4 - .unreq WK5 - .unreq WK6 - .unreq WK7 - .endm - - generate_composite_function \ -@@ -377,47 +382,47 @@ generate_composite_function \ - nop_macro, /* newline */ \ - nop_macro, /* cleanup */ \ - src_x888_0565_process_head, \ - src_x888_0565_process_tail - - /******************************************************************************/ - - .macro add_8_8_8pixels cond, dst1, dst2 -- uqadd8&cond WK&dst1, WK&dst1, MASK -- uqadd8&cond WK&dst2, WK&dst2, STRIDE_M -+ uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK -+ uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M - .endm - - .macro add_8_8_4pixels cond, dst -- uqadd8&cond WK&dst, WK&dst, MASK -+ uqadd8\()\cond WK\()\dst, WK\()\dst, MASK - .endm - - .macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - WK4 .req MASK - WK5 .req STRIDE_M -- .if numbytes == 16 -- pixld cond, 8, 4, SRC, unaligned_src -- pixld cond, 16, firstreg, DST, 0 -- add_8_8_8pixels cond, firstreg, %(firstreg+1) -- pixld cond, 8, 4, SRC, unaligned_src -+ .if \numbytes == 16 -+ pixld \cond, 8, 4, SRC, \unaligned_src -+ pixld \cond, 16, \firstreg, DST, 0 -+ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1) -+ pixld \cond, 8, 4, SRC, \unaligned_src - .else -- pixld cond, numbytes, 4, SRC, unaligned_src -- pixld cond, numbytes, firstreg, DST, 0 -+ pixld \cond, \numbytes, 4, SRC, \unaligned_src -+ pixld \cond, \numbytes, \firstreg, DST, 0 - .endif - .unreq WK4 - .unreq WK5 - .endm - - .macro add_8_8_process_tail cond, numbytes, firstreg -- .if numbytes == 16 -- add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3) -- .elseif numbytes == 8 -- add_8_8_8pixels cond, firstreg, %(firstreg+1) -+ .if \numbytes == 16 -+ add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3) -+ .elseif \numbytes == 8 -+ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1) - .else -- add_8_8_4pixels cond, firstreg -+ add_8_8_4pixels \cond, \firstreg - .endif - .endm - - generate_composite_function \ - pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \ - FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \ - 2, /* prefetch distance */ \ - nop_macro, /* init */ \ -@@ -436,82 +441,82 @@ generate_composite_function \ - line_saved_regs STRIDE_D, STRIDE_S, ORIG_W - .endm - - .macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - WK4 .req STRIDE_D - WK5 .req STRIDE_S - WK6 .req STRIDE_M - WK7 .req ORIG_W 
-- pixld , numbytes, %(4+firstreg), SRC, unaligned_src
-- pixld , numbytes, firstreg, DST, 0
-+ pixld , \numbytes, %(4+\firstreg), SRC, \unaligned_src
-+ pixld , \numbytes, \firstreg, DST, 0
- .unreq WK4
- .unreq WK5
- .unreq WK6
- .unreq WK7
- .endm
- 
- .macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
- /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
-- teq WK&reg0, #0
-- .if numbytes > 4
-- teqeq WK&reg1, #0
-- .if numbytes > 8
-- teqeq WK&reg2, #0
-- teqeq WK&reg3, #0
-+ teq WK\()\reg0, #0
-+ .if \numbytes > 4
-+ teqeq WK\()\reg1, #0
-+ .if \numbytes > 8
-+ teqeq WK\()\reg2, #0
-+ teqeq WK\()\reg3, #0
- .endif
- .endif
- .endm
- 
- .macro over_8888_8888_prepare next
-- mov WK&next, WK&next, lsr #24
-+ mov WK\()\next, WK\()\next, lsr #24
- .endm
- 
- .macro over_8888_8888_1pixel src, dst, offset, next
- /* src = destination component multiplier */
-- rsb WK&src, WK&src, #255
-+ rsb WK\()\src, WK\()\src, #255
- /* Split even/odd bytes of dst into SCRATCH/dst */
-- uxtb16 SCRATCH, WK&dst
-- uxtb16 WK&dst, WK&dst, ror #8
-+ uxtb16 SCRATCH, WK\()\dst
-+ uxtb16 WK\()\dst, WK\()\dst, ror #8
- /* Multiply through, adding 0.5 to the upper byte of result for rounding */
-- mla SCRATCH, SCRATCH, WK&src, MASK
-- mla WK&dst, WK&dst, WK&src, MASK
-+ mla SCRATCH, SCRATCH, WK\()\src, MASK
-+ mla WK\()\dst, WK\()\dst, WK\()\src, MASK
- /* Where we would have had a stall between the result of the first MLA and the shifter input,
- * reload the complete source pixel */
-- ldr WK&src, [SRC, #offset]
-+ ldr WK\()\src, [SRC, #\offset]
- /* Multiply by 257/256 to approximate 256/255 */
- uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
- /* In this stall, start processing the next pixel */
-- .if offset < -4
-- mov WK&next, WK&next, lsr #24
-+ .if \offset < -4
-+ mov WK\()\next, WK\()\next, lsr #24
- .endif
-- uxtab16 WK&dst, WK&dst, WK&dst, ror #8
-+ uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
- /* Recombine even/odd bytes of multiplied destination */
- mov SCRATCH, SCRATCH, ror #8
-- sel WK&dst, SCRATCH, WK&dst
-+ sel WK\()\dst, SCRATCH, WK\()\dst
- /* Saturated add of source to multiplied destination */
-- uqadd8 WK&dst, WK&dst, WK&src
-+ uqadd8 WK\()\dst, WK\()\dst, WK\()\src
- .endm
- 
- .macro over_8888_8888_process_tail cond, numbytes, firstreg
- WK4 .req STRIDE_D
- WK5 .req STRIDE_S
- WK6 .req STRIDE_M
- WK7 .req ORIG_W
-- over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
-+ over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
- beq 10f
-- over_8888_8888_prepare %(4+firstreg)
-- .set PROCESS_REG, firstreg
-- .set PROCESS_OFF, -numbytes
-- .rept numbytes / 4
-+ over_8888_8888_prepare %(4+\firstreg)
-+ .set PROCESS_REG, \firstreg
-+ .set PROCESS_OFF, -\numbytes
-+ .rept \numbytes / 4
- over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
- .set PROCESS_REG, PROCESS_REG+1
- .set PROCESS_OFF, PROCESS_OFF+4
- .endr
-- pixst , numbytes, firstreg, DST
-+ pixst , \numbytes, \firstreg, DST
- 10:
- .unreq WK4
- .unreq WK5
- .unreq WK6
- .unreq WK7
- .endm
- 
- generate_composite_function \
-@@ -531,26 +536,26 @@ generate_composite_function \
- * word Register containing 4 bytes
- * byte Register containing byte multiplier (bits 8-31 must be 0)
- * tmp Scratch register
- * half Register containing the constant 0x00800080
- * GE[3:0] bits must contain 0101
- */
- .macro mul_8888_8 
word, byte, tmp, half - /* Split even/odd bytes of word apart */ -- uxtb16 tmp, word -- uxtb16 word, word, ror #8 -+ uxtb16 \tmp, \word -+ uxtb16 \word, \word, ror #8 - /* Multiply bytes together with rounding, then by 257/256 */ -- mla tmp, tmp, byte, half -- mla word, word, byte, half /* 1 stall follows */ -- uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */ -- uxtab16 word, word, word, ror #8 -+ mla \tmp, \tmp, \byte, \half -+ mla \word, \word, \byte, \half /* 1 stall follows */ -+ uxtab16 \tmp, \tmp, \tmp, ror #8 /* 1 stall follows */ -+ uxtab16 \word, \word, \word, ror #8 - /* Recombine bytes */ -- mov tmp, tmp, ror #8 -- sel word, tmp, word -+ mov \tmp, \tmp, ror #8 -+ sel \word, \tmp, \word - .endm - - /******************************************************************************/ - - .macro over_8888_n_8888_init - /* Mask is constant */ - ldr MASK, [sp, #ARGS_STACK_OFFSET+8] - /* Hold loop invariant in STRIDE_M */ -@@ -562,51 +567,51 @@ generate_composite_function \ - line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W - .endm - - .macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - WK4 .req Y - WK5 .req STRIDE_D - WK6 .req STRIDE_S - WK7 .req ORIG_W -- pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src -- pixld , numbytes, firstreg, DST, 0 -+ pixld , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src -+ pixld , \numbytes, \firstreg, DST, 0 - .unreq WK4 - .unreq WK5 - .unreq WK6 - .unreq WK7 - .endm - - .macro over_8888_n_8888_1pixel src, dst -- mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M -- sub WK7, WK6, WK&src, lsr #24 -- mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M -- uqadd8 WK&dst, WK&dst, WK&src -+ mul_8888_8 WK\()\src, MASK, SCRATCH, STRIDE_M -+ sub WK7, WK6, WK\()\src, lsr #24 -+ mul_8888_8 WK\()\dst, WK7, SCRATCH, STRIDE_M -+ uqadd8 WK\()\dst, WK\()\dst, WK\()\src - .endm - - .macro over_8888_n_8888_process_tail cond, numbytes, firstreg - WK4 .req Y - WK5 .req STRIDE_D - WK6 .req STRIDE_S - WK7 .req ORIG_W -- over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg) -+ over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg) - beq 10f - mov WK6, #255 -- .set PROCESS_REG, firstreg -- .rept numbytes / 4 -- .if numbytes == 16 && PROCESS_REG == 2 -+ .set PROCESS_REG, \firstreg -+ .rept \numbytes / 4 -+ .if \numbytes == 16 && PROCESS_REG == 2 - /* We're using WK6 and WK7 as temporaries, so half way through - * 4 pixels, reload the second two source pixels but this time - * into WK4 and WK5 */ - ldmdb SRC, {WK4, WK5} - .endif - over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG) - .set PROCESS_REG, PROCESS_REG+1 - .endr -- pixst , numbytes, firstreg, DST -+ pixst , \numbytes, \firstreg, DST - 10: - .unreq WK4 - .unreq WK5 - .unreq WK6 - .unreq WK7 - .endm - - generate_composite_function \ -@@ -637,47 +642,47 @@ generate_composite_function \ - ldr STRIDE_D, =0x00800080 - b 1f - .ltorg - 1: - .endm - - .macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - WK4 .req STRIDE_M -- pixld , numbytes/4, 4, MASK, unaligned_mask -- pixld , numbytes, firstreg, DST, 0 -+ pixld , \numbytes/4, 4, MASK, \unaligned_mask -+ pixld , \numbytes, \firstreg, DST, 0 - .unreq WK4 - .endm - - .macro over_n_8_8888_1pixel src, dst -- uxtb Y, WK4, ror #src*8 -+ uxtb Y, WK4, ror #\src*8 - /* Trailing part of multiplication of source */ - mla SCRATCH, STRIDE_S, Y, STRIDE_D - mla Y, SRC, Y, STRIDE_D - 
mov ORIG_W, #255
- uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
- uxtab16 Y, Y, Y, ror #8
- mov SCRATCH, SCRATCH, ror #8
- sub ORIG_W, ORIG_W, Y, lsr #24
- sel Y, SCRATCH, Y
- /* Then multiply the destination */
-- mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
-- uqadd8 WK&dst, WK&dst, Y
-+ mul_8888_8 WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
-+ uqadd8 WK\()\dst, WK\()\dst, Y
- .endm
- 
- .macro over_n_8_8888_process_tail cond, numbytes, firstreg
- WK4 .req STRIDE_M
- teq WK4, #0
- beq 10f
-- .set PROCESS_REG, firstreg
-- .rept numbytes / 4
-- over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
-+ .set PROCESS_REG, \firstreg
-+ .rept \numbytes / 4
-+ over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
- .set PROCESS_REG, PROCESS_REG+1
- .endr
-- pixst , numbytes, firstreg, DST
-+ pixst , \numbytes, \firstreg, DST
- 10:
- .unreq WK4
- .endm
- 
- generate_composite_function \
- pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
- FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
- 2, /* prefetch distance */ \
-@@ -700,64 +705,64 @@ generate_composite_function \
- line_saved_regs STRIDE_D, ORIG_W
- .endm
- 
- .macro over_reverse_n_8888_newline
- mov STRIDE_D, #0xFF
- .endm
- 
- .macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-- pixld , numbytes, firstreg, DST, 0
-+ pixld , \numbytes, \firstreg, DST, 0
- .endm
- 
- .macro over_reverse_n_8888_1pixel d, is_only
-- teq WK&d, #0
-+ teq WK\()\d, #0
- beq 8f /* replace with source */
-- bics ORIG_W, STRIDE_D, WK&d, lsr #24
-- .if is_only == 1
-+ bics ORIG_W, STRIDE_D, WK\()\d, lsr #24
-+ .if \is_only == 1
- beq 49f /* skip store */
- .else
- beq 9f /* write same value back */
- .endif
- mla SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
- mla ORIG_W, STRIDE_M, ORIG_W, MASK /* alpha/green */
- uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
- uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
- mov SCRATCH, SCRATCH, ror #8
- sel ORIG_W, SCRATCH, ORIG_W
-- uqadd8 WK&d, WK&d, ORIG_W
-+ uqadd8 WK\()\d, WK\()\d, ORIG_W
- b 9f
--8: mov WK&d, SRC
-+8: mov WK\()\d, SRC
- 9:
- .endm
- 
- .macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
-- .if numbytes == 4
-- over_reverse_n_8888_1pixel reg1, 1
-+ .if \numbytes == 4
-+ over_reverse_n_8888_1pixel \reg1, 1
- .else
-- and SCRATCH, WK&reg1, WK&reg2
-- .if numbytes == 16
-- and SCRATCH, SCRATCH, WK&reg3
-- and SCRATCH, SCRATCH, WK&reg4
-+ and SCRATCH, WK\()\reg1, WK\()\reg2
-+ .if \numbytes == 16
-+ and SCRATCH, SCRATCH, WK\()\reg3
-+ and SCRATCH, SCRATCH, WK\()\reg4
- .endif
- mvns SCRATCH, SCRATCH, asr #24
- beq 49f /* skip store if all opaque */
-- over_reverse_n_8888_1pixel reg1, 0
-- over_reverse_n_8888_1pixel reg2, 0
-- .if numbytes == 16
-- over_reverse_n_8888_1pixel reg3, 0
-- over_reverse_n_8888_1pixel reg4, 0
-+ over_reverse_n_8888_1pixel \reg1, 0
-+ over_reverse_n_8888_1pixel \reg2, 0
-+ .if \numbytes == 16
-+ over_reverse_n_8888_1pixel \reg3, 0
-+ over_reverse_n_8888_1pixel \reg4, 0
- .endif
- .endif
-- pixst , numbytes, reg1, DST
-+ pixst , \numbytes, \reg1, DST
- 49:
- .endm
- 
- .macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
-- over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
-+ over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
- .endm
- 
- generate_composite_function \
- pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
- FLAG_DST_READWRITE | FLAG_BRANCH_OVER | 
FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \ - 3, /* prefetch distance */ \ - over_reverse_n_8888_init, \ - over_reverse_n_8888_newline, \ -@@ -789,30 +794,30 @@ generate_composite_function \ - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq WK4 - .endm - - .macro over_white_8888_8888_ca_combine m, d - uxtb16 TMP1, TMP0 /* rb_notmask */ -- uxtb16 TMP2, d /* rb_dest; 1 stall follows */ -+ uxtb16 TMP2, \d /* rb_dest; 1 stall follows */ - smlatt TMP3, TMP2, TMP1, HALF /* red */ - smlabb TMP2, TMP2, TMP1, HALF /* blue */ - uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */ -- uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */ -- smlatt d, TMP1, TMP0, HALF /* alpha */ -+ uxtb16 TMP1, \d, ror #8 /* ag_dest; 1 stall follows */ -+ smlatt \d, TMP1, TMP0, HALF /* alpha */ - smlabb TMP1, TMP1, TMP0, HALF /* green */ - pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */ -- pkhbt TMP1, TMP1, d, lsl #16 /* ag */ -+ pkhbt TMP1, TMP1, \d, lsl #16 /* ag */ - uxtab16 TMP0, TMP0, TMP0, ror #8 - uxtab16 TMP1, TMP1, TMP1, ror #8 - mov TMP0, TMP0, ror #8 -- sel d, TMP0, TMP1 -- uqadd8 d, d, m /* d is a late result */ -+ sel \d, TMP0, TMP1 -+ uqadd8 \d, \d, \m /* d is a late result */ - .endm - - .macro over_white_8888_8888_ca_1pixel_head - pixld , 4, 1, MASK, 0 - pixld , 4, 3, DST, 0 - .endm - - .macro over_white_8888_8888_ca_1pixel_tail -@@ -848,29 +853,29 @@ 02: mvn TMP0, WK2 - movcs WK4, WK2 - b 04f - 03: over_white_8888_8888_ca_combine WK2, WK4 - 04: pixst , 8, 3, DST - 05: - .endm - - .macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload -- .if numbytes == 4 -+ .if \numbytes == 4 - over_white_8888_8888_ca_1pixel_head - .else -- .if numbytes == 16 -+ .if \numbytes == 16 - over_white_8888_8888_ca_2pixels_head - over_white_8888_8888_ca_2pixels_tail - .endif - over_white_8888_8888_ca_2pixels_head - .endif - .endm - - .macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg -- .if numbytes == 4 -+ .if \numbytes == 4 - over_white_8888_8888_ca_1pixel_tail - .else - over_white_8888_8888_ca_2pixels_tail - .endif - .endm - - generate_composite_function \ - pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \ -@@ -999,33 +1004,33 @@ 20: /* No simplifications possible - - uqadd8 WK0, WK1, WK2 /* followed by 1 stall */ - 30: /* The destination buffer is already in the L1 cache, so - * there's little point in amalgamating writes */ - pixst , 4, 0, DST - 40: - .endm - - .macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload -- .rept (numbytes / 4) - 1 -+ .rept (\numbytes / 4) - 1 - over_n_8888_8888_ca_1pixel_head - over_n_8888_8888_ca_1pixel_tail - .endr - over_n_8888_8888_ca_1pixel_head - .endm - - .macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg - over_n_8888_8888_ca_1pixel_tail - .endm - - pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6 - ldr ip, [sp] - cmp ip, #-1 - beq pixman_composite_over_white_8888_8888_ca_asm_armv6 - /* else drop through... 
*/
-- .endfunc
-+ pixman_end_asm_function
- generate_composite_function \
- pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
- FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
- 2, /* prefetch distance */ \
- over_n_8888_8888_ca_init, \
- nop_macro, /* newline */ \
- over_n_8888_8888_ca_cleanup, \
- over_n_8888_8888_ca_process_head, \
-@@ -1040,94 +1045,94 @@ generate_composite_function \
- uadd8 SCRATCH, MASK, MASK
- /* Offset the source pointer: we only need the alpha bytes */
- add SRC, SRC, #3
- line_saved_regs ORIG_W
- .endm
- 
- .macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
- ldrb ORIG_W, [SRC], #4
-- .if numbytes >= 8
-- ldrb WK&reg1, [SRC], #4
-- .if numbytes == 16
-- ldrb WK&reg2, [SRC], #4
-- ldrb WK&reg3, [SRC], #4
-+ .if \numbytes >= 8
-+ ldrb WK\()\reg1, [SRC], #4
-+ .if \numbytes == 16
-+ ldrb WK\()\reg2, [SRC], #4
-+ ldrb WK\()\reg3, [SRC], #4
- .endif
- .endif
-- add DST, DST, #numbytes
-+ add DST, DST, #\numbytes
- .endm
- 
- .macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-- in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
-+ in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
- .endm
- 
- .macro in_reverse_8888_8888_1pixel s, d, offset, is_only
-- .if is_only != 1
-- movs s, ORIG_W
-- .if offset != 0
-- ldrb ORIG_W, [SRC, #offset]
-+ .if \is_only != 1
-+ movs \s, ORIG_W
-+ .if \offset != 0
-+ ldrb ORIG_W, [SRC, #\offset]
- .endif
- beq 01f
- teq STRIDE_M, #0xFF
- beq 02f
- .endif
-- uxtb16 SCRATCH, d /* rb_dest */
-- uxtb16 d, d, ror #8 /* ag_dest */
-- mla SCRATCH, SCRATCH, s, MASK
-- mla d, d, s, MASK
-+ uxtb16 SCRATCH, \d /* rb_dest */
-+ uxtb16 \d, \d, ror #8 /* ag_dest */
-+ mla SCRATCH, SCRATCH, \s, MASK
-+ mla \d, \d, \s, MASK
- uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
-- uxtab16 d, d, d, ror #8
-+ uxtab16 \d, \d, \d, ror #8
- mov SCRATCH, SCRATCH, ror #8
-- sel d, SCRATCH, d
-+ sel \d, SCRATCH, \d
- b 02f
-- .if offset == 0
-+ .if \offset == 0
- 48: /* Last mov d,#0 of the set - used as part of shortcut for
- * source values all 0 */
- .endif
--01: mov d, #0
-+01: mov \d, #0
- 02:
- .endm
- 
- .macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
-- .if numbytes == 4
-+ .if \numbytes == 4
- teq ORIG_W, ORIG_W, asr #32
-- ldrne WK&reg1, [DST, #-4]
-- .elseif numbytes == 8
-- teq ORIG_W, WK&reg1
-+ ldrne WK\()\reg1, [DST, #-4]
-+ .elseif \numbytes == 8
-+ teq ORIG_W, WK\()\reg1
- teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
-- ldmnedb DST, {WK&reg1-WK&reg2}
-+ ldmnedb DST, {WK\()\reg1-WK\()\reg2}
- .else
-- teq ORIG_W, WK&reg1
-- teqeq ORIG_W, WK&reg2
-- teqeq ORIG_W, WK&reg3
-+ teq ORIG_W, WK\()\reg1
-+ teqeq ORIG_W, WK\()\reg2
-+ teqeq ORIG_W, WK\()\reg3
- teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
-- ldmnedb DST, {WK&reg1-WK&reg4}
-+ ldmnedb DST, {WK\()\reg1-WK\()\reg4}
- .endif
- cmnne DST, #0 /* clear C if NE */
- bcs 49f /* no writes to dest if source all -1 */
- beq 48f /* set dest to all 0 if source all 0 */
-- .if numbytes == 4
-- in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
-- str WK&reg1, [DST, #-4]
-- .elseif numbytes == 8
-- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
-- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
-- stmdb DST, {WK&reg1-WK&reg2}
-+ .if \numbytes == 4
-+ in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
-+ str WK\()\reg1, [DST, #-4]
-+ .elseif \numbytes == 8
-+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
-+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
-+ stmdb DST, {WK\()\reg1-WK\()\reg2}
- .else
-- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
-- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
-- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
-- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
-- stmdb DST, {WK&reg1-WK&reg4}
-+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
-+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
-+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
-+ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
-+ stmdb DST, {WK\()\reg1-WK\()\reg4}
- .endif
- 49:
- .endm
- 
- .macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
-- in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
-+ in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
- .endm
- 
- generate_composite_function \
- pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
- FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
- 2, /* prefetch distance */ \
- in_reverse_8888_8888_init, \
- nop_macro, /* newline */ \
-@@ -1144,31 +1149,31 @@ generate_composite_function \
- /* Hold multiplier for destination in STRIDE_M */
- mov STRIDE_M, #255
- sub STRIDE_M, STRIDE_M, SRC, lsr #24
- /* Set GE[3:0] to 0101 so SEL instructions do what we want */
- uadd8 SCRATCH, MASK, MASK
- .endm
- 
- .macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
-- pixld , numbytes, firstreg, DST, 0
-+ pixld , \numbytes, \firstreg, DST, 0
- .endm
- 
- .macro over_n_8888_1pixel dst
-- mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK
-- uqadd8 WK&dst, WK&dst, SRC
-+ mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK
-+ uqadd8 WK\()\dst, WK\()\dst, SRC
- .endm
- 
- .macro over_n_8888_process_tail cond, numbytes, firstreg
-- .set PROCESS_REG, firstreg
-- .rept numbytes / 4
-+ .set PROCESS_REG, \firstreg
-+ .rept \numbytes / 4
- over_n_8888_1pixel %(PROCESS_REG)
- .set PROCESS_REG, PROCESS_REG+1
- .endr
-- pixst , numbytes, firstreg, DST
-+ pixst , \numbytes, \firstreg, DST
- .endm
- 
- generate_composite_function \
- pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \
- FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \
- 2, /* prefetch distance */ \
- over_n_8888_init, \
- nop_macro, /* newline */ \
-diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
---- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
-+++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
-@@ -107,88 +107,120 @@
- .set PREFETCH_TYPE_NONE, 0
- .set PREFETCH_TYPE_STANDARD, 1
- 
- /*
- * Definitions of macros for load/store of pixel data.
- */
- 
- .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
-- .if numbytes == 16
-- .if unaligned == 1
-- op&r&cond WK&reg0, [base], #4
-- op&r&cond WK&reg1, [base], #4
-- op&r&cond WK&reg2, [base], #4
-- op&r&cond WK&reg3, [base], #4
-+ .if \numbytes == 16
-+ .if \unaligned == 1
-+ \op\()r\()\cond WK\()\reg0, [\base], #4
-+ \op\()r\()\cond WK\()\reg1, [\base], #4
-+ \op\()r\()\cond WK\()\reg2, [\base], #4
-+ \op\()r\()\cond WK\()\reg3, [\base], #4
- .else
-- op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
-+#ifdef __clang__
-+ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
-+#else
-+ \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
-+#endif
- .endif
-- .elseif numbytes == 8
-- .if unaligned == 1
-- op&r&cond WK&reg0, [base], #4
-- op&r&cond WK&reg1, [base], #4
-+ .elseif \numbytes == 8
-+ .if \unaligned == 1
-+ \op\()r\()\cond WK\()\reg0, [\base], #4
-+ \op\()r\()\cond WK\()\reg1, [\base], #4
- .else
-- op&m&cond&ia base!, {WK&reg0,WK&reg1}
-+#ifdef __clang__
-+ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1}
-+#else
-+ \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1}
-+#endif
- .endif
-- .elseif numbytes == 4
-- op&r&cond WK&reg0, [base], #4
-- .elseif numbytes == 2
-- op&r&cond&h WK&reg0, [base], #2
-- .elseif numbytes == 1
-- op&r&cond&b WK&reg0, [base], #1
-+ .elseif \numbytes == 4
-+ \op\()r\()\cond WK\()\reg0, [\base], #4
-+ .elseif \numbytes == 2
-+#ifdef __clang__
-+ \op\()rh\()\cond WK\()\reg0, [\base], #2
-+#else
-+ \op\()r\()\cond\()h WK\()\reg0, [\base], #2
-+#endif
-+ .elseif \numbytes == 1
-+#ifdef __clang__
-+ \op\()rb\()\cond WK\()\reg0, [\base], #1
-+#else
-+ \op\()r\()\cond\()b WK\()\reg0, [\base], #1
-+#endif
- .else
-- .error "unsupported size: numbytes"
-+ .error "unsupported size: \numbytes"
- .endif
- .endm
- 
- .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
-- .if numbytes == 16
-- stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
-- .elseif numbytes == 8
-- stm&cond&db base, {WK&reg0,WK&reg1}
-- .elseif numbytes == 4
-- str&cond WK&reg0, [base, #-4]
-- .elseif numbytes == 2
-- str&cond&h WK&reg0, [base, #-2]
-- .elseif numbytes == 1
-- str&cond&b WK&reg0, [base, #-1]
-+ .if \numbytes == 16
-+#ifdef __clang__
-+ stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
-+#else
-+ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
-+#endif
-+ .elseif \numbytes == 8
-+#ifdef __clang__
-+ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
-+#else
-+ stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1}
-+#endif
-+ .elseif \numbytes == 4
-+ str\()\cond WK\()\reg0, [\base, #-4]
-+ .elseif \numbytes == 2
-+#ifdef __clang__
-+ strh\()\cond WK\()\reg0, [\base, #-2]
-+#else
-+ str\()\cond\()h WK\()\reg0, [\base, #-2]
-+#endif
-+ .elseif \numbytes == 1
-+#ifdef __clang__
-+ strb\()\cond WK\()\reg0, [\base, #-1]
-+#else
-+ str\()\cond\()b WK\()\reg0, [\base, #-1]
-+#endif
- .else
-- .error "unsupported size: numbytes"
-+ .error "unsupported size: \numbytes"
- .endif
- .endm
- 
- .macro pixld cond, numbytes, firstreg, base, unaligned
-- pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
-+ pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned
- .endm
- 
- .macro pixst cond, numbytes, firstreg, base
- .if (flags) & FLAG_DST_READWRITE
-- pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), 
%(\firstreg+2), %(\firstreg+3), \base - .else -- pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base -+ pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base - .endif - .endm - - .macro PF a, x:vararg - .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD) -- a x -+ \a \x - .endif - .endm - - - .macro preload_leading_step1 bpp, ptr, base - /* If the destination is already 16-byte aligned, then we need to preload - * between 0 and prefetch_distance (inclusive) cache lines ahead so there - * are no gaps when the inner loop starts. - */ -- .if bpp > 0 -- PF bic, ptr, base, #31 -+ .if \bpp > 0 -+ PF bic, \ptr, \base, #31 - .set OFFSET, 0 - .rept prefetch_distance+1 -- PF pld, [ptr, #OFFSET] -+ PF pld, [\ptr, #OFFSET] - .set OFFSET, OFFSET+32 - .endr - .endif - .endm - - .macro preload_leading_step2 bpp, bpp_shift, ptr, base - /* However, if the destination is not 16-byte aligned, we may need to - * preload more cache lines than that. The question we need to ask is: -@@ -196,81 +228,81 @@ - * by which the source pointer will be rounded down for preloading, and if - * so, by how many cache lines? Effectively, we want to calculate - * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp - * inner_loop_offset = (src+leading_bytes)&31 - * extra_needed = leading_bytes - inner_loop_offset - * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only - * possible when there are 4 src bytes for every 1 dst byte). - */ -- .if bpp > 0 -- .ifc base,DST -+ .if \bpp > 0 -+ .ifc \base,DST - /* The test can be simplified further when preloading the destination */ -- PF tst, base, #16 -+ PF tst, \base, #16 - PF beq, 61f - .else -- .if bpp/dst_w_bpp == 4 -- PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift -+ .if \bpp/dst_w_bpp == 4 -+ PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift - PF and, SCRATCH, SCRATCH, #31 -- PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift -+ PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift - PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ - PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */ - PF bcs, 61f - PF bpl, 60f - PF pld, [ptr, #32*(prefetch_distance+2)] - .else -- PF mov, SCRATCH, base, lsl #32-5 -- PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift -- PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift -+ PF mov, SCRATCH, \base, lsl #32-5 -+ PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift -+ PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift - PF bls, 61f - .endif - .endif --60: PF pld, [ptr, #32*(prefetch_distance+1)] -+60: PF pld, [\ptr, #32*(prefetch_distance+1)] - 61: - .endif - .endm - - #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) - .macro preload_middle bpp, base, scratch_holds_offset -- .if bpp > 0 -+ .if \bpp > 0 - /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ -- .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) -- .if scratch_holds_offset -- PF pld, [base, SCRATCH] -+ .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp) -+ .if \scratch_holds_offset -+ PF pld, [\base, SCRATCH] - .else -- PF bic, SCRATCH, base, #31 -+ PF bic, SCRATCH, \base, #31 - PF pld, [SCRATCH, #32*prefetch_distance] - .endif - .endif - .endif - .endm - - .macro preload_trailing bpp, bpp_shift, base -- .if bpp > 0 -- .if bpp*pix_per_block > 256 -+ .if \bpp > 0 -+ .if \bpp*pix_per_block > 256 - /* 
Calculations are more complex if more than one fetch per block */ -- PF and, WK1, base, #31 -- PF add, WK1, WK1, WK0, lsl #bpp_shift -- PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) -- PF bic, SCRATCH, base, #31 -+ PF and, WK1, \base, #31 -+ PF add, WK1, WK1, WK0, lsl #\bpp_shift -+ PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1) -+ PF bic, SCRATCH, \base, #31 - 80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] - PF add, SCRATCH, SCRATCH, #32 - PF subs, WK1, WK1, #32 - PF bhi, 80b - .else - /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ -- PF mov, SCRATCH, base, lsl #32-5 -- PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift -+ PF mov, SCRATCH, \base, lsl #32-5 -+ PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift - PF adceqs, SCRATCH, SCRATCH, #0 - /* The instruction above has two effects: ensures Z is only - * set if C was clear (so Z indicates that both shifted quantities - * were 0), and clears C if Z was set (so C indicates that the sum - * of the shifted quantities was greater and not equal to 32) */ - PF beq, 82f -- PF bic, SCRATCH, base, #31 -+ PF bic, SCRATCH, \base, #31 - PF bcc, 81f - PF pld, [SCRATCH, #32*(prefetch_distance+2)] - 81: PF pld, [SCRATCH, #32*(prefetch_distance+1)] - 82: - .endif - .endif - .endm - -@@ -283,97 +315,97 @@ 82: - * pixels) they cannot possibly straddle more than 2 32-byte cachelines, - * meaning there's no need for a loop. - * "bpp" - number of bits per pixel in the channel (source, mask or - * destination) that's being preloaded, or 0 if this channel is not used - * for reading - * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) - * "base" - base address register of channel to preload (SRC, MASK or DST) - */ -- .if bpp > 0 -- .if narrow_case && (bpp <= dst_w_bpp) -+ .if \bpp > 0 -+ .if \narrow_case && (\bpp <= dst_w_bpp) - /* In these cases, each line for each channel is in either 1 or 2 cache lines */ -- PF bic, WK0, base, #31 -+ PF bic, WK0, \base, #31 - PF pld, [WK0] -- PF add, WK1, base, X, LSL #bpp_shift -+ PF add, WK1, \base, X, LSL #\bpp_shift - PF sub, WK1, WK1, #1 - PF bic, WK1, WK1, #31 - PF cmp, WK1, WK0 - PF beq, 90f - PF pld, [WK1] - 90: - .else -- PF bic, WK0, base, #31 -+ PF bic, WK0, \base, #31 - PF pld, [WK0] -- PF add, WK1, base, X, lsl #bpp_shift -+ PF add, WK1, \base, X, lsl #\bpp_shift - PF sub, WK1, WK1, #1 - PF bic, WK1, WK1, #31 - PF cmp, WK1, WK0 - PF beq, 92f - 91: PF add, WK0, WK0, #32 - PF cmp, WK0, WK1 - PF pld, [WK0] - PF bne, 91b - 92: - .endif - .endif - .endm - - - .macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx -- process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 -- .if decrementx -- sub&cond X, X, #8*numbytes/dst_w_bpp -+ \process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0 -+ .if \decrementx -+ sub\()\cond X, X, #8*\numbytes/dst_w_bpp - .endif -- process_tail cond, numbytes, firstreg -+ \process_tail \cond, \numbytes, \firstreg - .if !((flags) & FLAG_PROCESS_DOES_STORE) -- pixst cond, numbytes, firstreg, DST -+ pixst \cond, \numbytes, \firstreg, DST - .endif - .endm - - .macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx - .if (flags) & FLAG_BRANCH_OVER -- .ifc cond,mi -+ .ifc \cond,mi - bpl 100f - .endif -- .ifc cond,cs -+ .ifc \cond,cs - bcc 100f - .endif -- .ifc cond,ne -+ .ifc \cond,ne - beq 100f - .endif -- 
conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx -+ conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx - 100: - .else -- conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx -+ conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx - .endif - .endm - - .macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx - .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) - /* Can't interleave reads and writes */ -- test -- conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx -+ \test -+ conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx - .if (flags) & FLAG_PROCESS_CORRUPTS_PSR -- test -+ \test - .endif -- conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx -+ conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx - .else - /* Can interleave reads and writes for better scheduling */ -- test -- process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 -- process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 -- .if decrementx -- sub&cond1 X, X, #8*numbytes1/dst_w_bpp -- sub&cond2 X, X, #8*numbytes2/dst_w_bpp -+ \test -+ \process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0 -+ \process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0 -+ .if \decrementx -+ sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp -+ sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp - .endif -- process_tail cond1, numbytes1, firstreg1 -- process_tail cond2, numbytes2, firstreg2 -- pixst cond1, numbytes1, firstreg1, DST -- pixst cond2, numbytes2, firstreg2, DST -+ \process_tail \cond1, \numbytes1, \firstreg1 -+ \process_tail \cond2, \numbytes2, \firstreg2 -+ pixst \cond1, \numbytes1, \firstreg1, DST -+ pixst \cond2, \numbytes2, \firstreg2, DST - .endif - .endm - - - .macro test_bits_1_0_ptr - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */ - .else -@@ -395,22 +427,22 @@ 100: - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - .set DECREMENT_X, 0 - sub X, X, WK0, lsr #dst_bpp_shift - str X, [sp, #LINE_SAVED_REG_COUNT*4] - mov X, WK0 - .endif - /* Use unaligned loads in all cases for simplicity */ - .if dst_w_bpp == 8 -- conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X -+ conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X - .elseif dst_w_bpp == 16 - test_bits_1_0_ptr -- conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X -+ conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X - .endif -- conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X -+ conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X - .if (flags) & 
FLAG_PROCESS_CORRUPTS_WK0 - ldr X, [sp, #LINE_SAVED_REG_COUNT*4] - .endif - .endm - - .macro test_bits_3_2_pix - movs SCRATCH, X, lsl #dst_bpp_shift+32-3 - .endm -@@ -419,169 +451,169 @@ 100: - .if dst_w_bpp == 8 - movs SCRATCH, X, lsl #dst_bpp_shift+32-1 - .else - movs SCRATCH, X, lsr #1 - .endif - .endm - - .macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask -- conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 -+ conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0 - .if dst_w_bpp == 16 - test_bits_1_0_pix -- conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 -+ conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0 - .elseif dst_w_bpp == 8 -- conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 -+ conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0 - .endif - .endm - - - .macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment - 110: - .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ - .rept pix_per_block*dst_w_bpp/128 -- process_head , 16, 0, unaligned_src, unaligned_mask, 1 -+ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 1 - .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - preload_middle src_bpp, SRC, 1 - .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - preload_middle mask_bpp, MASK, 1 - .else - preload_middle src_bpp, SRC, 0 - preload_middle mask_bpp, MASK, 0 - .endif - .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) - /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that - * destination prefetches are 32-byte aligned. It's also the easiest channel to offset - * preloads for, to achieve staggered prefetches for multiple channels, because there are - * always two STMs per prefetch, so there is always an opposite STM on which to put the - * preload. 
Note, no need to BIC the base register here */ -- PF pld, [DST, #32*prefetch_distance - dst_alignment] -+ PF pld, [DST, #32*prefetch_distance - \dst_alignment] - .endif -- process_tail , 16, 0 -+ \process_tail , 16, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 16, 0, DST - .endif - .set SUBBLOCK, SUBBLOCK+1 - .endr - subs X, X, #pix_per_block - bhs 110b - .endm - - .macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask - /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ - .if dst_r_bpp > 0 - tst DST, #16 - bne 111f -- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS -+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS - b 112f - 111: - .endif -- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS -+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS - 112: - /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ - .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) - PF and, WK0, X, #pix_per_block-1 - .endif - preload_trailing src_bpp, src_bpp_shift, SRC - preload_trailing mask_bpp, mask_bpp_shift, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_trailing dst_r_bpp, dst_bpp_shift, DST - .endif - add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp - /* The remainder of the line is handled identically to the medium case */ -- medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask -+ medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask - .endm - - .macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask - 120: -- process_head , 16, 0, unaligned_src, unaligned_mask, 0 -- process_tail , 16, 0 -+ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 0 -+ \process_tail , 16, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 16, 0, DST - .endif - subs X, X, #128/dst_w_bpp - bhs 120b - /* Trailing pixels */ - tst X, #128/dst_w_bpp - 1 -- beq exit_label -- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask -+ beq \exit_label -+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask - .endm - - .macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask - tst X, #16*8/dst_w_bpp -- conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 -+ conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0 - /* Trailing pixels */ - /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ -- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask -+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask - .endm - - .macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label - /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ - .if mask_bpp == 8 || mask_bpp == 16 - tst MASK, #3 - bne 141f - .endif - .if src_bpp == 
8 || src_bpp == 16 - tst SRC, #3 - bne 140f - .endif -- action process_head, process_tail, process_inner_loop, exit_label, 0, 0 -+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0 - .if src_bpp == 8 || src_bpp == 16 -- b exit_label -+ b \exit_label - 140: -- action process_head, process_tail, process_inner_loop, exit_label, 1, 0 -+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0 - .endif - .if mask_bpp == 8 || mask_bpp == 16 -- b exit_label -+ b \exit_label - 141: - .if src_bpp == 8 || src_bpp == 16 - tst SRC, #3 - bne 142f - .endif -- action process_head, process_tail, process_inner_loop, exit_label, 0, 1 -+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1 - .if src_bpp == 8 || src_bpp == 16 -- b exit_label -+ b \exit_label - 142: -- action process_head, process_tail, process_inner_loop, exit_label, 1, 1 -+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1 - .endif - .endif - .endm - - - .macro end_of_line restore_x, vars_spilled, loop_label, last_one -- .if vars_spilled -+ .if \vars_spilled - /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */ - /* This is ldmia sp,{} */ - .word 0xE89D0000 | LINE_SAVED_REGS - .endif - subs Y, Y, #1 -- .if vars_spilled -+ .if \vars_spilled - .if (LINE_SAVED_REGS) & (1<<1) - str Y, [sp] - .endif - .endif - add DST, DST, STRIDE_D - .if src_bpp > 0 - add SRC, SRC, STRIDE_S - .endif - .if mask_bpp > 0 - add MASK, MASK, STRIDE_M - .endif -- .if restore_x -+ .if \restore_x - mov X, ORIG_W - .endif -- bhs loop_label -- .ifc "last_one","" -- .if vars_spilled -+ bhs \loop_label -+ .ifc "\last_one","" -+ .if \vars_spilled - b 197f - .else - b 198f - .endif - .else -- .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) -+ .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) - b 198f - .endif - .endif - .endm - - - .macro generate_composite_function fname, \ - src_bpp_, \ -@@ -591,27 +623,27 @@ 142: - prefetch_distance_, \ - init, \ - newline, \ - cleanup, \ - process_head, \ - process_tail, \ - process_inner_loop - -- pixman_asm_function fname -+ pixman_asm_function \fname - - /* - * Make some macro arguments globally visible and accessible - * from other macros - */ -- .set src_bpp, src_bpp_ -- .set mask_bpp, mask_bpp_ -- .set dst_w_bpp, dst_w_bpp_ -- .set flags, flags_ -- .set prefetch_distance, prefetch_distance_ -+ .set src_bpp, \src_bpp_ -+ .set mask_bpp, \mask_bpp_ -+ .set dst_w_bpp, \dst_w_bpp_ -+ .set flags, \flags_ -+ .set prefetch_distance, \prefetch_distance_ - - /* - * Select prefetch type for this function. 
- */ - .if prefetch_distance == 0 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE - .else - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD -@@ -727,17 +759,17 @@ 142: - .endif - - #ifdef DEBUG_PARAMS - add Y, Y, #1 - stmia sp, {r0-r7,pc} - sub Y, Y, #1 - #endif - -- init -+ \init - - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - /* Reserve a word in which to store X during leading pixels */ - sub sp, sp, #4 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4 - .endif - -@@ -768,47 +800,47 @@ 142: - mov ORIG_W, X - .if (flags) & FLAG_SPILL_LINE_VARS_WIDE - /* This is stmdb sp!,{} */ - .word 0xE92D0000 | LINE_SAVED_REGS - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .endif - 151: /* New line */ -- newline -+ \newline - preload_leading_step1 src_bpp, WK1, SRC - preload_leading_step1 mask_bpp, WK2, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_leading_step1 dst_r_bpp, WK3, DST - .endif - - ands WK0, DST, #15 - beq 154f - rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ - - preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC - preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST - .endif - -- leading_15bytes process_head, process_tail -+ leading_15bytes \process_head, \process_tail - - 154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ - .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - and SCRATCH, SRC, #31 - rsb SCRATCH, SCRATCH, #32*prefetch_distance - .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - and SCRATCH, MASK, #31 - rsb SCRATCH, SCRATCH, #32*prefetch_distance - .endif -- .ifc "process_inner_loop","" -- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f -+ .ifc "\process_inner_loop","" -+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f - .else -- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f -+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f - .endif - - 157: /* Check for another line */ - end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b - .if (flags) & FLAG_SPILL_LINE_VARS_WIDE - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .endif -@@ -820,80 +852,80 @@ 160: /* Medium case */ - mov ORIG_W, X - .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE - /* This is stmdb sp!,{} */ - .word 0xE92D0000 | LINE_SAVED_REGS - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .endif - 161: /* New line */ -- newline -+ \newline - preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ - preload_line 0, mask_bpp, mask_bpp_shift, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_line 0, dst_r_bpp, dst_bpp_shift, DST - .endif - - sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ - ands WK0, DST, #15 - beq 164f - rsb WK0, WK0, #16 /* number of leading bytes until 
destination aligned */ - -- leading_15bytes process_head, process_tail -+ leading_15bytes \process_head, \process_tail - - 164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ -- switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f -+ switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f - - 167: /* Check for another line */ - end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b - - .ltorg - - 170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ - .if dst_w_bpp < 32 - mov ORIG_W, X - .endif - .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE - /* This is stmdb sp!,{} */ - .word 0xE92D0000 | LINE_SAVED_REGS - .endif - 171: /* New line */ -- newline -+ \newline - preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ - preload_line 1, mask_bpp, mask_bpp_shift, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_line 1, dst_r_bpp, dst_bpp_shift, DST - .endif - - .if dst_w_bpp == 8 - tst DST, #3 - beq 174f - 172: subs X, X, #1 - blo 177f -- process_head , 1, 0, 1, 1, 0 -- process_tail , 1, 0 -+ \process_head , 1, 0, 1, 1, 0 -+ \process_tail , 1, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 1, 0, DST - .endif - tst DST, #3 - bne 172b - .elseif dst_w_bpp == 16 - tst DST, #2 - beq 174f - subs X, X, #1 - blo 177f -- process_head , 2, 0, 1, 1, 0 -- process_tail , 2, 0 -+ \process_head , 2, 0, 1, 1, 0 -+ \process_tail , 2, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 2, 0, DST - .endif - .endif - - 174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ -- switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f -+ switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f - - 177: /* Check for another line */ - end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one - .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .endif - -@@ -903,17 +935,17 @@ 197: - .endif - 198: - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4 - add sp, sp, #4 - .endif - -- cleanup -+ \cleanup - - #ifdef DEBUG_PARAMS - add sp, sp, #9*4 /* junk the debug copy of arguments */ - #endif - 199: - pop {r4-r11, pc} /* exit */ - - .ltorg -@@ -927,23 +959,23 @@ 199: - .unreq MASK - .unreq STRIDE_M - .unreq WK0 - .unreq WK1 - .unreq WK2 - .unreq WK3 - .unreq SCRATCH - .unreq ORIG_W -- .endfunc -+ pixman_end_asm_function - .endm - - .macro line_saved_regs x:vararg - .set LINE_SAVED_REGS, 0 - .set LINE_SAVED_REG_COUNT, 0 -- .irp SAVED_REG,x -+ .irp SAVED_REG,\x - .ifc "SAVED_REG","Y" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif - .ifc "SAVED_REG","STRIDE_D" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif |