path: root/gfx/cairo/pixman-arm32-clang.patch
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
commit     26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree       f435a8308119effd964b339f76abb83a57c29483 /gfx/cairo/pixman-arm32-clang.patch
parent     Initial commit. (diff)
Adding upstream version 124.0.1. (upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'gfx/cairo/pixman-arm32-clang.patch')
-rw-r--r--  gfx/cairo/pixman-arm32-clang.patch  5205
1 files changed, 5205 insertions, 0 deletions
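
In brief, the patch below adapts pixman's ARM NEON assembly macros to clang's integrated assembler (see https://gitlab.freedesktop.org/pixman/pixman/-/issues/74): bare macro-parameter names gain an explicit backslash (reg1 -> \reg1), the binutils-specific '&' token concatenation becomes \(), the PF prefetch-macro operands gain a separating comma, a few pre-UAL conditional mnemonics (ldrgeb, subges, subpls) are remapped under __clang__, and .endfunc is replaced with pixman_end_asm_function. A condensed before/after sketch of the macro-syntax change, drawn from the hunks below and shown here for orientation only (it is not part of the patch):

    @ accepted by GNU as (binutils), rejected by clang's integrated assembler:
    .macro bilinear_load_8888 reg1, reg2, tmp
        vld1.32   {reg1}, [TMP1], STRIDE       @ bare parameter name
    .endm
    @ ...and, inside a macro that takes a 'src_fmt' parameter:
        bilinear_load_&src_fmt d0, d1, d2      @ '&' name concatenation

    @ equivalent form accepted by both assemblers, as used throughout this patch:
    .macro bilinear_load_8888 reg1, reg2, tmp
        vld1.32   {\reg1}, [TMP1], STRIDE      @ escaped parameter reference
    .endm
        bilinear_load_\()\src_fmt d0, d1, d2   @ \() separates name and parameter
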
diff --git a/gfx/cairo/pixman-arm32-clang.patch b/gfx/cairo/pixman-arm32-clang.patch
new file mode 100644
index 0000000000..cd9d61e470
--- /dev/null
+++ b/gfx/cairo/pixman-arm32-clang.patch
@@ -0,0 +1,5205 @@
+https://gitlab.freedesktop.org/pixman/pixman/-/issues/74
+
+diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
+--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
++++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
+@@ -77,206 +77,206 @@
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+ .macro bilinear_load_8888 reg1, reg2, tmp
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+- vld1.32 {reg1}, [TMP1], STRIDE
+- vld1.32 {reg2}, [TMP1]
++ vld1.32 {\reg1}, [TMP1], STRIDE
++ vld1.32 {\reg2}, [TMP1]
+ .endm
+
+ .macro bilinear_load_0565 reg1, reg2, tmp
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+- vld1.32 {reg2[0]}, [TMP1], STRIDE
+- vld1.32 {reg2[1]}, [TMP1]
+- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
++ vld1.32 {\reg2[0]}, [TMP1], STRIDE
++ vld1.32 {\reg2[1]}, [TMP1]
++ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
+ .endm
+
+ .macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+- bilinear_load_8888 reg1, reg2, tmp1
+- vmull.u8 acc1, reg1, d28
+- vmlal.u8 acc1, reg2, d29
+- bilinear_load_8888 reg3, reg4, tmp2
+- vmull.u8 acc2, reg3, d28
+- vmlal.u8 acc2, reg4, d29
++ bilinear_load_8888 \reg1, \reg2, \tmp1
++ vmull.u8 \acc1, \reg1, d28
++ vmlal.u8 \acc1, \reg2, d29
++ bilinear_load_8888 \reg3, \reg4, \tmp2
++ vmull.u8 \acc2, \reg3, d28
++ vmlal.u8 \acc2, \reg4, d29
+ .endm
+
+ .macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
++ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
+ .endm
+
+ .macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #1
+- vld1.32 {acc2lo[0]}, [TMP1], STRIDE
+- vld1.32 {acc2hi[0]}, [TMP2], STRIDE
+- vld1.32 {acc2lo[1]}, [TMP1]
+- vld1.32 {acc2hi[1]}, [TMP2]
+- convert_0565_to_x888 acc2, reg3, reg2, reg1
+- vzip.u8 reg1, reg3
+- vzip.u8 reg2, reg4
+- vzip.u8 reg3, reg4
+- vzip.u8 reg1, reg2
+- vmull.u8 acc1, reg1, d28
+- vmlal.u8 acc1, reg2, d29
+- vmull.u8 acc2, reg3, d28
+- vmlal.u8 acc2, reg4, d29
++ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
++ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
++ vld1.32 {\acc2lo[1]}, [TMP1]
++ vld1.32 {\acc2hi[1]}, [TMP2]
++ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
++ vzip.u8 \reg1, \reg3
++ vzip.u8 \reg2, \reg4
++ vzip.u8 \reg3, \reg4
++ vzip.u8 \reg1, \reg2
++ vmull.u8 \acc1, \reg1, d28
++ vmlal.u8 \acc1, \reg2, d29
++ vmull.u8 \acc2, \reg3, d28
++ vmlal.u8 \acc2, \reg4, d29
+ .endm
+
+ .macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #1
+- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
+- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
+- vld1.32 {xacc2lo[1]}, [TMP1]
+- vld1.32 {xacc2hi[1]}, [TMP2]
+- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
++ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
++ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
++ vld1.32 {\xacc2lo[1]}, [TMP1]
++ vld1.32 {\xacc2hi[1]}, [TMP2]
++ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #1
+- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
+- vzip.u8 xreg1, xreg3
+- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
+- vzip.u8 xreg2, xreg4
+- vld1.32 {yacc2lo[1]}, [TMP1]
+- vzip.u8 xreg3, xreg4
+- vld1.32 {yacc2hi[1]}, [TMP2]
+- vzip.u8 xreg1, xreg2
+- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+- vmull.u8 xacc1, xreg1, d28
+- vzip.u8 yreg1, yreg3
+- vmlal.u8 xacc1, xreg2, d29
+- vzip.u8 yreg2, yreg4
+- vmull.u8 xacc2, xreg3, d28
+- vzip.u8 yreg3, yreg4
+- vmlal.u8 xacc2, xreg4, d29
+- vzip.u8 yreg1, yreg2
+- vmull.u8 yacc1, yreg1, d28
+- vmlal.u8 yacc1, yreg2, d29
+- vmull.u8 yacc2, yreg3, d28
+- vmlal.u8 yacc2, yreg4, d29
++ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
++ vzip.u8 \xreg1, \xreg3
++ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
++ vzip.u8 \xreg2, \xreg4
++ vld1.32 {\yacc2lo[1]}, [TMP1]
++ vzip.u8 \xreg3, \xreg4
++ vld1.32 {\yacc2hi[1]}, [TMP2]
++ vzip.u8 \xreg1, \xreg2
++ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
++ vmull.u8 \xacc1, \xreg1, d28
++ vzip.u8 \yreg1, \yreg3
++ vmlal.u8 \xacc1, \xreg2, d29
++ vzip.u8 \yreg2, \yreg4
++ vmull.u8 \xacc2, \xreg3, d28
++ vzip.u8 \yreg3, \yreg4
++ vmlal.u8 \xacc2, \xreg4, d29
++ vzip.u8 \yreg1, \yreg2
++ vmull.u8 \yacc1, \yreg1, d28
++ vmlal.u8 \yacc1, \yreg2, d29
++ vmull.u8 \yacc2, \yreg3, d28
++ vmlal.u8 \yacc2, \yreg4, d29
+ .endm
+
+ .macro bilinear_store_8888 numpix, tmp1, tmp2
+-.if numpix == 4
++.if \numpix == 4
+ vst1.32 {d0, d1}, [OUT]!
+-.elseif numpix == 2
++.elseif \numpix == 2
+ vst1.32 {d0}, [OUT]!
+-.elseif numpix == 1
++.elseif \numpix == 1
+ vst1.32 {d0[0]}, [OUT, :32]!
+ .else
+ .error bilinear_store_8888 numpix is unsupported
+ .endif
+ .endm
+
+ .macro bilinear_store_0565 numpix, tmp1, tmp2
+ vuzp.u8 d0, d1
+ vuzp.u8 d2, d3
+ vuzp.u8 d1, d3
+ vuzp.u8 d0, d2
+- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+-.if numpix == 4
++ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
++.if \numpix == 4
+ vst1.16 {d2}, [OUT]!
+-.elseif numpix == 2
++.elseif \numpix == 2
+ vst1.32 {d2[0]}, [OUT]!
+-.elseif numpix == 1
++.elseif \numpix == 1
+ vst1.16 {d2[0]}, [OUT]!
+ .else
+ .error bilinear_store_0565 numpix is unsupported
+ .endif
+ .endm
+
+
+ /*
+ * Macros for loading mask pixels into register 'mask'.
+ * vdup must be done in somewhere else.
+ */
+ .macro bilinear_load_mask_x numpix, mask
+ .endm
+
+ .macro bilinear_load_mask_8 numpix, mask
+-.if numpix == 4
+- vld1.32 {mask[0]}, [MASK]!
+-.elseif numpix == 2
+- vld1.16 {mask[0]}, [MASK]!
+-.elseif numpix == 1
+- vld1.8 {mask[0]}, [MASK]!
++.if \numpix == 4
++ vld1.32 {\mask[0]}, [MASK]!
++.elseif \numpix == 2
++ vld1.16 {\mask[0]}, [MASK]!
++.elseif \numpix == 1
++ vld1.8 {\mask[0]}, [MASK]!
+ .else
+- .error bilinear_load_mask_8 numpix is unsupported
++ .error bilinear_load_mask_8 \numpix is unsupported
+ .endif
+ pld [MASK, #prefetch_offset]
+ .endm
+
+ .macro bilinear_load_mask mask_fmt, numpix, mask
+- bilinear_load_mask_&mask_fmt numpix, mask
++ bilinear_load_mask_\()\mask_fmt \numpix, \mask
+ .endm
+
+
+ /*
+ * Macros for loading destination pixels into register 'dst0' and 'dst1'.
+ * Interleave should be done somewhere else.
+ */
+ .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
+ .endm
+
+ .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
+ .endm
+
+ .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+-.if numpix == 4
+- vld1.32 {dst0, dst1}, [OUT]
+-.elseif numpix == 2
+- vld1.32 {dst0}, [OUT]
+-.elseif numpix == 1
+- vld1.32 {dst0[0]}, [OUT]
++.if \numpix == 4
++ vld1.32 {\dst0, \dst1}, [OUT]
++.elseif \numpix == 2
++ vld1.32 {\dst0}, [OUT]
++.elseif \numpix == 1
++ vld1.32 {\dst0[0]}, [OUT]
+ .else
+- .error bilinear_load_dst_8888 numpix is unsupported
++ .error bilinear_load_dst_8888 \numpix is unsupported
+ .endif
+ pld [OUT, #(prefetch_offset * 4)]
+ .endm
+
+ .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
+- bilinear_load_dst_8888 numpix, dst0, dst1, dst01
++ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
+ .endm
+
+ .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
+- bilinear_load_dst_8888 numpix, dst0, dst1, dst01
++ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
+ .endm
+
+ .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
+- bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
++ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
+ .endm
+
+ /*
+ * Macros for duplicating partially loaded mask to fill entire register.
+ * We will apply mask to interleaved source pixels, that is
+ * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ * So, we need to duplicate loaded mask into whole register.
+@@ -285,79 +285,79 @@
+ * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ * We can do some optimizations for this including last pixel cases.
+ */
+ .macro bilinear_duplicate_mask_x numpix, mask
+ .endm
+
+ .macro bilinear_duplicate_mask_8 numpix, mask
+-.if numpix == 4
+- vdup.32 mask, mask[0]
+-.elseif numpix == 2
+- vdup.16 mask, mask[0]
+-.elseif numpix == 1
+- vdup.8 mask, mask[0]
++.if \numpix == 4
++ vdup.32 \mask, \mask[0]
++.elseif \numpix == 2
++ vdup.16 \mask, \mask[0]
++.elseif \numpix == 1
++ vdup.8 \mask, \mask[0]
+ .else
+ .error bilinear_duplicate_mask_8 is unsupported
+ .endif
+ .endm
+
+ .macro bilinear_duplicate_mask mask_fmt, numpix, mask
+- bilinear_duplicate_mask_&mask_fmt numpix, mask
++ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
+ .endm
+
+ /*
+ * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
+ * Interleave should be done when maks is enabled or operator is 'over'.
+ */
+ .macro bilinear_interleave src0, src1, dst0, dst1
+- vuzp.8 src0, src1
+- vuzp.8 dst0, dst1
+- vuzp.8 src0, src1
+- vuzp.8 dst0, dst1
++ vuzp.8 \src0, \src1
++ vuzp.8 \dst0, \dst1
++ vuzp.8 \src0, \src1
++ vuzp.8 \dst0, \dst1
+ .endm
+
+ .macro bilinear_interleave_src_dst_x_src \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+ .endm
+
+ .macro bilinear_interleave_src_dst_x_over \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+- bilinear_interleave src0, src1, dst0, dst1
++ bilinear_interleave \src0, \src1, \dst0, \dst1
+ .endm
+
+ .macro bilinear_interleave_src_dst_x_add \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+ .endm
+
+ .macro bilinear_interleave_src_dst_8_src \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+- bilinear_interleave src0, src1, dst0, dst1
++ bilinear_interleave \src0, \src1, \dst0, \dst1
+ .endm
+
+ .macro bilinear_interleave_src_dst_8_over \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+- bilinear_interleave src0, src1, dst0, dst1
++ bilinear_interleave \src0, \src1, \dst0, \dst1
+ .endm
+
+ .macro bilinear_interleave_src_dst_8_add \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+
+- bilinear_interleave src0, src1, dst0, dst1
++ bilinear_interleave \src0, \src1, \dst0, \dst1
+ .endm
+
+ .macro bilinear_interleave_src_dst \
+ mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
+
+- bilinear_interleave_src_dst_&mask_fmt&_&op \
+- numpix, src0, src1, src01, dst0, dst1, dst01
++ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
++ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
+ .endm
+
+
+ /*
+ * Macros for applying masks to src pixels. (see combine_mask_u() function)
+ * src, dst should be in interleaved form.
+ * mask register should be in form (m0, m1, m2, m3).
+ */
+@@ -365,217 +365,217 @@
+ numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+ .endm
+
+ .macro bilinear_apply_mask_to_src_8 \
+ numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+
+- vmull.u8 tmp01, src0, mask
+- vmull.u8 tmp23, src1, mask
++ vmull.u8 \tmp01, \src0, \mask
++ vmull.u8 \tmp23, \src1, \mask
+ /* bubbles */
+- vrshr.u16 tmp45, tmp01, #8
+- vrshr.u16 tmp67, tmp23, #8
++ vrshr.u16 \tmp45, \tmp01, #8
++ vrshr.u16 \tmp67, \tmp23, #8
+ /* bubbles */
+- vraddhn.u16 src0, tmp45, tmp01
+- vraddhn.u16 src1, tmp67, tmp23
++ vraddhn.u16 \src0, \tmp45, \tmp01
++ vraddhn.u16 \src1, \tmp67, \tmp23
+ .endm
+
+ .macro bilinear_apply_mask_to_src \
+ mask_fmt, numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+
+- bilinear_apply_mask_to_src_&mask_fmt \
+- numpix, src0, src1, src01, mask, \
+- tmp01, tmp23, tmp45, tmp67
++ bilinear_apply_mask_to_src_\()\mask_fmt \
++ \numpix, \src0, \src1, \src01, \mask, \
++ \tmp01, \tmp23, \tmp45, \tmp67
+ .endm
+
+
+ /*
+ * Macros for combining src and destination pixels.
+ * Interleave or not is depending on operator 'op'.
+ */
+ .macro bilinear_combine_src \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+ .endm
+
+ .macro bilinear_combine_over \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+- vdup.32 tmp8, src1[1]
++ vdup.32 \tmp8, \src1[1]
+ /* bubbles */
+- vmvn.8 tmp8, tmp8
++ vmvn.8 \tmp8, \tmp8
+ /* bubbles */
+- vmull.u8 tmp01, dst0, tmp8
++ vmull.u8 \tmp01, \dst0, \tmp8
+ /* bubbles */
+- vmull.u8 tmp23, dst1, tmp8
++ vmull.u8 \tmp23, \dst1, \tmp8
+ /* bubbles */
+- vrshr.u16 tmp45, tmp01, #8
+- vrshr.u16 tmp67, tmp23, #8
++ vrshr.u16 \tmp45, \tmp01, #8
++ vrshr.u16 \tmp67, \tmp23, #8
+ /* bubbles */
+- vraddhn.u16 dst0, tmp45, tmp01
+- vraddhn.u16 dst1, tmp67, tmp23
++ vraddhn.u16 \dst0, \tmp45, \tmp01
++ vraddhn.u16 \dst1, \tmp67, \tmp23
+ /* bubbles */
+- vqadd.u8 src01, dst01, src01
++ vqadd.u8 \src01, \dst01, \src01
+ .endm
+
+ .macro bilinear_combine_add \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+- vqadd.u8 src01, dst01, src01
++ vqadd.u8 \src01, \dst01, \src01
+ .endm
+
+ .macro bilinear_combine \
+ op, numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+- bilinear_combine_&op \
+- numpix, src0, src1, src01, dst0, dst1, dst01, \
+- tmp01, tmp23, tmp45, tmp67, tmp8
++ bilinear_combine_\()\op \
++ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
++ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
+ .endm
+
+ /*
+ * Macros for final deinterleaving of destination pixels if needed.
+ */
+ .macro bilinear_deinterleave numpix, dst0, dst1, dst01
+- vuzp.8 dst0, dst1
++ vuzp.8 \dst0, \dst1
+ /* bubbles */
+- vuzp.8 dst0, dst1
++ vuzp.8 \dst0, \dst1
+ .endm
+
+ .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
+ .endm
+
+ .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
+- bilinear_deinterleave numpix, dst0, dst1, dst01
++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+ .endm
+
+ .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
+ .endm
+
+ .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
+- bilinear_deinterleave numpix, dst0, dst1, dst01
++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+ .endm
+
+ .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
+- bilinear_deinterleave numpix, dst0, dst1, dst01
++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+ .endm
+
+ .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
+- bilinear_deinterleave numpix, dst0, dst1, dst01
++ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
+ .endm
+
+ .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
+- bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
++ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
+ .endm
+
+
+ .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
+- bilinear_load_&src_fmt d0, d1, d2
+- bilinear_load_mask mask_fmt, 1, d4
+- bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
++ bilinear_load_\()\src_fmt d0, d1, d2
++ bilinear_load_mask \mask_fmt, 1, d4
++ bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9
+ vmull.u8 q1, d0, d28
+ vmlal.u8 q1, d1, d29
+ /* 5 cycles bubble */
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ /* 5 cycles bubble */
+- bilinear_duplicate_mask mask_fmt, 1, d4
++ bilinear_duplicate_mask \mask_fmt, 1, d4
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ /* 3 cycles bubble */
+ vmovn.u16 d0, q0
+ /* 1 cycle bubble */
+ bilinear_interleave_src_dst \
+- mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
++ \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9
+ bilinear_apply_mask_to_src \
+- mask_fmt, 1, d0, d1, q0, d4, \
++ \mask_fmt, 1, d0, d1, q0, d4, \
+ q3, q8, q10, q11
+ bilinear_combine \
+- op, 1, d0, d1, q0, d18, d19, q9, \
++ \op, 1, d0, d1, q0, d18, d19, q9, \
+ q3, q8, q10, q11, d5
+- bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
+- bilinear_store_&dst_fmt 1, q2, q3
++ bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0
++ bilinear_store_\()\dst_fmt 1, q2, q3
+ .endm
+
+ .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
+- bilinear_load_and_vertical_interpolate_two_&src_fmt \
++ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23
+- bilinear_load_mask mask_fmt, 2, d4
+- bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
++ bilinear_load_mask \mask_fmt, 2, d4
++ bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+- bilinear_duplicate_mask mask_fmt, 2, d4
++ bilinear_duplicate_mask \mask_fmt, 2, d4
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+ vmovn.u16 d0, q0
+ bilinear_interleave_src_dst \
+- mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
++ \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9
+ bilinear_apply_mask_to_src \
+- mask_fmt, 2, d0, d1, q0, d4, \
++ \mask_fmt, 2, d0, d1, q0, d4, \
+ q3, q8, q10, q11
+ bilinear_combine \
+- op, 2, d0, d1, q0, d18, d19, q9, \
++ \op, 2, d0, d1, q0, d18, d19, q9, \
+ q3, q8, q10, q11, d5
+- bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
+- bilinear_store_&dst_fmt 2, q2, q3
++ bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0
++ bilinear_store_\()\dst_fmt 2, q2, q3
+ .endm
+
+ .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
+- bilinear_load_and_vertical_interpolate_four_&src_fmt \
++ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23 \
+ q3, q9, d4, d5, d16, d17, d18, d19
+ pld [TMP1, PF_OFFS]
+ sub TMP1, TMP1, STRIDE
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q2, d6, d30
+ vmlal.u16 q2, d7, d30
+ vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
+- bilinear_load_mask mask_fmt, 4, d22
+- bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
++ bilinear_load_mask \mask_fmt, 4, d22
++ bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1
+ pld [TMP1, PF_OFFS]
+ vmlsl.u16 q8, d18, d31
+ vmlal.u16 q8, d19, d31
+ vadd.u16 q12, q12, q13
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
+- bilinear_duplicate_mask mask_fmt, 4, d22
++ bilinear_duplicate_mask \mask_fmt, 4, d22
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vmovn.u16 d0, q0
+ vmovn.u16 d1, q2
+ vadd.u16 q12, q12, q13
+ bilinear_interleave_src_dst \
+- mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
++ \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1
+ bilinear_apply_mask_to_src \
+- mask_fmt, 4, d0, d1, q0, d22, \
++ \mask_fmt, 4, d0, d1, q0, d22, \
+ q3, q8, q9, q10
+ bilinear_combine \
+- op, 4, d0, d1, q0, d2, d3, q1, \
++ \op, 4, d0, d1, q0, d2, d3, q1, \
+ q3, q8, q9, q10, d23
+- bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
+- bilinear_store_&dst_fmt 4, q2, q3
++ bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0
++ bilinear_store_\()\dst_fmt 4, q2, q3
+ .endm
+
+ .set BILINEAR_FLAG_USE_MASK, 1
+ .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+ /*
+ * Main template macro for generating NEON optimized bilinear scanline functions.
+ *
+@@ -605,24 +605,24 @@
+ bilinear_process_four_pixels, \
+ bilinear_process_pixblock_head, \
+ bilinear_process_pixblock_tail, \
+ bilinear_process_pixblock_tail_head, \
+ pixblock_size, \
+ prefetch_distance, \
+ flags
+
+-pixman_asm_function fname
+-.if pixblock_size == 8
+-.elseif pixblock_size == 4
++pixman_asm_function \fname
++.if \pixblock_size == 8
++.elseif \pixblock_size == 4
+ .else
+ .error unsupported pixblock size
+ .endif
+
+-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
++.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
+ OUT .req r0
+ TOP .req r1
+ BOTTOM .req r2
+ WT .req r3
+ WB .req r4
+ X .req r5
+ UX .req r6
+ WIDTH .req ip
+@@ -630,17 +630,17 @@ pixman_asm_function fname
+ TMP2 .req r4
+ PF_OFFS .req r7
+ TMP3 .req r8
+ TMP4 .req r9
+ STRIDE .req r2
+
+ mov ip, sp
+ push {r4, r5, r6, r7, r8, r9}
+- mov PF_OFFS, #prefetch_distance
++ mov PF_OFFS, #\prefetch_distance
+ ldmia ip, {WB, X, UX, WIDTH}
+ .else
+ OUT .req r0
+ MASK .req r1
+ TOP .req r2
+ BOTTOM .req r3
+ WT .req r4
+ WB .req r5
+@@ -649,27 +649,27 @@ pixman_asm_function fname
+ WIDTH .req ip
+ TMP1 .req r4
+ TMP2 .req r5
+ PF_OFFS .req r8
+ TMP3 .req r9
+ TMP4 .req r10
+ STRIDE .req r3
+
+- .set prefetch_offset, prefetch_distance
++ .set prefetch_offset, \prefetch_distance
+
+ mov ip, sp
+ push {r4, r5, r6, r7, r8, r9, r10, ip}
+- mov PF_OFFS, #prefetch_distance
++ mov PF_OFFS, #\prefetch_distance
+ ldmia ip, {WT, WB, X, UX, WIDTH}
+ .endif
+
+ mul PF_OFFS, PF_OFFS, UX
+
+-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
++.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpush {d8-d15}
+ .endif
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
+ cmp WIDTH, #0
+ ble 3f
+@@ -678,76 +678,76 @@ pixman_asm_function fname
+ vdup.u16 q13, UX
+ vdup.u8 d28, WT
+ vdup.u8 d29, WB
+ vadd.u16 d25, d25, d26
+
+ /* ensure good destination alignment */
+ cmp WIDTH, #1
+ blt 0f
+- tst OUT, #(1 << dst_bpp_shift)
++ tst OUT, #(1 << \dst_bpp_shift)
+ beq 0f
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+- bilinear_process_last_pixel
++ \bilinear_process_last_pixel
+ sub WIDTH, WIDTH, #1
+ 0:
+ vadd.u16 q13, q13, q13
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+
+ cmp WIDTH, #2
+ blt 0f
+- tst OUT, #(1 << (dst_bpp_shift + 1))
++ tst OUT, #(1 << (\dst_bpp_shift + 1))
+ beq 0f
+- bilinear_process_two_pixels
++ \bilinear_process_two_pixels
+ sub WIDTH, WIDTH, #2
+ 0:
+-.if pixblock_size == 8
++.if \pixblock_size == 8
+ cmp WIDTH, #4
+ blt 0f
+- tst OUT, #(1 << (dst_bpp_shift + 2))
++ tst OUT, #(1 << (\dst_bpp_shift + 2))
+ beq 0f
+- bilinear_process_four_pixels
++ \bilinear_process_four_pixels
+ sub WIDTH, WIDTH, #4
+ 0:
+ .endif
+- subs WIDTH, WIDTH, #pixblock_size
++ subs WIDTH, WIDTH, #\pixblock_size
+ blt 1f
+- mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+- bilinear_process_pixblock_head
+- subs WIDTH, WIDTH, #pixblock_size
++ mov PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)
++ \bilinear_process_pixblock_head
++ subs WIDTH, WIDTH, #\pixblock_size
+ blt 5f
+ 0:
+- bilinear_process_pixblock_tail_head
+- subs WIDTH, WIDTH, #pixblock_size
++ \bilinear_process_pixblock_tail_head
++ subs WIDTH, WIDTH, #\pixblock_size
+ bge 0b
+ 5:
+- bilinear_process_pixblock_tail
++ \bilinear_process_pixblock_tail
+ 1:
+-.if pixblock_size == 8
++.if \pixblock_size == 8
+ tst WIDTH, #4
+ beq 2f
+- bilinear_process_four_pixels
++ \bilinear_process_four_pixels
+ 2:
+ .endif
+ /* handle the remaining trailing pixels */
+ tst WIDTH, #2
+ beq 2f
+- bilinear_process_two_pixels
++ \bilinear_process_two_pixels
+ 2:
+ tst WIDTH, #1
+ beq 3f
+- bilinear_process_last_pixel
++ \bilinear_process_last_pixel
+ 3:
+-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
++.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpop {d8-d15}
+ .endif
+
+-.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
++.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
+ pop {r4, r5, r6, r7, r8, r9}
+ .else
+ pop {r4, r5, r6, r7, r8, r9, r10, ip}
+ .endif
+ bx lr
+
+ .unreq OUT
+ .unreq TOP
+@@ -757,21 +757,21 @@ 3:
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+ .unreq STRIDE
+-.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
++.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
+ .unreq MASK
+ .endif
+
+-.endfunc
++pixman_end_asm_function
+
+ .endm
+
+ /* src_8888_8_8888 */
+ .macro bilinear_src_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, src
+ .endm
+
+diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
+--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
++++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
+@@ -29,16 +29,22 @@
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for following functions:
+ * - pixman_composite_over_8888_0565_asm_neon
+ * - pixman_composite_over_n_8_0565_asm_neon
+ */
+
++#ifdef __clang__
++#define ldrgeb ldrbge
++#define subges subsge
++#define subpls subspl
++#endif
++
+ /* Prevent the stack from becoming executable for no reason... */
+ #if defined(__linux__) && defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+ #endif
+
+ .text
+ .fpu neon
+ .arch armv7a
+@@ -255,43 +261,43 @@
+ vqadd.u8 d16, d2, d20
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vqadd.u8 q9, q0, q11
+ vshrn.u16 d6, q2, #8
+ fetch_src_pixblock
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vshll.u8 q14, d16, #8
+- PF add PF_X, PF_X, #8
++ PF add, PF_X, PF_X, #8
+ vshll.u8 q8, d19, #8
+- PF tst PF_CTL, #0xF
++ PF tst, PF_CTL, #0xF
+ vsri.u8 d6, d6, #5
+- PF addne PF_X, PF_X, #8
++ PF addne, PF_X, PF_X, #8
+ vmvn.8 d3, d3
+- PF subne PF_CTL, PF_CTL, #1
++ PF subne, PF_CTL, PF_CTL, #1
+ vsri.u8 d7, d7, #6
+ vshrn.u16 d30, q2, #2
+ vmull.u8 q10, d3, d6
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmull.u8 q11, d3, d7
+ vmull.u8 q12, d3, d30
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vsri.u16 q14, q8, #5
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ vshll.u8 q9, d18, #8
+ vrshr.u16 q13, q10, #8
+- PF subge PF_X, PF_X, ORIG_W
++ PF subge, PF_X, PF_X, ORIG_W
+ vrshr.u16 q3, q11, #8
+ vrshr.u16 q15, q12, #8
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subges, PF_CTL, PF_CTL, #0x10
+ vsri.u16 q14, q9, #11
+- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vraddhn.u16 d20, q10, q13
+ vraddhn.u16 d23, q11, q3
+- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vraddhn.u16 d22, q12, q15
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ .endm
+
+ #else
+
+ /* If we did not care much about the performance, we would just use this... */
+ .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+@@ -429,30 +435,30 @@ generate_composite_function \
+
+ .macro pixman_composite_src_8888_0565_process_pixblock_tail
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+ .endm
+
+ .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+ vsri.u16 q14, q8, #5
+- PF add PF_X, PF_X, #8
+- PF tst PF_CTL, #0xF
++ PF add, PF_X, PF_X, #8
++ PF tst, PF_CTL, #0xF
+ fetch_src_pixblock
+- PF addne PF_X, PF_X, #8
+- PF subne PF_CTL, PF_CTL, #1
++ PF addne, PF_X, PF_X, #8
++ PF subne, PF_CTL, PF_CTL, #1
+ vsri.u16 q14, q9, #11
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vshll.u8 q8, d1, #8
+ vst1.16 {d28, d29}, [DST_W, :128]!
+- PF subge PF_X, PF_X, ORIG_W
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subge, PF_X, PF_X, ORIG_W
++ PF subges, PF_CTL, PF_CTL, #0x10
+ vshll.u8 q14, d2, #8
+- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vshll.u8 q9, d0, #8
+ .endm
+
+ generate_composite_function \
+ pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+@@ -504,30 +510,30 @@ generate_composite_function \
+ vqadd.u8 q15, q1, q3
+ .endm
+
+ .macro pixman_composite_add_8_8_process_pixblock_tail
+ .endm
+
+ .macro pixman_composite_add_8_8_process_pixblock_tail_head
+ fetch_src_pixblock
+- PF add PF_X, PF_X, #32
+- PF tst PF_CTL, #0xF
++ PF add, PF_X, PF_X, #32
++ PF tst, PF_CTL, #0xF
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+- PF addne PF_X, PF_X, #32
+- PF subne PF_CTL, PF_CTL, #1
++ PF addne, PF_X, PF_X, #32
++ PF subne, PF_CTL, PF_CTL, #1
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+- PF subge PF_X, PF_X, ORIG_W
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subge, PF_X, PF_X, ORIG_W
++ PF subges, PF_CTL, PF_CTL, #0x10
+ vqadd.u8 q14, q0, q2
+- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vqadd.u8 q15, q1, q3
+ .endm
+
+ generate_composite_function \
+ pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+@@ -536,30 +542,30 @@ generate_composite_function \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_process_pixblock_tail_head
+
+ /******************************************************************************/
+
+ .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+- PF add PF_X, PF_X, #8
+- PF tst PF_CTL, #0xF
++ PF add, PF_X, PF_X, #8
++ PF tst, PF_CTL, #0xF
+ vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
+- PF addne PF_X, PF_X, #8
+- PF subne PF_CTL, PF_CTL, #1
++ PF addne, PF_X, PF_X, #8
++ PF subne, PF_CTL, PF_CTL, #1
+ vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+- PF subge PF_X, PF_X, ORIG_W
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subge, PF_X, PF_X, ORIG_W
++ PF subges, PF_CTL, PF_CTL, #0x10
+ vqadd.u8 q14, q0, q2
+- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vqadd.u8 q15, q1, q3
+ .endm
+
+ generate_composite_function \
+ pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+@@ -599,40 +605,40 @@ generate_composite_function_single_scanl
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ .endm
+
+ .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q14, q8, #8
+- PF add PF_X, PF_X, #8
+- PF tst PF_CTL, #0xF
++ PF add, PF_X, PF_X, #8
++ PF tst, PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+- PF addne PF_X, PF_X, #8
+- PF subne PF_CTL, PF_CTL, #1
++ PF addne, PF_X, PF_X, #8
++ PF subne, PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ fetch_src_pixblock
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+- PF subge PF_X, PF_X, ORIG_W
++ PF subge, PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subsge, PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vmull.u8 q10, d22, d6
+- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+ .endm
+
+ generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+@@ -651,42 +657,42 @@ generate_composite_function_single_scanl
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ .endm
+
+ .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q14, q8, #8
+- PF add PF_X, PF_X, #8
+- PF tst PF_CTL, #0xF
++ PF add, PF_X, PF_X, #8
++ PF tst, PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+- PF addne PF_X, PF_X, #8
+- PF subne PF_CTL, PF_CTL, #1
++ PF addne, PF_X, PF_X, #8
++ PF subne, PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ fetch_src_pixblock
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+- PF subge PF_X, PF_X, ORIG_W
++ PF subge, PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subges, PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vmull.u8 q10, d22, d6
+- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+ .endm
+
+ generate_composite_function \
+ pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+@@ -737,30 +743,30 @@ generate_composite_function_single_scanl
+ vrshr.u16 q2, q10, #8
+ vrshr.u16 q3, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q2, q10
+ vraddhn.u16 d31, q3, q11
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vqadd.u8 q14, q0, q14
+- PF add PF_X, PF_X, #8
+- PF tst PF_CTL, #0x0F
+- PF addne PF_X, PF_X, #8
+- PF subne PF_CTL, PF_CTL, #1
++ PF add, PF_X, PF_X, #8
++ PF tst, PF_CTL, #0x0F
++ PF addne, PF_X, PF_X, #8
++ PF subne, PF_CTL, PF_CTL, #1
+ vqadd.u8 q15, q1, q15
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ vmull.u8 q8, d24, d4
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vmull.u8 q9, d24, d5
+- PF subge PF_X, PF_X, ORIG_W
++ PF subge, PF_X, PF_X, ORIG_W
+ vmull.u8 q10, d24, d6
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subges, PF_CTL, PF_CTL, #0x10
+ vmull.u8 q11, d24, d7
+- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ .endm
+
+ .macro pixman_composite_over_n_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+@@ -779,40 +785,40 @@ generate_composite_function \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_n_8888_process_pixblock_tail_head
+
+ /******************************************************************************/
+
+ .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
+ vrshr.u16 q14, q8, #8
+- PF add PF_X, PF_X, #8
+- PF tst PF_CTL, #0xF
++ PF add, PF_X, PF_X, #8
++ PF tst, PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+- PF addne PF_X, PF_X, #8
+- PF subne PF_CTL, PF_CTL, #1
++ PF addne, PF_X, PF_X, #8
++ PF subne, PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+- PF subge PF_X, PF_X, ORIG_W
++ PF subge, PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subges, PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+ vmull.u8 q10, d22, d6
+- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+ .endm
+
+ .macro pixman_composite_over_reverse_n_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d7[0]}, [DUMMY]
+ vdup.8 d4, d7[0]
+ vdup.8 d5, d7[1]
+@@ -1240,33 +1246,33 @@ generate_composite_function \
+ vrshrn.u16 d28, q8, #8
+ vrshrn.u16 d29, q9, #8
+ vrshrn.u16 d30, q10, #8
+ vrshrn.u16 d31, q11, #8
+ .endm
+
+ .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
+ fetch_mask_pixblock
+- PF add PF_X, PF_X, #8
++ PF add, PF_X, PF_X, #8
+ vrshrn.u16 d28, q8, #8
+- PF tst PF_CTL, #0x0F
++ PF tst, PF_CTL, #0x0F
+ vrshrn.u16 d29, q9, #8
+- PF addne PF_X, PF_X, #8
++ PF addne, PF_X, PF_X, #8
+ vrshrn.u16 d30, q10, #8
+- PF subne PF_CTL, PF_CTL, #1
++ PF subne, PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q11, #8
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ vmull.u8 q8, d24, d0
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q9, d24, d1
+- PF subge PF_X, PF_X, ORIG_W
++ PF subge, PF_X, PF_X, ORIG_W
+ vmull.u8 q10, d24, d2
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subges, PF_CTL, PF_CTL, #0x10
+ vmull.u8 q11, d24, d3
+- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+ .endm
+
+ .macro pixman_composite_src_n_8_8888_init
+@@ -1309,33 +1315,33 @@ generate_composite_function \
+ vrshrn.u16 d28, q0, #8
+ vrshrn.u16 d29, q1, #8
+ vrshrn.u16 d30, q2, #8
+ vrshrn.u16 d31, q3, #8
+ .endm
+
+ .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
+ fetch_mask_pixblock
+- PF add PF_X, PF_X, #8
++ PF add, PF_X, PF_X, #8
+ vrshrn.u16 d28, q0, #8
+- PF tst PF_CTL, #0x0F
++ PF tst, PF_CTL, #0x0F
+ vrshrn.u16 d29, q1, #8
+- PF addne PF_X, PF_X, #8
++ PF addne, PF_X, PF_X, #8
+ vrshrn.u16 d30, q2, #8
+- PF subne PF_CTL, PF_CTL, #1
++ PF subne, PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q3, #8
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ vmull.u8 q0, d24, d16
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q1, d25, d16
+- PF subge PF_X, PF_X, ORIG_W
++ PF subge, PF_X, PF_X, ORIG_W
+ vmull.u8 q2, d26, d16
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subges, PF_CTL, PF_CTL, #0x10
+ vmull.u8 q3, d27, d16
+- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vrsra.u16 q0, q0, #8
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+ .endm
+
+ .macro pixman_composite_src_n_8_8_init
+@@ -1403,37 +1409,37 @@ generate_composite_function \
+ .endm
+
+ .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
+ vrshr.u16 q14, q8, #8
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q15, q9, #8
+ fetch_mask_pixblock
+ vrshr.u16 q6, q10, #8
+- PF add PF_X, PF_X, #8
++ PF add, PF_X, PF_X, #8
+ vrshr.u16 q7, q11, #8
+- PF tst PF_CTL, #0x0F
++ PF tst, PF_CTL, #0x0F
+ vraddhn.u16 d28, q14, q8
+- PF addne PF_X, PF_X, #8
++ PF addne, PF_X, PF_X, #8
+ vraddhn.u16 d29, q15, q9
+- PF subne PF_CTL, PF_CTL, #1
++ PF subne, PF_CTL, PF_CTL, #1
+ vraddhn.u16 d30, q6, q10
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ vraddhn.u16 d31, q7, q11
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vmull.u8 q6, d24, d8
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q7, d24, d9
+- PF subge PF_X, PF_X, ORIG_W
++ PF subge, PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d24, d10
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subges, PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d24, d11
+- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vqadd.u8 q14, q0, q14
+- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vqadd.u8 q15, q1, q15
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q11, q7, #8
+ vrshr.u16 q12, q8, #8
+ vrshr.u16 q13, q9, #8
+ vraddhn.u16 d0, q6, q10
+ vraddhn.u16 d1, q7, q11
+ vraddhn.u16 d2, q8, q12
+@@ -2420,31 +2426,31 @@ generate_composite_function \
+
+ .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ fetch_src_pixblock
+ vraddhn.u16 d30, q11, q8
+- PF add PF_X, PF_X, #8
+- PF tst PF_CTL, #0xF
+- PF addne PF_X, PF_X, #8
+- PF subne PF_CTL, PF_CTL, #1
++ PF add, PF_X, PF_X, #8
++ PF tst, PF_CTL, #0xF
++ PF addne, PF_X, PF_X, #8
++ PF subne, PF_CTL, PF_CTL, #1
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d28, q13, q10
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+- PF subge PF_X, PF_X, ORIG_W
+- PF subges PF_CTL, PF_CTL, #0x10
+- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++ PF subge, PF_X, PF_X, ORIG_W
++ PF subges, PF_CTL, PF_CTL, #0x10
++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ .endm
+
+ generate_composite_function \
+ pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+@@ -2477,31 +2483,31 @@ generate_composite_function \
+
+ .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ fetch_src_pixblock
+ vraddhn.u16 d28, q11, q8
+- PF add PF_X, PF_X, #8
+- PF tst PF_CTL, #0xF
+- PF addne PF_X, PF_X, #8
+- PF subne PF_CTL, PF_CTL, #1
++ PF add, PF_X, PF_X, #8
++ PF tst, PF_CTL, #0xF
++ PF addne, PF_X, PF_X, #8
++ PF subne, PF_CTL, PF_CTL, #1
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d30, q13, q10
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+- PF cmp PF_X, ORIG_W
++ PF cmp, PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+- PF subge PF_X, PF_X, ORIG_W
+- PF subges PF_CTL, PF_CTL, #0x10
+- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++ PF subge, PF_X, PF_X, ORIG_W
++ PF subges, PF_CTL, PF_CTL, #0x10
++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ .endm
+
+ generate_composite_function \
+ pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+@@ -2836,182 +2842,182 @@ generate_composite_function_nearest_scan
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+ .macro bilinear_load_8888 reg1, reg2, tmp
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+- vld1.32 {reg1}, [TMP1], STRIDE
+- vld1.32 {reg2}, [TMP1]
++ vld1.32 {\reg1}, [TMP1], STRIDE
++ vld1.32 {\reg2}, [TMP1]
+ .endm
+
+ .macro bilinear_load_0565 reg1, reg2, tmp
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+- vld1.32 {reg2[0]}, [TMP1], STRIDE
+- vld1.32 {reg2[1]}, [TMP1]
+- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
++ vld1.32 {\reg2[0]}, [TMP1], STRIDE
++ vld1.32 {\reg2[1]}, [TMP1]
++ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
+ .endm
+
+ .macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+- bilinear_load_8888 reg1, reg2, tmp1
+- vmull.u8 acc1, reg1, d28
+- vmlal.u8 acc1, reg2, d29
+- bilinear_load_8888 reg3, reg4, tmp2
+- vmull.u8 acc2, reg3, d28
+- vmlal.u8 acc2, reg4, d29
++ bilinear_load_8888 \reg1, \reg2, \tmp1
++ vmull.u8 \acc1, \reg1, d28
++ vmlal.u8 \acc1, \reg2, d29
++ bilinear_load_8888 \reg3, \reg4, \tmp2
++ vmull.u8 \acc2, \reg3, d28
++ vmlal.u8 \acc2, \reg4, d29
+ .endm
+
+ .macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
++ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
+ .endm
+
+ .macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #1
+- vld1.32 {acc2lo[0]}, [TMP1], STRIDE
+- vld1.32 {acc2hi[0]}, [TMP2], STRIDE
+- vld1.32 {acc2lo[1]}, [TMP1]
+- vld1.32 {acc2hi[1]}, [TMP2]
+- convert_0565_to_x888 acc2, reg3, reg2, reg1
+- vzip.u8 reg1, reg3
+- vzip.u8 reg2, reg4
+- vzip.u8 reg3, reg4
+- vzip.u8 reg1, reg2
+- vmull.u8 acc1, reg1, d28
+- vmlal.u8 acc1, reg2, d29
+- vmull.u8 acc2, reg3, d28
+- vmlal.u8 acc2, reg4, d29
++ vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
++ vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
++ vld1.32 {\acc2lo[1]}, [TMP1]
++ vld1.32 {\acc2hi[1]}, [TMP2]
++ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
++ vzip.u8 \reg1, \reg3
++ vzip.u8 \reg2, \reg4
++ vzip.u8 \reg3, \reg4
++ vzip.u8 \reg1, \reg2
++ vmull.u8 \acc1, \reg1, d28
++ vmlal.u8 \acc1, \reg2, d29
++ vmull.u8 \acc2, \reg3, d28
++ vmlal.u8 \acc2, \reg4, d29
+ .endm
+
+ .macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #1
+- vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
+- vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
+- vld1.32 {xacc2lo[1]}, [TMP1]
+- vld1.32 {xacc2hi[1]}, [TMP2]
+- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
++ vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
++ vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
++ vld1.32 {\xacc2lo[1]}, [TMP1]
++ vld1.32 {\xacc2hi[1]}, [TMP2]
++ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #1
+- vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
+- vzip.u8 xreg1, xreg3
+- vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
+- vzip.u8 xreg2, xreg4
+- vld1.32 {yacc2lo[1]}, [TMP1]
+- vzip.u8 xreg3, xreg4
+- vld1.32 {yacc2hi[1]}, [TMP2]
+- vzip.u8 xreg1, xreg2
+- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+- vmull.u8 xacc1, xreg1, d28
+- vzip.u8 yreg1, yreg3
+- vmlal.u8 xacc1, xreg2, d29
+- vzip.u8 yreg2, yreg4
+- vmull.u8 xacc2, xreg3, d28
+- vzip.u8 yreg3, yreg4
+- vmlal.u8 xacc2, xreg4, d29
+- vzip.u8 yreg1, yreg2
+- vmull.u8 yacc1, yreg1, d28
+- vmlal.u8 yacc1, yreg2, d29
+- vmull.u8 yacc2, yreg3, d28
+- vmlal.u8 yacc2, yreg4, d29
++ vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
++ vzip.u8 \xreg1, \xreg3
++ vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
++ vzip.u8 \xreg2, \xreg4
++ vld1.32 {\yacc2lo[1]}, [TMP1]
++ vzip.u8 \xreg3, \xreg4
++ vld1.32 {\yacc2hi[1]}, [TMP2]
++ vzip.u8 \xreg1, \xreg2
++ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
++ vmull.u8 \xacc1, \xreg1, d28
++ vzip.u8 \yreg1, \yreg3
++ vmlal.u8 \xacc1, \xreg2, d29
++ vzip.u8 \yreg2, \yreg4
++ vmull.u8 \xacc2, \xreg3, d28
++ vzip.u8 \yreg3, \yreg4
++ vmlal.u8 \xacc2, \xreg4, d29
++ vzip.u8 \yreg1, \yreg2
++ vmull.u8 \yacc1, \yreg1, d28
++ vmlal.u8 \yacc1, \yreg2, d29
++ vmull.u8 \yacc2, \yreg3, d28
++ vmlal.u8 \yacc2, \yreg4, d29
+ .endm
+
+ .macro bilinear_store_8888 numpix, tmp1, tmp2
+-.if numpix == 4
++.if \numpix == 4
+ vst1.32 {d0, d1}, [OUT, :128]!
+-.elseif numpix == 2
++.elseif \numpix == 2
+ vst1.32 {d0}, [OUT, :64]!
+-.elseif numpix == 1
++.elseif \numpix == 1
+ vst1.32 {d0[0]}, [OUT, :32]!
+ .else
+- .error bilinear_store_8888 numpix is unsupported
++ .error bilinear_store_8888 \numpix is unsupported
+ .endif
+ .endm
+
+ .macro bilinear_store_0565 numpix, tmp1, tmp2
+ vuzp.u8 d0, d1
+ vuzp.u8 d2, d3
+ vuzp.u8 d1, d3
+ vuzp.u8 d0, d2
+- convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+-.if numpix == 4
++ convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
++.if \numpix == 4
+ vst1.16 {d2}, [OUT, :64]!
+-.elseif numpix == 2
++.elseif \numpix == 2
+ vst1.32 {d2[0]}, [OUT, :32]!
+-.elseif numpix == 1
++.elseif \numpix == 1
+ vst1.16 {d2[0]}, [OUT, :16]!
+ .else
+- .error bilinear_store_0565 numpix is unsupported
++ .error bilinear_store_0565 \numpix is unsupported
+ .endif
+ .endm
+
+ .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+- bilinear_load_&src_fmt d0, d1, d2
++ bilinear_load_\()\src_fmt d0, d1, d2
+ vmull.u8 q1, d0, d28
+ vmlal.u8 q1, d1, d29
+ /* 5 cycles bubble */
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ /* 5 cycles bubble */
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ /* 3 cycles bubble */
+ vmovn.u16 d0, q0
+ /* 1 cycle bubble */
+- bilinear_store_&dst_fmt 1, q2, q3
++ bilinear_store_\()\dst_fmt 1, q2, q3
+ .endm
+
+ .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+- bilinear_load_and_vertical_interpolate_two_&src_fmt \
++ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+ vmovn.u16 d0, q0
+- bilinear_store_&dst_fmt 2, q2, q3
++ bilinear_store_\()\dst_fmt 2, q2, q3
+ .endm
+
+ .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+- bilinear_load_and_vertical_interpolate_four_&src_fmt \
++ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23 \
+ q3, q9, d4, d5, d16, d17, d18, d19
+ pld [TMP1, PF_OFFS]
+ sub TMP1, TMP1, STRIDE
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
+@@ -3029,64 +3035,64 @@ generate_composite_function_nearest_scan
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vmovn.u16 d0, q0
+ vmovn.u16 d1, q2
+ vadd.u16 q12, q12, q13
+- bilinear_store_&dst_fmt 4, q2, q3
++ bilinear_store_\()\dst_fmt 4, q2, q3
+ .endm
+
+ .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
+ .else
+- bilinear_interpolate_four_pixels src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+ .endif
+ .endm
+
+ .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
+ .endif
+ .endm
+
+ .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+-.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
++.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
++ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
+ .else
+- bilinear_interpolate_four_pixels src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+ .endif
+ .endm
+
+ .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
+ .else
+- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+ .endif
+ .endm
+
+ .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
+ .else
+- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
+ .endif
+ .endm
+
+ .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+-.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
++.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
++ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
+ .else
+- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+ .endif
+ .endm
+
+ .set BILINEAR_FLAG_UNROLL_4, 0
+ .set BILINEAR_FLAG_UNROLL_8, 1
+ .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+ /*
+@@ -3101,17 +3107,17 @@ generate_composite_function_nearest_scan
+ * prefetch_distance - prefetch in the source image by that many
+ * pixels ahead
+ */
+
+ .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+ src_bpp_shift, dst_bpp_shift, \
+ prefetch_distance, flags
+
+-pixman_asm_function fname
++pixman_asm_function \fname
+ OUT .req r0
+ TOP .req r1
+ BOTTOM .req r2
+ WT .req r3
+ WB .req r4
+ X .req r5
+ UX .req r6
+ WIDTH .req ip
+@@ -3119,21 +3125,21 @@ pixman_asm_function fname
+ TMP2 .req r4
+ PF_OFFS .req r7
+ TMP3 .req r8
+ TMP4 .req r9
+ STRIDE .req r2
+
+ mov ip, sp
+ push {r4, r5, r6, r7, r8, r9}
+- mov PF_OFFS, #prefetch_distance
++ mov PF_OFFS, #\prefetch_distance
+ ldmia ip, {WB, X, UX, WIDTH}
+ mul PF_OFFS, PF_OFFS, UX
+
+-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
++.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpush {d8-d15}
+ .endif
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
+ cmp WIDTH, #0
+ ble 3f
+@@ -3146,83 +3152,83 @@ pixman_asm_function fname
+
+ /* ensure good destination alignment */
+ cmp WIDTH, #1
+ blt 0f
+ tst OUT, #(1 << dst_bpp_shift)
+ beq 0f
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+- bilinear_interpolate_last_pixel src_fmt, dst_fmt
++ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
+ sub WIDTH, WIDTH, #1
+ 0:
+ vadd.u16 q13, q13, q13
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+
+ cmp WIDTH, #2
+ blt 0f
+ tst OUT, #(1 << (dst_bpp_shift + 1))
+ beq 0f
+- bilinear_interpolate_two_pixels src_fmt, dst_fmt
++ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
+ sub WIDTH, WIDTH, #2
+ 0:
+-.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
++.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
+ /*********** 8 pixels per iteration *****************/
+ cmp WIDTH, #4
+ blt 0f
+ tst OUT, #(1 << (dst_bpp_shift + 2))
+ beq 0f
+- bilinear_interpolate_four_pixels src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+ sub WIDTH, WIDTH, #4
+ 0:
+ subs WIDTH, WIDTH, #8
+ blt 1f
+ mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+- bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
++ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #8
+ blt 5f
+ 0:
+- bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
++ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #8
+ bge 0b
+ 5:
+- bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
++ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
+ 1:
+ tst WIDTH, #4
+ beq 2f
+- bilinear_interpolate_four_pixels src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
+ 2:
+ .else
+ /*********** 4 pixels per iteration *****************/
+ subs WIDTH, WIDTH, #4
+ blt 1f
+ mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #4
+ blt 5f
+ 0:
+- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
+ subs WIDTH, WIDTH, #4
+ bge 0b
+ 5:
+- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
+ 1:
+ /****************************************************/
+ .endif
+ /* handle the remaining trailing pixels */
+ tst WIDTH, #2
+ beq 2f
+- bilinear_interpolate_two_pixels src_fmt, dst_fmt
++ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
+ 2:
+ tst WIDTH, #1
+ beq 3f
+- bilinear_interpolate_last_pixel src_fmt, dst_fmt
++ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
+ 3:
+-.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
++.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpop {d8-d15}
+ .endif
+ pop {r4, r5, r6, r7, r8, r9}
+ bx lr
+
+ .unreq OUT
+ .unreq TOP
+ .unreq WT
+@@ -3231,17 +3237,17 @@ 3:
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+ .unreq STRIDE
+-.endfunc
++pixman_end_asm_function
+
+ .endm
+
+ /*****************************************************************************/
+
+ .set have_bilinear_interpolate_four_pixels_8888_8888, 1
+
+ .macro bilinear_interpolate_four_pixels_8888_8888_head
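[Editor's note] The hunks above all apply the same mechanical conversion: bare macro-parameter names and the `&` concatenation operator, which GNU as tolerates but clang's integrated assembler rejects, are rewritten as explicit `\param` references joined with the empty `\()` separator. A minimal, self-contained illustration of the pattern — the macro name `load_one` is made up for this sketch; the operand handling mirrors the pixldst helpers later in the patch:

    @ pre-patch spelling, only accepted by GNU as:
    .macro load_one reg1, mem
        vld1.32 {d&reg1}, [&mem&]!
    .endm

    @ post-patch spelling, accepted by both GNU as and clang:
    .macro load_one reg1, mem
        vld1.32 {d\()\reg1}, [\mem]!
    .endm

Both definitions expand `load_one 4, SRC` to `vld1.32 {d4}, [SRC]!`; only the argument-substitution syntax differs.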
+diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
+--- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
++++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
+@@ -69,303 +69,303 @@
+ .set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
+
+ /*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+ .macro pixldst1 op, elem_size, reg1, mem_operand, abits
+-.if abits > 0
+- op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
++.if \abits > 0
++ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]!
+ .else
+- op&.&elem_size {d&reg1}, [&mem_operand&]!
++ \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]!
+ .endif
+ .endm
+
+ .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
+-.if abits > 0
+- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
++.if \abits > 0
++ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]!
+ .else
+- op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
++ \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]!
+ .endif
+ .endm
+
+ .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
+-.if abits > 0
+- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
++.if \abits > 0
++ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]!
+ .else
+- op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
++ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]!
+ .endif
+ .endm
+
+ .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
+- op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
++ \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]!
+ .endm
+
+ .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+- op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
++ \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]!
+ .endm
+
+ .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+- op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
++ \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]!
+ .endm
+
+ .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
+-.if numbytes == 32
+- pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
+- %(basereg+6), %(basereg+7), mem_operand, abits
+-.elseif numbytes == 16
+- pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
+-.elseif numbytes == 8
+- pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
+-.elseif numbytes == 4
+- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
+- pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
+- .elseif elem_size == 16
+- pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
+- pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
++.if \numbytes == 32
++ pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \
++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
++.elseif \numbytes == 16
++ pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
++.elseif \numbytes == 8
++ pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits
++.elseif \numbytes == 4
++ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
++ pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits
++ .elseif \elem_size == 16
++ pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits
++ pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits
+ .else
+- pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
+- pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
+- pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
+- pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
++ pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits
++ pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits
++ pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits
++ pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits
+ .endif
+-.elseif numbytes == 2
+- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
+- pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
++.elseif \numbytes == 2
++ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
++ pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits
+ .else
+- pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
+- pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
++ pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits
++ pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits
+ .endif
+-.elseif numbytes == 1
+- pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
++.elseif \numbytes == 1
++ pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits
+ .else
+- .error "unsupported size: numbytes"
++ .error "unsupported size: \numbytes"
+ .endif
+ .endm
+
+ .macro pixld numpix, bpp, basereg, mem_operand, abits=0
+-.if bpp > 0
+-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+- pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
+- %(basereg+6), %(basereg+7), mem_operand, abits
+-.elseif (bpp == 24) && (numpix == 8)
+- pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+-.elseif (bpp == 24) && (numpix == 4)
+- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+-.elseif (bpp == 24) && (numpix == 2)
+- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+-.elseif (bpp == 24) && (numpix == 1)
+- pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
++.if \bpp > 0
++.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
++ pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \
++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
++.elseif (\bpp == 24) && (\numpix == 8)
++ pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
++.elseif (\bpp == 24) && (\numpix == 4)
++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
++.elseif (\bpp == 24) && (\numpix == 2)
++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
++.elseif (\bpp == 24) && (\numpix == 1)
++ pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
+ .else
+- pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
++ pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits
+ .endif
+ .endif
+ .endm
+
+ .macro pixst numpix, bpp, basereg, mem_operand, abits=0
+-.if bpp > 0
+-.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+- pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
+- %(basereg+6), %(basereg+7), mem_operand, abits
+-.elseif (bpp == 24) && (numpix == 8)
+- pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+-.elseif (bpp == 24) && (numpix == 4)
+- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+-.elseif (bpp == 24) && (numpix == 2)
+- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+-.elseif (bpp == 24) && (numpix == 1)
+- pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
++.if \bpp > 0
++.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
++ pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \
++ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
++.elseif (\bpp == 24) && (\numpix == 8)
++ pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
++.elseif (\bpp == 24) && (\numpix == 4)
++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
++.elseif (\bpp == 24) && (\numpix == 2)
++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
++.elseif (\bpp == 24) && (\numpix == 1)
++ pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
+ .else
+- pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
++ pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits
+ .endif
+ .endif
+ .endm
+
+ .macro pixld_a numpix, bpp, basereg, mem_operand
+-.if (bpp * numpix) <= 128
+- pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
++.if (\bpp * \numpix) <= 128
++ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
+ .else
+- pixld numpix, bpp, basereg, mem_operand, 128
++ pixld \numpix, \bpp, \basereg, \mem_operand, 128
+ .endif
+ .endm
+
+ .macro pixst_a numpix, bpp, basereg, mem_operand
+-.if (bpp * numpix) <= 128
+- pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
++.if (\bpp * \numpix) <= 128
++ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
+ .else
+- pixst numpix, bpp, basereg, mem_operand, 128
++ pixst \numpix, \bpp, \basereg, \mem_operand, 128
+ .endif
+ .endm
+
+ /*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
+ .macro pixld1_s elem_size, reg1, mem_operand
+-.if elem_size == 16
++.if \elem_size == 16
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+ 5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+- add TMP1, mem_operand, TMP1, asl #1
++ add TMP1, \mem_operand, TMP1, asl #1
+ mov TMP2, VX, asr #16
+ adds VX, VX, UNIT_X
+ 5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+- add TMP2, mem_operand, TMP2, asl #1
+- vld1.16 {d&reg1&[0]}, [TMP1, :16]
++ add TMP2, \mem_operand, TMP2, asl #1
++ vld1.16 {d\()\reg1\()[0]}, [TMP1, :16]
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+ 5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+- add TMP1, mem_operand, TMP1, asl #1
+- vld1.16 {d&reg1&[1]}, [TMP2, :16]
++ add TMP1, \mem_operand, TMP1, asl #1
++ vld1.16 {d\()\reg1\()[1]}, [TMP2, :16]
+ mov TMP2, VX, asr #16
+ adds VX, VX, UNIT_X
+ 5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+- add TMP2, mem_operand, TMP2, asl #1
+- vld1.16 {d&reg1&[2]}, [TMP1, :16]
+- vld1.16 {d&reg1&[3]}, [TMP2, :16]
+-.elseif elem_size == 32
++ add TMP2, \mem_operand, TMP2, asl #1
++ vld1.16 {d\()\reg1\()[2]}, [TMP1, :16]
++ vld1.16 {d\()\reg1\()[3]}, [TMP2, :16]
++.elseif \elem_size == 32
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+ 5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+- add TMP1, mem_operand, TMP1, asl #2
++ add TMP1, \mem_operand, TMP1, asl #2
+ mov TMP2, VX, asr #16
+ adds VX, VX, UNIT_X
+ 5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+- add TMP2, mem_operand, TMP2, asl #2
+- vld1.32 {d&reg1&[0]}, [TMP1, :32]
+- vld1.32 {d&reg1&[1]}, [TMP2, :32]
++ add TMP2, \mem_operand, TMP2, asl #2
++ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
++ vld1.32 {d\()\reg1\()[1]}, [TMP2, :32]
+ .else
+ .error "unsupported"
+ .endif
+ .endm
+
+ .macro pixld2_s elem_size, reg1, reg2, mem_operand
+ .if 0 /* elem_size == 32 */
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X, asl #1
+- add TMP1, mem_operand, TMP1, asl #2
++ add TMP1, \mem_operand, TMP1, asl #2
+ mov TMP2, VX, asr #16
+ sub VX, VX, UNIT_X
+- add TMP2, mem_operand, TMP2, asl #2
+- vld1.32 {d&reg1&[0]}, [TMP1, :32]
++ add TMP2, \mem_operand, TMP2, asl #2
++ vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X, asl #1
+- add TMP1, mem_operand, TMP1, asl #2
+- vld1.32 {d&reg2&[0]}, [TMP2, :32]
++ add TMP1, \mem_operand, TMP1, asl #2
++ vld1.32 {d\()\reg2\()[0]}, [TMP2, :32]
+ mov TMP2, VX, asr #16
+ add VX, VX, UNIT_X
+- add TMP2, mem_operand, TMP2, asl #2
+- vld1.32 {d&reg1&[1]}, [TMP1, :32]
+- vld1.32 {d&reg2&[1]}, [TMP2, :32]
++ add TMP2, \mem_operand, TMP2, asl #2
++ vld1.32 {d\()\reg1\()[1]}, [TMP1, :32]
++ vld1.32 {d\()\reg2\()[1]}, [TMP2, :32]
+ .else
+- pixld1_s elem_size, reg1, mem_operand
+- pixld1_s elem_size, reg2, mem_operand
++ pixld1_s \elem_size, \reg1, \mem_operand
++ pixld1_s \elem_size, \reg2, \mem_operand
+ .endif
+ .endm
+
+ .macro pixld0_s elem_size, reg1, idx, mem_operand
+-.if elem_size == 16
++.if \elem_size == 16
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+ 5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+- add TMP1, mem_operand, TMP1, asl #1
+- vld1.16 {d&reg1&[idx]}, [TMP1, :16]
+-.elseif elem_size == 32
++ add TMP1, \mem_operand, TMP1, asl #1
++ vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16]
++.elseif \elem_size == 32
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+ 5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+- add TMP1, mem_operand, TMP1, asl #2
+- vld1.32 {d&reg1&[idx]}, [TMP1, :32]
++ add TMP1, \mem_operand, TMP1, asl #2
++ vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32]
+ .endif
+ .endm
+
+ .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+-.if numbytes == 32
+- pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
+- pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
+- pixdeinterleave elem_size, %(basereg+4)
+-.elseif numbytes == 16
+- pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
+-.elseif numbytes == 8
+- pixld1_s elem_size, %(basereg+1), mem_operand
+-.elseif numbytes == 4
+- .if elem_size == 32
+- pixld0_s elem_size, %(basereg+0), 1, mem_operand
+- .elseif elem_size == 16
+- pixld0_s elem_size, %(basereg+0), 2, mem_operand
+- pixld0_s elem_size, %(basereg+0), 3, mem_operand
++.if \numbytes == 32
++ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
++ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
++ pixdeinterleave \elem_size, %(\basereg+4)
++.elseif \numbytes == 16
++ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
++.elseif \numbytes == 8
++ pixld1_s \elem_size, %(\basereg+1), \mem_operand
++.elseif \numbytes == 4
++ .if \elem_size == 32
++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
++ .elseif \elem_size == 16
++ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
++ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
+ .else
+- pixld0_s elem_size, %(basereg+0), 4, mem_operand
+- pixld0_s elem_size, %(basereg+0), 5, mem_operand
+- pixld0_s elem_size, %(basereg+0), 6, mem_operand
+- pixld0_s elem_size, %(basereg+0), 7, mem_operand
++ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
++ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
++ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
++ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
+ .endif
+-.elseif numbytes == 2
+- .if elem_size == 16
+- pixld0_s elem_size, %(basereg+0), 1, mem_operand
++.elseif \numbytes == 2
++ .if \elem_size == 16
++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+ .else
+- pixld0_s elem_size, %(basereg+0), 2, mem_operand
+- pixld0_s elem_size, %(basereg+0), 3, mem_operand
++ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
++ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
+ .endif
+-.elseif numbytes == 1
+- pixld0_s elem_size, %(basereg+0), 1, mem_operand
++.elseif \numbytes == 1
++ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
+ .else
+- .error "unsupported size: numbytes"
++ .error "unsupported size: \numbytes"
+ .endif
+ .endm
+
+ .macro pixld_s numpix, bpp, basereg, mem_operand
+-.if bpp > 0
+- pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
++.if \bpp > 0
++ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
+ .endif
+ .endm
+
+ .macro vuzp8 reg1, reg2
+- vuzp.8 d&reg1, d&reg2
++ vuzp.8 d\()\reg1, d\()\reg2
+ .endm
+
+ .macro vzip8 reg1, reg2
+- vzip.8 d&reg1, d&reg2
++ vzip.8 d\()\reg1, d\()\reg2
+ .endm
+
+ /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+ .macro pixdeinterleave bpp, basereg
+-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+- vuzp8 %(basereg+0), %(basereg+1)
+- vuzp8 %(basereg+2), %(basereg+3)
+- vuzp8 %(basereg+1), %(basereg+3)
+- vuzp8 %(basereg+0), %(basereg+2)
++.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
++ vuzp8 %(\basereg+0), %(\basereg+1)
++ vuzp8 %(\basereg+2), %(\basereg+3)
++ vuzp8 %(\basereg+1), %(\basereg+3)
++ vuzp8 %(\basereg+0), %(\basereg+2)
+ .endif
+ .endm
+
+ /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+ .macro pixinterleave bpp, basereg
+-.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+- vzip8 %(basereg+0), %(basereg+2)
+- vzip8 %(basereg+1), %(basereg+3)
+- vzip8 %(basereg+2), %(basereg+3)
+- vzip8 %(basereg+0), %(basereg+1)
++.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
++ vzip8 %(\basereg+0), %(\basereg+2)
++ vzip8 %(\basereg+1), %(\basereg+3)
++ vzip8 %(\basereg+2), %(\basereg+3)
++ vzip8 %(\basereg+0), %(\basereg+1)
+ .endif
+ .endm
+
+ /*
+ * This is a macro for implementing cache preload. The main idea is that
+ * cache preload logic is mostly independent from the rest of pixels
+ * processing code. It starts at the top left pixel and moves forward
+ * across pixels and can jump across scanlines. Prefetch distance is
+@@ -389,51 +389,51 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED
+ * for almost zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some trivial
+ * pixels processing like simple copy. Anyway, having prefetch is a must
+ * when working with the graphics data.
+ */
+ .macro PF a, x:vararg
+ .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
+- a x
++ \a \x
+ .endif
+ .endm
+
+ .macro cache_preload std_increment, boost_increment
+ .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+ .if regs_shortage
+- PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
++ PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+ .endif
+-.if std_increment != 0
+- PF add PF_X, PF_X, #std_increment
++.if \std_increment != 0
++ PF add, PF_X, PF_X, #\std_increment
+ .endif
+- PF tst PF_CTL, #0xF
+- PF addne PF_X, PF_X, #boost_increment
+- PF subne PF_CTL, PF_CTL, #1
+- PF cmp PF_X, ORIG_W
++ PF tst, PF_CTL, #0xF
++ PF addne, PF_X, PF_X, #\boost_increment
++ PF subne, PF_CTL, PF_CTL, #1
++ PF cmp, PF_X, ORIG_W
+ .if src_bpp_shift >= 0
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ .endif
+ .if dst_r_bpp != 0
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ .endif
+ .if mask_bpp_shift >= 0
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ .endif
+- PF subge PF_X, PF_X, ORIG_W
+- PF subges PF_CTL, PF_CTL, #0x10
++ PF subge, PF_X, PF_X, ORIG_W
++ PF subges, PF_CTL, PF_CTL, #0x10
+ .if src_bpp_shift >= 0
+- PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ .endif
+ .if dst_r_bpp != 0
+- PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ .endif
+ .if mask_bpp_shift >= 0
+- PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
++ PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ .endif
+ .endif
+ .endm
+
+ .macro cache_preload_simple
+ .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
+ .if src_bpp > 0
+ pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
+@@ -460,51 +460,53 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED
+ .macro ensure_destination_ptr_alignment process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+ .if dst_w_bpp != 24
+ tst DST_R, #0xF
+ beq 2f
+
+ .irp lowbit, 1, 2, 4, 8, 16
++#ifndef __clang__
+ local skip1
+-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+-.if lowbit < 16 /* we don't need more than 16-byte alignment */
+- tst DST_R, #lowbit
++#endif
++.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
++.if \lowbit < 16 /* we don't need more than 16-byte alignment */
++ tst DST_R, #\lowbit
+ beq 1f
+ .endif
+- pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+- pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
++ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
++ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+ .if dst_r_bpp > 0
+- pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
++ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+ .else
+- add DST_R, DST_R, #lowbit
++ add DST_R, DST_R, #\lowbit
+ .endif
+- PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+- sub W, W, #(lowbit * 8 / dst_w_bpp)
++ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
++ sub W, W, #(\lowbit * 8 / dst_w_bpp)
+ 1:
+ .endif
+ .endr
+ pixdeinterleave src_bpp, src_basereg
+ pixdeinterleave mask_bpp, mask_basereg
+ pixdeinterleave dst_r_bpp, dst_r_basereg
+
+- process_pixblock_head
++ \process_pixblock_head
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+- process_pixblock_tail
++ \process_pixblock_tail
+
+ pixinterleave dst_w_bpp, dst_w_basereg
+ .irp lowbit, 1, 2, 4, 8, 16
+-.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+-.if lowbit < 16 /* we don't need more than 16-byte alignment */
+- tst DST_W, #lowbit
++.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
++.if \lowbit < 16 /* we don't need more than 16-byte alignment */
++ tst DST_W, #\lowbit
+ beq 1f
+ .endif
+- pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
++ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+ 1:
+ .endif
+ .endr
+ .endif
+ 2:
+ .endm
+
+ /*
+@@ -525,51 +527,51 @@ 2:
+ .macro process_trailing_pixels cache_preload_flag, \
+ dst_aligned_flag, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+ tst W, #(pixblock_size - 1)
+ beq 2f
+ .irp chunk_size, 16, 8, 4, 2, 1
+-.if pixblock_size > chunk_size
+- tst W, #chunk_size
++.if pixblock_size > \chunk_size
++ tst W, #\chunk_size
+ beq 1f
+- pixld_src chunk_size, src_bpp, src_basereg, SRC
+- pixld chunk_size, mask_bpp, mask_basereg, MASK
+-.if dst_aligned_flag != 0
+- pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
++ pixld_src \chunk_size, src_bpp, src_basereg, SRC
++ pixld \chunk_size, mask_bpp, mask_basereg, MASK
++.if \dst_aligned_flag != 0
++ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+ .else
+- pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
++ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+ .endif
+-.if cache_preload_flag != 0
+- PF add PF_X, PF_X, #chunk_size
++.if \cache_preload_flag != 0
++ PF add, PF_X, PF_X, #\chunk_size
+ .endif
+ 1:
+ .endif
+ .endr
+ pixdeinterleave src_bpp, src_basereg
+ pixdeinterleave mask_bpp, mask_basereg
+ pixdeinterleave dst_r_bpp, dst_r_basereg
+
+- process_pixblock_head
+-.if cache_preload_flag != 0
++ \process_pixblock_head
++.if \cache_preload_flag != 0
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+ .endif
+- process_pixblock_tail
++ \process_pixblock_tail
+ pixinterleave dst_w_bpp, dst_w_basereg
+ .irp chunk_size, 16, 8, 4, 2, 1
+-.if pixblock_size > chunk_size
+- tst W, #chunk_size
++.if pixblock_size > \chunk_size
++ tst W, #\chunk_size
+ beq 1f
+-.if dst_aligned_flag != 0
+- pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
++.if \dst_aligned_flag != 0
++ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+ .else
+- pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
++ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+ .endif
+ 1:
+ .endif
+ .endr
+ 2:
+ .endm
+
+ /*
+@@ -599,17 +601,17 @@ 2:
+ .if (mask_bpp != 24) && (mask_bpp != 0)
+ sub MASK, MASK, W, lsl #mask_bpp_shift
+ .endif
+ subs H, H, #1
+ mov DST_R, DST_W
+ .if regs_shortage
+ str H, [sp, #4] /* save updated height to stack */
+ .endif
+- bge start_of_loop_label
++ bge \start_of_loop_label
+ .endm
+
+ /*
+ * Registers are allocated in the following way by default:
+ * d0, d1, d2, d3 - reserved for loading source pixel data
+ * d4, d5, d6, d7 - reserved for loading destination pixel data
+ * d24, d25, d26, d27 - reserved for loading mask pixel data
+ * d28, d29, d30, d31 - final destination pixel data for writeback to memory
+@@ -626,48 +628,48 @@ 2:
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head, \
+ dst_w_basereg_ = 28, \
+ dst_r_basereg_ = 4, \
+ src_basereg_ = 0, \
+ mask_basereg_ = 24
+
+- pixman_asm_function fname
++ pixman_asm_function \fname
+
+ push {r4-r12, lr} /* save all registers */
+
+ /*
+ * Select prefetch type for this function. If prefetch distance is
+ * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
+ * has to be used instead of ADVANCED.
+ */
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
+-.if prefetch_distance == 0
++.if \prefetch_distance == 0
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+ .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+- ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
++ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+ .endif
+
+ /*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+- .set src_bpp, src_bpp_
+- .set mask_bpp, mask_bpp_
+- .set dst_w_bpp, dst_w_bpp_
+- .set pixblock_size, pixblock_size_
+- .set dst_w_basereg, dst_w_basereg_
+- .set dst_r_basereg, dst_r_basereg_
+- .set src_basereg, src_basereg_
+- .set mask_basereg, mask_basereg_
++ .set src_bpp, \src_bpp_
++ .set mask_bpp, \mask_bpp_
++ .set dst_w_bpp, \dst_w_bpp_
++ .set pixblock_size, \pixblock_size_
++ .set dst_w_basereg, \dst_w_basereg_
++ .set dst_r_basereg, \dst_r_basereg_
++ .set src_basereg, \src_basereg_
++ .set mask_basereg, \mask_basereg_
+
+ .macro pixld_src x:vararg
+- pixld x
++ pixld \x
+ .endm
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+ /*
+ * Assign symbolic names to registers
+ */
+@@ -750,38 +752,38 @@ 2:
+ .elseif dst_w_bpp == 16
+ .set dst_bpp_shift, 1
+ .elseif dst_w_bpp == 8
+ .set dst_bpp_shift, 0
+ .else
+ .error "requested dst bpp (dst_w_bpp) is not supported"
+ .endif
+
+-.if (((flags) & FLAG_DST_READWRITE) != 0)
++.if (((\flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+ .else
+ .set dst_r_bpp, 0
+ .endif
+-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
++.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+ .set DEINTERLEAVE_32BPP_ENABLED, 1
+ .else
+ .set DEINTERLEAVE_32BPP_ENABLED, 0
+ .endif
+
+-.if prefetch_distance < 0 || prefetch_distance > 15
+- .error "invalid prefetch distance (prefetch_distance)"
++.if \prefetch_distance < 0 || \prefetch_distance > 15
++ .error "invalid prefetch distance (\prefetch_distance)"
+ .endif
+
+ .if src_bpp > 0
+ ldr SRC, [sp, #40]
+ .endif
+ .if mask_bpp > 0
+ ldr MASK, [sp, #48]
+ .endif
+- PF mov PF_X, #0
++ PF mov, PF_X, #0
+ .if src_bpp > 0
+ ldr SRC_STRIDE, [sp, #44]
+ .endif
+ .if mask_bpp > 0
+ ldr MASK_STRIDE, [sp, #52]
+ .endif
+ mov DST_R, DST_W
+
+@@ -796,24 +798,24 @@ 2:
+ .if dst_w_bpp == 24
+ sub DST_STRIDE, DST_STRIDE, W
+ sub DST_STRIDE, DST_STRIDE, W, lsl #1
+ .endif
+
+ /*
+ * Setup advanced prefetcher initial state
+ */
+- PF mov PF_SRC, SRC
+- PF mov PF_DST, DST_R
+- PF mov PF_MASK, MASK
++ PF mov, PF_SRC, SRC
++ PF mov, PF_DST, DST_R
++ PF mov, PF_MASK, MASK
+ /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+- PF mov PF_CTL, H, lsl #4
+- PF add PF_CTL, #(prefetch_distance - 0x10)
++ PF mov, PF_CTL, H, lsl #4
++ PF add, PF_CTL, #(\prefetch_distance - 0x10)
+
+- init
++ \init
+ .if regs_shortage
+ push {r0, r1}
+ .endif
+ subs H, H, #1
+ .if regs_shortage
+ str H, [sp, #4] /* save updated height to stack */
+ .else
+ mov ORIG_W, W
+@@ -821,84 +823,84 @@ 2:
+ blt 9f
+ cmp W, #(pixblock_size * 2)
+ blt 8f
+ /*
+ * This is the start of the pipelined loop, which if optimized for
+ * long scanlines
+ */
+ 0:
+- ensure_destination_ptr_alignment process_pixblock_head, \
+- process_pixblock_tail, \
+- process_pixblock_tail_head
++ ensure_destination_ptr_alignment \process_pixblock_head, \
++ \process_pixblock_tail, \
++ \process_pixblock_tail_head
+
+ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+ pixld_a pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+- PF add PF_X, PF_X, #pixblock_size
+- process_pixblock_head
++ PF add, PF_X, PF_X, #pixblock_size
++ \process_pixblock_head
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+ subs W, W, #(pixblock_size * 2)
+ blt 2f
+ 1:
+- process_pixblock_tail_head
++ \process_pixblock_tail_head
+ cache_preload_simple
+ subs W, W, #pixblock_size
+ bge 1b
+ 2:
+- process_pixblock_tail
++ \process_pixblock_tail
+ pixst_a pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+ /* Process the remaining trailing pixels in the scanline */
+ process_trailing_pixels 1, 1, \
+- process_pixblock_head, \
+- process_pixblock_tail, \
+- process_pixblock_tail_head
++ \process_pixblock_head, \
++ \process_pixblock_tail, \
++ \process_pixblock_tail_head
+ advance_to_next_scanline 0b
+
+ .if regs_shortage
+ pop {r0, r1}
+ .endif
+- cleanup
++ \cleanup
+ pop {r4-r12, pc} /* exit */
+ /*
+ * This is the start of the loop, designed to process images with small width
+ * (less than pixblock_size * 2 pixels). In this case neither pipelining
+ * nor prefetch are used.
+ */
+ 8:
+ /* Process exactly pixblock_size pixels if needed */
+ tst W, #pixblock_size
+ beq 1f
+ pixld pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+- process_pixblock_head
+- process_pixblock_tail
++ \process_pixblock_head
++ \process_pixblock_tail
+ pixst pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+ 1:
+ /* Process the remaining trailing pixels in the scanline */
+ process_trailing_pixels 0, 0, \
+- process_pixblock_head, \
+- process_pixblock_tail, \
+- process_pixblock_tail_head
++ \process_pixblock_head, \
++ \process_pixblock_tail, \
++ \process_pixblock_tail_head
+ advance_to_next_scanline 8b
+ 9:
+ .if regs_shortage
+ pop {r0, r1}
+ .endif
+- cleanup
++ \cleanup
+ pop {r4-r12, pc} /* exit */
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
+ .unreq SRC
+ .unreq MASK
+ .unreq DST_R
+@@ -910,17 +912,17 @@ 9:
+ .unreq DST_STRIDE
+ .unreq MASK_STRIDE
+ .unreq PF_CTL
+ .unreq PF_X
+ .unreq PF_SRC
+ .unreq PF_DST
+ .unreq PF_MASK
+ .unreq DUMMY
+- .endfunc
++ pixman_end_asm_function
+ .endm
+
+ /*
+ * A simplified variant of function generation template for a single
+ * scanline processing (for implementing pixman combine functions)
+ */
+ .macro generate_composite_function_scanline use_nearest_scaling, \
+ fname, \
+@@ -934,49 +936,49 @@ 9:
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head, \
+ dst_w_basereg_ = 28, \
+ dst_r_basereg_ = 4, \
+ src_basereg_ = 0, \
+ mask_basereg_ = 24
+
+- pixman_asm_function fname
++ pixman_asm_function \fname
+
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+ /*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+- .set src_bpp, src_bpp_
+- .set mask_bpp, mask_bpp_
+- .set dst_w_bpp, dst_w_bpp_
+- .set pixblock_size, pixblock_size_
+- .set dst_w_basereg, dst_w_basereg_
+- .set dst_r_basereg, dst_r_basereg_
+- .set src_basereg, src_basereg_
+- .set mask_basereg, mask_basereg_
++ .set src_bpp, \src_bpp_
++ .set mask_bpp, \mask_bpp_
++ .set dst_w_bpp, \dst_w_bpp_
++ .set pixblock_size, \pixblock_size_
++ .set dst_w_basereg, \dst_w_basereg_
++ .set dst_r_basereg, \dst_r_basereg_
++ .set src_basereg, \src_basereg_
++ .set mask_basereg, \mask_basereg_
+
+-.if use_nearest_scaling != 0
++.if \use_nearest_scaling != 0
+ /*
+ * Assign symbolic names to registers for nearest scaling
+ */
+ W .req r0
+ DST_W .req r1
+ SRC .req r2
+ VX .req r3
+ UNIT_X .req ip
+ MASK .req lr
+ TMP1 .req r4
+ TMP2 .req r5
+ DST_R .req r6
+ SRC_WIDTH_FIXED .req r7
+
+ .macro pixld_src x:vararg
+- pixld_s x
++ pixld_s \x
+ .endm
+
+ ldr UNIT_X, [sp]
+ push {r4-r8, lr}
+ ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)]
+ .if mask_bpp != 0
+ ldr MASK, [sp, #(24 + 8)]
+ .endif
+@@ -986,89 +988,89 @@ 9:
+ */
+ W .req r0 /* width (is updated during processing) */
+ DST_W .req r1 /* destination buffer pointer for writes */
+ SRC .req r2 /* source buffer pointer */
+ DST_R .req ip /* destination buffer pointer for reads */
+ MASK .req r3 /* mask pointer */
+
+ .macro pixld_src x:vararg
+- pixld x
++ pixld \x
+ .endm
+ .endif
+
+-.if (((flags) & FLAG_DST_READWRITE) != 0)
++.if (((\flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+ .else
+ .set dst_r_bpp, 0
+ .endif
+-.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
++.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+ .set DEINTERLEAVE_32BPP_ENABLED, 1
+ .else
+ .set DEINTERLEAVE_32BPP_ENABLED, 0
+ .endif
+
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+
+- init
++ \init
+ mov DST_R, DST_W
+
+ cmp W, #pixblock_size
+ blt 8f
+
+- ensure_destination_ptr_alignment process_pixblock_head, \
+- process_pixblock_tail, \
+- process_pixblock_tail_head
++ ensure_destination_ptr_alignment \process_pixblock_head, \
++ \process_pixblock_tail, \
++ \process_pixblock_tail_head
+
+ subs W, W, #pixblock_size
+ blt 7f
+
+ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+ pixld_a pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+- process_pixblock_head
++ \process_pixblock_head
+ subs W, W, #pixblock_size
+ blt 2f
+ 1:
+- process_pixblock_tail_head
++ \process_pixblock_tail_head
+ subs W, W, #pixblock_size
+ bge 1b
+ 2:
+- process_pixblock_tail
++ \process_pixblock_tail
+ pixst_a pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+ 7:
+ /* Process the remaining trailing pixels in the scanline (dst aligned) */
+ process_trailing_pixels 0, 1, \
+- process_pixblock_head, \
+- process_pixblock_tail, \
+- process_pixblock_tail_head
++ \process_pixblock_head, \
++ \process_pixblock_tail, \
++ \process_pixblock_tail_head
+
+- cleanup
+-.if use_nearest_scaling != 0
++ \cleanup
++.if \use_nearest_scaling != 0
+ pop {r4-r8, pc} /* exit */
+ .else
+ bx lr /* exit */
+ .endif
+ 8:
+ /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+ process_trailing_pixels 0, 0, \
+- process_pixblock_head, \
+- process_pixblock_tail, \
+- process_pixblock_tail_head
++ \process_pixblock_head, \
++ \process_pixblock_tail, \
++ \process_pixblock_tail_head
+
+- cleanup
++ \cleanup
+
+-.if use_nearest_scaling != 0
++.if \use_nearest_scaling != 0
+ pop {r4-r8, pc} /* exit */
+
+ .unreq DST_R
+ .unreq SRC
+ .unreq W
+ .unreq VX
+ .unreq UNIT_X
+ .unreq TMP1
+@@ -1085,25 +1087,25 @@ 8:
+ .unreq DST_R
+ .unreq DST_W
+ .unreq W
+ .endif
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
+- .endfunc
++ pixman_end_asm_function
+ .endm
+
+ .macro generate_composite_function_single_scanline x:vararg
+- generate_composite_function_scanline 0, x
++ generate_composite_function_scanline 0, \x
+ .endm
+
+ .macro generate_composite_function_nearest_scanline x:vararg
+- generate_composite_function_scanline 1, x
++ generate_composite_function_scanline 1, \x
+ .endm
+
+ /* Default prologue/epilogue, nothing special needs to be done */
+
+ .macro default_init
+ .endm
+
+ .macro default_cleanup
+@@ -1129,56 +1131,56 @@ 8:
+ * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in)
+ * into a planar a8r8g8b8 format (with a, r, g, b color components
+ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
+ *
+ * Warning: the conversion is destructive and the original
+ * value (in) is lost.
+ */
+ .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
+- vshrn.u16 out_r, in, #8
+- vshrn.u16 out_g, in, #3
+- vsli.u16 in, in, #5
+- vmov.u8 out_a, #255
+- vsri.u8 out_r, out_r, #5
+- vsri.u8 out_g, out_g, #6
+- vshrn.u16 out_b, in, #2
++ vshrn.u16 \out_r, \in, #8
++ vshrn.u16 \out_g, \in, #3
++ vsli.u16 \in, \in, #5
++ vmov.u8 \out_a, #255
++ vsri.u8 \out_r, \out_r, #5
++ vsri.u8 \out_g, \out_g, #6
++ vshrn.u16 \out_b, \in, #2
+ .endm
+
+ .macro convert_0565_to_x888 in, out_r, out_g, out_b
+- vshrn.u16 out_r, in, #8
+- vshrn.u16 out_g, in, #3
+- vsli.u16 in, in, #5
+- vsri.u8 out_r, out_r, #5
+- vsri.u8 out_g, out_g, #6
+- vshrn.u16 out_b, in, #2
++ vshrn.u16 \out_r, \in, #8
++ vshrn.u16 \out_g, \in, #3
++ vsli.u16 \in, \in, #5
++ vsri.u8 \out_r, \out_r, #5
++ vsri.u8 \out_g, \out_g, #6
++ vshrn.u16 \out_b, \in, #2
+ .endm
+
+ /*
+ * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
+ * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6
+ * pixels packed in 128-bit register (out). Requires two temporary 128-bit
+ * registers (tmp1, tmp2)
+ */
+ .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
+- vshll.u8 tmp1, in_g, #8
+- vshll.u8 out, in_r, #8
+- vshll.u8 tmp2, in_b, #8
+- vsri.u16 out, tmp1, #5
+- vsri.u16 out, tmp2, #11
++ vshll.u8 \tmp1, \in_g, #8
++ vshll.u8 \out, \in_r, #8
++ vshll.u8 \tmp2, \in_b, #8
++ vsri.u16 \out, \tmp1, #5
++ vsri.u16 \out, \tmp2, #11
+ .endm
+
+ /*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
+ * value from 'in' is lost
+ */
+ .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+- vshl.u16 out0, in, #5 /* G top 6 bits */
+- vshl.u16 tmp, in, #11 /* B top 5 bits */
+- vsri.u16 in, in, #5 /* R is ready in top bits */
+- vsri.u16 out0, out0, #6 /* G is ready in top bits */
+- vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
+- vshr.u16 out1, in, #8 /* R is in place */
+- vsri.u16 out0, tmp, #8 /* G & B is in place */
+- vzip.u16 out0, out1 /* everything is in place */
++ vshl.u16 \out0, \in, #5 /* G top 6 bits */
++ vshl.u16 \tmp, \in, #11 /* B top 5 bits */
++ vsri.u16 \in, \in, #5 /* R is ready in top bits */
++ vsri.u16 \out0, \out0, #6 /* G is ready in top bits */
++ vsri.u16 \tmp, \tmp, #5 /* B is ready in top bits */
++ vshr.u16 \out1, \in, #8 /* R is in place */
++ vsri.u16 \out0, \tmp, #8 /* G & B is in place */
++ vzip.u16 \out0, \out1 /* everything is in place */
+ .endm
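[Editor's note] The pixman-arm-neon-asm.h hunks extend the same escaping to the shared helper macros, with one extra wrinkle: the `PF` prefetch wrapper takes the instruction mnemonic as its first argument and the operands as a vararg tail, so every call site now separates the two with a comma (clang does not treat whitespace alone as an argument separator here). The shape of the change, condensed from the macro and one of its callers above:

    .macro PF a, x:vararg
    .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
        \a \x
    .endif
    .endm

        PF add, PF_X, PF_X, #pixblock_size   @ was: PF add PF_X, PF_X, #pixblock_size

Numeric register arguments keep their `%(expr)` evaluation, but the parameters inside the expression gain a backslash as well, e.g. `%(\basereg+4)`.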
+diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
+--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
++++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
+@@ -20,16 +20,20 @@
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
++#ifdef __clang__
++#define subpls subspl
++#endif
++
+ /* Prevent the stack from becoming executable */
+ #if defined(__linux__) && defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+ #endif
+
+ .text
+ .arch armv6
+ .object_arch armv4
+@@ -57,100 +61,105 @@
+ * prefetch_braking_distance - stop prefetching when that many pixels are
+ * remaining before the end of scanline
+ */
+
+ .macro generate_nearest_scanline_func fname, bpp_shift, t, \
+ prefetch_distance, \
+ prefetch_braking_distance
+
+-pixman_asm_function fname
++pixman_asm_function \fname
+ W .req r0
+ DST .req r1
+ SRC .req r2
+ VX .req r3
+ UNIT_X .req ip
+ TMP1 .req r4
+ TMP2 .req r5
+ VXMASK .req r6
+ PF_OFFS .req r7
+ SRC_WIDTH_FIXED .req r8
+
+ ldr UNIT_X, [sp]
+ push {r4, r5, r6, r7, r8, r10}
+- mvn VXMASK, #((1 << bpp_shift) - 1)
++ mvn VXMASK, #((1 << \bpp_shift) - 1)
+ ldr SRC_WIDTH_FIXED, [sp, #28]
+
+ /* define helper macro */
+ .macro scale_2_pixels
+- ldr&t TMP1, [SRC, TMP1]
+- and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
++ ldr\()\t TMP1, [SRC, TMP1]
++ and TMP2, VXMASK, VX, asr #(16 - \bpp_shift)
+ adds VX, VX, UNIT_X
+- str&t TMP1, [DST], #(1 << bpp_shift)
++ str\()\t TMP1, [DST], #(1 << \bpp_shift)
+ 9: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 9b
+
+- ldr&t TMP2, [SRC, TMP2]
+- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
++ ldr\()\t TMP2, [SRC, TMP2]
++ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
+ adds VX, VX, UNIT_X
+- str&t TMP2, [DST], #(1 << bpp_shift)
++ str\()\t TMP2, [DST], #(1 << \bpp_shift)
+ 9: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 9b
+ .endm
+
+ /* now do the scaling */
+- and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
++ and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
+ adds VX, VX, UNIT_X
+ 9: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 9b
+- subs W, W, #(8 + prefetch_braking_distance)
++ subs W, W, #(8 + \prefetch_braking_distance)
+ blt 2f
+ /* calculate prefetch offset */
+- mov PF_OFFS, #prefetch_distance
++ mov PF_OFFS, #\prefetch_distance
+ mla PF_OFFS, UNIT_X, PF_OFFS, VX
+ 1: /* main loop, process 8 pixels per iteration with prefetch */
+- pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
++ pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)]
+ add PF_OFFS, UNIT_X, lsl #3
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ subs W, W, #8
+ bge 1b
+ 2:
+- subs W, W, #(4 - 8 - prefetch_braking_distance)
++ subs W, W, #(4 - 8 - \prefetch_braking_distance)
+ blt 2f
+ 1: /* process the remaining pixels */
+ scale_2_pixels
+ scale_2_pixels
+ subs W, W, #4
+ bge 1b
+ 2:
+ tst W, #2
+ beq 2f
+ scale_2_pixels
+ 2:
+ tst W, #1
+- ldrne&t TMP1, [SRC, TMP1]
+- strne&t TMP1, [DST]
++#ifdef __clang__
++ ldr\()\t\()ne TMP1, [SRC, TMP1]
++ str\()\t\()ne TMP1, [DST]
++#else
++ ldrne\()\t TMP1, [SRC, TMP1]
++ strne\()\t TMP1, [DST]
++#endif
+ /* cleanup helper macro */
+ .purgem scale_2_pixels
+ .unreq DST
+ .unreq SRC
+ .unreq W
+ .unreq VX
+ .unreq UNIT_X
+ .unreq TMP1
+ .unreq TMP2
+ .unreq VXMASK
+ .unreq PF_OFFS
+ .unreq SRC_WIDTH_FIXED
+ /* return */
+ pop {r4, r5, r6, r7, r8, r10}
+ bx lr
+-.endfunc
++pixman_end_asm_function
+ .endm
+
+ generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+
+ generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
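[Editor's note] The armv6 files bring in a second class of fix: pre-UAL mnemonics that put the condition code before the `s` flag or the operand-size suffix (`subpls`, `adceqs`, `ldmnedb`, `ldrne&t`) are only understood by GNU as, so the patch adds small `#ifdef __clang__` aliases and, where the suffix is pasted in via a macro argument, an explicit clang branch that builds the UAL ordering. The two spellings side by side — a condensed illustration using the register aliases from the scaled-scanline function above, not a verbatim excerpt:

    @ GNU as (pre-UAL ordering: condition, then suffix)
        subpls  VX, VX, SRC_WIDTH_FIXED
        ldrneh  TMP1, [SRC, TMP1]

    @ clang integrated assembler (UAL ordering: suffix, then condition)
        subspl  VX, VX, SRC_WIDTH_FIXED
        ldrhne  TMP1, [SRC, TMP1]

For the plain mnemonics a `#define subpls subspl` style alias suffices; for `ldr&t`/`str&t` the suffix comes from the `t` macro argument, so the clang path concatenates it as `ldr\()\t\()ne` instead.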
+diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
+--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
++++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
+@@ -20,16 +20,21 @@
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Ben Avison (bavison@riscosopen.org)
+ *
+ */
+
++#ifdef __clang__
++#define adceqs adcseq
++#define ldmnedb ldmdbne
++#endif
++
+ /* Prevent the stack from becoming executable */
+ #if defined(__linux__) && defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+ #endif
+
+ .text
+ .arch armv6
+ .object_arch armv4
+@@ -52,26 +57,26 @@
+ * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
+ */
+
+ .macro blit_init
+ line_saved_regs STRIDE_D, STRIDE_S
+ .endm
+
+ .macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+- pixld cond, numbytes, firstreg, SRC, unaligned_src
++ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
+ .endm
+
+ .macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+ WK4 .req STRIDE_D
+ WK5 .req STRIDE_S
+ WK6 .req MASK
+ WK7 .req STRIDE_M
+-110: pixld , 16, 0, SRC, unaligned_src
+- pixld , 16, 4, SRC, unaligned_src
++110: pixld , 16, 0, SRC, \unaligned_src
++ pixld , 16, 4, SRC, \unaligned_src
+ pld [SRC, SCRATCH]
+ pixst , 16, 0, DST
+ pixst , 16, 4, DST
+ subs X, X, #32*8/src_bpp
+ bhs 110b
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+@@ -137,17 +142,17 @@ generate_composite_function \
+ mov STRIDE_M, SRC
+ .endm
+
+ .macro fill_process_tail cond, numbytes, firstreg
+ WK4 .req SRC
+ WK5 .req STRIDE_S
+ WK6 .req MASK
+ WK7 .req STRIDE_M
+- pixst cond, numbytes, 4, DST
++ pixst \cond, \numbytes, 4, DST
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+ .endm
+
+ generate_composite_function \
+ pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
+@@ -177,30 +182,30 @@ generate_composite_function \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ fill_process_tail
+
+ /******************************************************************************/
+
+ .macro src_x888_8888_pixel, cond, reg
+- orr&cond WK&reg, WK&reg, #0xFF000000
++ orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
+ .endm
+
+ .macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+- pixld cond, numbytes, firstreg, SRC, unaligned_src
++ pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
+ .endm
+
+ .macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
+- src_x888_8888_pixel cond, %(firstreg+0)
+- .if numbytes >= 8
+- src_x888_8888_pixel cond, %(firstreg+1)
+- .if numbytes == 16
+- src_x888_8888_pixel cond, %(firstreg+2)
+- src_x888_8888_pixel cond, %(firstreg+3)
++ src_x888_8888_pixel \cond, %(\firstreg+0)
++ .if \numbytes >= 8
++ src_x888_8888_pixel \cond, %(\firstreg+1)
++ .if \numbytes == 16
++ src_x888_8888_pixel \cond, %(\firstreg+2)
++ src_x888_8888_pixel \cond, %(\firstreg+3)
+ .endif
+ .endif
+ .endm
+
+ generate_composite_function \
+ pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 3, /* prefetch distance */ \
+@@ -217,83 +222,83 @@ generate_composite_function \
+ ldr MASK, =0x07E007E0
+ mov STRIDE_M, #0xFF000000
+ /* Set GE[3:0] to 1010 so SEL instructions do what we want */
+ ldr SCRATCH, =0x80008000
+ uadd8 SCRATCH, SCRATCH, SCRATCH
+ .endm
+
+ .macro src_0565_8888_2pixels, reg1, reg2
+- and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
+- bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+- mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
+- mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
+- bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+- orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
+- orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
+- pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
+- sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
+- mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
+- pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
+- sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+- orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
+- orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
++ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
++ bic WK\()\reg2, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
++ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
++ mov WK\()\reg1, WK\()\reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
++ mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
++ bic WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
++ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
++ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
++ pkhtb WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
++ sel WK\()\reg1, WK\()\reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
++ mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
++ pkhtb WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
++ sel WK\()\reg2, WK\()\reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
++ orr WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
++ orr WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+ .endm
+
+ /* This version doesn't need STRIDE_M, but is one instruction longer.
+ It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
+- and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
+- bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
+- orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
+- mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
+- mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
+- bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+- mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
+- mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+- orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
+- orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+- pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
+- pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+- sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+- sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
+- orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+- orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
++ and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
++ bic WK\()\reg1, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
++ orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
++ mov WK\()\reg2, WK\()\reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
++ mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
++ bic WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
++ mov WK\()\reg2, WK\()\reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
++ mov WK\()\reg1, WK\()\reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
++ orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
++ orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
++ pkhbt WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
++ pkhbt WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
++ sel WK\()\reg2, SCRATCH, WK\()\reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
++ sel WK\()\reg1, SCRATCH, WK\()\reg1 @ --------rrrrrrrrggggggggbbbbbbbb
++ orr WK\()\reg2, WK\()\reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
++ orr WK\()\reg1, WK\()\reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ */
+
+ .macro src_0565_8888_1pixel, reg
+- bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
+- and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
+- mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
+- mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
+- orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
+- orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
+- pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
+- sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
+- orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
++ bic SCRATCH, WK\()\reg, MASK @ 0000000000000000rrrrr000000bbbbb
++ and WK\()\reg, WK\()\reg, MASK @ 000000000000000000000gggggg00000
++ mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
++ mov WK\()\reg, WK\()\reg, lsl #5 @ 0000000000000000gggggg0000000000
++ orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
++ orr WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000
++ pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
++ sel WK\()\reg, WK\()\reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
++ orr WK\()\reg, WK\()\reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
+ .endm
+
+ .macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+- .if numbytes == 16
+- pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
+- .elseif numbytes == 8
+- pixld , 4, firstreg, SRC, unaligned_src
+- .elseif numbytes == 4
+- pixld , 2, firstreg, SRC, unaligned_src
++ .if \numbytes == 16
++ pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
++ .elseif \numbytes == 8
++ pixld , 4, \firstreg, SRC, \unaligned_src
++ .elseif \numbytes == 4
++ pixld , 2, \firstreg, SRC, \unaligned_src
+ .endif
+ .endm
+
+ .macro src_0565_8888_process_tail cond, numbytes, firstreg
+- .if numbytes == 16
+- src_0565_8888_2pixels firstreg, %(firstreg+1)
+- src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
+- .elseif numbytes == 8
+- src_0565_8888_2pixels firstreg, %(firstreg+1)
++ .if \numbytes == 16
++ src_0565_8888_2pixels \firstreg, %(\firstreg+1)
++ src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
++ .elseif \numbytes == 8
++ src_0565_8888_2pixels \firstreg, %(\firstreg+1)
+ .else
+- src_0565_8888_1pixel firstreg
++ src_0565_8888_1pixel \firstreg
+ .endif
+ .endm
+
+ generate_composite_function \
+ pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+ 3, /* prefetch distance */ \
+ src_0565_8888_init, \
+@@ -306,67 +311,67 @@ generate_composite_function \
+
+ .macro src_x888_0565_init
+ /* Hold loop invariant in MASK */
+ ldr MASK, =0x001F001F
+ line_saved_regs STRIDE_S, ORIG_W
+ .endm
+
+ .macro src_x888_0565_1pixel s, d
+- and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
+- and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
+- orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
+- orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
++ and WK\()\d, MASK, WK\()\s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
++ and STRIDE_S, WK\()\s, #0xFC00 @ 0000000000000000gggggg0000000000
++ orr WK\()\d, WK\()\d, WK\()\d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
++ orr WK\()\d, WK\()\d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
+ /* Top 16 bits are discarded during the following STRH */
+ .endm
+
+ .macro src_x888_0565_2pixels slo, shi, d, tmp
+- and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
+- and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
+- and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
+- orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
+- orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
+- and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
+- orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
+- orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
+- pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
++ and SCRATCH, WK\()\shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
++ and WK\()\tmp, MASK, WK\()\shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
++ and WK\()\shi, MASK, WK\()\slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
++ orr WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
++ orr WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
++ and SCRATCH, WK\()\slo, #0xFC00 @ 0000000000000000gggggg0000000000
++ orr WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
++ orr WK\()\shi, WK\()\shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
++ pkhbt WK\()\d, WK\()\shi, WK\()\tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+ .endm
+
+ .macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req STRIDE_S
+ WK5 .req STRIDE_M
+ WK6 .req WK3
+ WK7 .req ORIG_W
+- .if numbytes == 16
++ .if \numbytes == 16
+ pixld , 16, 4, SRC, 0
+ src_x888_0565_2pixels 4, 5, 0, 0
+ pixld , 8, 4, SRC, 0
+ src_x888_0565_2pixels 6, 7, 1, 1
+ pixld , 8, 6, SRC, 0
+ .else
+- pixld , numbytes*2, 4, SRC, 0
++ pixld , \numbytes*2, 4, SRC, 0
+ .endif
+ .endm
+
+ .macro src_x888_0565_process_tail cond, numbytes, firstreg
+- .if numbytes == 16
++ .if \numbytes == 16
+ src_x888_0565_2pixels 4, 5, 2, 2
+ src_x888_0565_2pixels 6, 7, 3, 4
+- .elseif numbytes == 8
++ .elseif \numbytes == 8
+ src_x888_0565_2pixels 4, 5, 1, 1
+ src_x888_0565_2pixels 6, 7, 2, 2
+- .elseif numbytes == 4
++ .elseif \numbytes == 4
+ src_x888_0565_2pixels 4, 5, 1, 1
+ .else
+ src_x888_0565_1pixel 4, 1
+ .endif
+- .if numbytes == 16
+- pixst , numbytes, 0, DST
++ .if \numbytes == 16
++ pixst , \numbytes, 0, DST
+ .else
+- pixst , numbytes, 1, DST
++ pixst , \numbytes, 1, DST
+ .endif
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+ .endm
+
+ generate_composite_function \
+@@ -377,47 +382,47 @@ generate_composite_function \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ src_x888_0565_process_head, \
+ src_x888_0565_process_tail
+
+ /******************************************************************************/
+
+ .macro add_8_8_8pixels cond, dst1, dst2
+- uqadd8&cond WK&dst1, WK&dst1, MASK
+- uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
++ uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK
++ uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M
+ .endm
+
+ .macro add_8_8_4pixels cond, dst
+- uqadd8&cond WK&dst, WK&dst, MASK
++ uqadd8\()\cond WK\()\dst, WK\()\dst, MASK
+ .endm
+
+ .macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req MASK
+ WK5 .req STRIDE_M
+- .if numbytes == 16
+- pixld cond, 8, 4, SRC, unaligned_src
+- pixld cond, 16, firstreg, DST, 0
+- add_8_8_8pixels cond, firstreg, %(firstreg+1)
+- pixld cond, 8, 4, SRC, unaligned_src
++ .if \numbytes == 16
++ pixld \cond, 8, 4, SRC, \unaligned_src
++ pixld \cond, 16, \firstreg, DST, 0
++ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
++ pixld \cond, 8, 4, SRC, \unaligned_src
+ .else
+- pixld cond, numbytes, 4, SRC, unaligned_src
+- pixld cond, numbytes, firstreg, DST, 0
++ pixld \cond, \numbytes, 4, SRC, \unaligned_src
++ pixld \cond, \numbytes, \firstreg, DST, 0
+ .endif
+ .unreq WK4
+ .unreq WK5
+ .endm
+
+ .macro add_8_8_process_tail cond, numbytes, firstreg
+- .if numbytes == 16
+- add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
+- .elseif numbytes == 8
+- add_8_8_8pixels cond, firstreg, %(firstreg+1)
++ .if \numbytes == 16
++ add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
++ .elseif \numbytes == 8
++ add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
+ .else
+- add_8_8_4pixels cond, firstreg
++ add_8_8_4pixels \cond, \firstreg
+ .endif
+ .endm
+
+ generate_composite_function \
+ pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 2, /* prefetch distance */ \
+ nop_macro, /* init */ \
+@@ -436,82 +441,82 @@ generate_composite_function \
+ line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
+ .endm
+
+ .macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req STRIDE_D
+ WK5 .req STRIDE_S
+ WK6 .req STRIDE_M
+ WK7 .req ORIG_W
+- pixld , numbytes, %(4+firstreg), SRC, unaligned_src
+- pixld , numbytes, firstreg, DST, 0
++ pixld , \numbytes, %(4+\firstreg), SRC, \unaligned_src
++ pixld , \numbytes, \firstreg, DST, 0
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+ .endm
+
+ .macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
+ /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
+- teq WK&reg0, #0
+- .if numbytes > 4
+- teqeq WK&reg1, #0
+- .if numbytes > 8
+- teqeq WK&reg2, #0
+- teqeq WK&reg3, #0
++ teq WK\()\reg0, #0
++ .if \numbytes > 4
++ teqeq WK\()\reg1, #0
++ .if \numbytes > 8
++ teqeq WK\()\reg2, #0
++ teqeq WK\()\reg3, #0
+ .endif
+ .endif
+ .endm
+
+ .macro over_8888_8888_prepare next
+- mov WK&next, WK&next, lsr #24
++ mov WK\()\next, WK\()\next, lsr #24
+ .endm
+
+ .macro over_8888_8888_1pixel src, dst, offset, next
+ /* src = destination component multiplier */
+- rsb WK&src, WK&src, #255
++ rsb WK\()\src, WK\()\src, #255
+ /* Split even/odd bytes of dst into SCRATCH/dst */
+- uxtb16 SCRATCH, WK&dst
+- uxtb16 WK&dst, WK&dst, ror #8
++ uxtb16 SCRATCH, WK\()\dst
++ uxtb16 WK\()\dst, WK\()\dst, ror #8
+ /* Multiply through, adding 0.5 to the upper byte of result for rounding */
+- mla SCRATCH, SCRATCH, WK&src, MASK
+- mla WK&dst, WK&dst, WK&src, MASK
++ mla SCRATCH, SCRATCH, WK\()\src, MASK
++ mla WK\()\dst, WK\()\dst, WK\()\src, MASK
+ /* Where we would have had a stall between the result of the first MLA and the shifter input,
+ * reload the complete source pixel */
+- ldr WK&src, [SRC, #offset]
++ ldr WK\()\src, [SRC, #\offset]
+ /* Multiply by 257/256 to approximate 256/255 */
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+ /* In this stall, start processing the next pixel */
+- .if offset < -4
+- mov WK&next, WK&next, lsr #24
++ .if \offset < -4
++ mov WK\()\next, WK\()\next, lsr #24
+ .endif
+- uxtab16 WK&dst, WK&dst, WK&dst, ror #8
++ uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
+ /* Recombine even/odd bytes of multiplied destination */
+ mov SCRATCH, SCRATCH, ror #8
+- sel WK&dst, SCRATCH, WK&dst
++ sel WK\()\dst, SCRATCH, WK\()\dst
+ /* Saturated add of source to multiplied destination */
+- uqadd8 WK&dst, WK&dst, WK&src
++ uqadd8 WK\()\dst, WK\()\dst, WK\()\src
+ .endm
+
+ .macro over_8888_8888_process_tail cond, numbytes, firstreg
+ WK4 .req STRIDE_D
+ WK5 .req STRIDE_S
+ WK6 .req STRIDE_M
+ WK7 .req ORIG_W
+- over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
++ over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
+ beq 10f
+- over_8888_8888_prepare %(4+firstreg)
+- .set PROCESS_REG, firstreg
+- .set PROCESS_OFF, -numbytes
+- .rept numbytes / 4
++ over_8888_8888_prepare %(4+\firstreg)
++ .set PROCESS_REG, \firstreg
++ .set PROCESS_OFF, -\numbytes
++ .rept \numbytes / 4
+ over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .set PROCESS_OFF, PROCESS_OFF+4
+ .endr
+- pixst , numbytes, firstreg, DST
++ pixst , \numbytes, \firstreg, DST
+ 10:
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+ .endm
+
+ generate_composite_function \
+@@ -531,26 +536,26 @@ generate_composite_function \
+ * word Register containing 4 bytes
+ * byte Register containing byte multiplier (bits 8-31 must be 0)
+ * tmp Scratch register
+ * half Register containing the constant 0x00800080
+ * GE[3:0] bits must contain 0101
+ */
+ .macro mul_8888_8 word, byte, tmp, half
+ /* Split even/odd bytes of word apart */
+- uxtb16 tmp, word
+- uxtb16 word, word, ror #8
++ uxtb16 \tmp, \word
++ uxtb16 \word, \word, ror #8
+ /* Multiply bytes together with rounding, then by 257/256 */
+- mla tmp, tmp, byte, half
+- mla word, word, byte, half /* 1 stall follows */
+- uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
+- uxtab16 word, word, word, ror #8
++ mla \tmp, \tmp, \byte, \half
++ mla \word, \word, \byte, \half /* 1 stall follows */
++ uxtab16 \tmp, \tmp, \tmp, ror #8 /* 1 stall follows */
++ uxtab16 \word, \word, \word, ror #8
+ /* Recombine bytes */
+- mov tmp, tmp, ror #8
+- sel word, tmp, word
++ mov \tmp, \tmp, ror #8
++ sel \word, \tmp, \word
+ .endm
+
+ /******************************************************************************/
+
+ .macro over_8888_n_8888_init
+ /* Mask is constant */
+ ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
+ /* Hold loop invariant in STRIDE_M */
+@@ -562,51 +567,51 @@ generate_composite_function \
+ line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
+ .endm
+
+ .macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req Y
+ WK5 .req STRIDE_D
+ WK6 .req STRIDE_S
+ WK7 .req ORIG_W
+- pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
+- pixld , numbytes, firstreg, DST, 0
++ pixld , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
++ pixld , \numbytes, \firstreg, DST, 0
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+ .endm
+
+ .macro over_8888_n_8888_1pixel src, dst
+- mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
+- sub WK7, WK6, WK&src, lsr #24
+- mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
+- uqadd8 WK&dst, WK&dst, WK&src
++ mul_8888_8 WK\()\src, MASK, SCRATCH, STRIDE_M
++ sub WK7, WK6, WK\()\src, lsr #24
++ mul_8888_8 WK\()\dst, WK7, SCRATCH, STRIDE_M
++ uqadd8 WK\()\dst, WK\()\dst, WK\()\src
+ .endm
+
+ .macro over_8888_n_8888_process_tail cond, numbytes, firstreg
+ WK4 .req Y
+ WK5 .req STRIDE_D
+ WK6 .req STRIDE_S
+ WK7 .req ORIG_W
+- over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
++ over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
+ beq 10f
+ mov WK6, #255
+- .set PROCESS_REG, firstreg
+- .rept numbytes / 4
+- .if numbytes == 16 && PROCESS_REG == 2
++ .set PROCESS_REG, \firstreg
++ .rept \numbytes / 4
++ .if \numbytes == 16 && PROCESS_REG == 2
+ /* We're using WK6 and WK7 as temporaries, so half way through
+ * 4 pixels, reload the second two source pixels but this time
+ * into WK4 and WK5 */
+ ldmdb SRC, {WK4, WK5}
+ .endif
+ over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .endr
+- pixst , numbytes, firstreg, DST
++ pixst , \numbytes, \firstreg, DST
+ 10:
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+ .endm
+
+ generate_composite_function \
+@@ -637,47 +642,47 @@ generate_composite_function \
+ ldr STRIDE_D, =0x00800080
+ b 1f
+ .ltorg
+ 1:
+ .endm
+
+ .macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ WK4 .req STRIDE_M
+- pixld , numbytes/4, 4, MASK, unaligned_mask
+- pixld , numbytes, firstreg, DST, 0
++ pixld , \numbytes/4, 4, MASK, \unaligned_mask
++ pixld , \numbytes, \firstreg, DST, 0
+ .unreq WK4
+ .endm
+
+ .macro over_n_8_8888_1pixel src, dst
+- uxtb Y, WK4, ror #src*8
++ uxtb Y, WK4, ror #\src*8
+ /* Trailing part of multiplication of source */
+ mla SCRATCH, STRIDE_S, Y, STRIDE_D
+ mla Y, SRC, Y, STRIDE_D
+ mov ORIG_W, #255
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+ uxtab16 Y, Y, Y, ror #8
+ mov SCRATCH, SCRATCH, ror #8
+ sub ORIG_W, ORIG_W, Y, lsr #24
+ sel Y, SCRATCH, Y
+ /* Then multiply the destination */
+- mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
+- uqadd8 WK&dst, WK&dst, Y
++ mul_8888_8 WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
++ uqadd8 WK\()\dst, WK\()\dst, Y
+ .endm
+
+ .macro over_n_8_8888_process_tail cond, numbytes, firstreg
+ WK4 .req STRIDE_M
+ teq WK4, #0
+ beq 10f
+- .set PROCESS_REG, firstreg
+- .rept numbytes / 4
+- over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
++ .set PROCESS_REG, \firstreg
++ .rept \numbytes / 4
++ over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .endr
+- pixst , numbytes, firstreg, DST
++ pixst , \numbytes, \firstreg, DST
+ 10:
+ .unreq WK4
+ .endm
+
+ generate_composite_function \
+ pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+ 2, /* prefetch distance */ \
+@@ -700,64 +705,64 @@ generate_composite_function \
+ line_saved_regs STRIDE_D, ORIG_W
+ .endm
+
+ .macro over_reverse_n_8888_newline
+ mov STRIDE_D, #0xFF
+ .endm
+
+ .macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+- pixld , numbytes, firstreg, DST, 0
++ pixld , \numbytes, \firstreg, DST, 0
+ .endm
+
+ .macro over_reverse_n_8888_1pixel d, is_only
+- teq WK&d, #0
++ teq WK\()\d, #0
+ beq 8f /* replace with source */
+- bics ORIG_W, STRIDE_D, WK&d, lsr #24
+- .if is_only == 1
++ bics ORIG_W, STRIDE_D, WK\()\d, lsr #24
++ .if \is_only == 1
+ beq 49f /* skip store */
+ .else
+ beq 9f /* write same value back */
+ .endif
+ mla SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
+ mla ORIG_W, STRIDE_M, ORIG_W, MASK /* alpha/green */
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+ uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
+ mov SCRATCH, SCRATCH, ror #8
+ sel ORIG_W, SCRATCH, ORIG_W
+- uqadd8 WK&d, WK&d, ORIG_W
++ uqadd8 WK\()\d, WK\()\d, ORIG_W
+ b 9f
+-8: mov WK&d, SRC
++8: mov WK\()\d, SRC
+ 9:
+ .endm
+
+ .macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
+- .if numbytes == 4
+- over_reverse_n_8888_1pixel reg1, 1
++ .if \numbytes == 4
++ over_reverse_n_8888_1pixel \reg1, 1
+ .else
+- and SCRATCH, WK&reg1, WK&reg2
+- .if numbytes == 16
+- and SCRATCH, SCRATCH, WK&reg3
+- and SCRATCH, SCRATCH, WK&reg4
++ and SCRATCH, WK\()\reg1, WK\()\reg2
++ .if \numbytes == 16
++ and SCRATCH, SCRATCH, WK\()\reg3
++ and SCRATCH, SCRATCH, WK\()\reg4
+ .endif
+ mvns SCRATCH, SCRATCH, asr #24
+ beq 49f /* skip store if all opaque */
+- over_reverse_n_8888_1pixel reg1, 0
+- over_reverse_n_8888_1pixel reg2, 0
+- .if numbytes == 16
+- over_reverse_n_8888_1pixel reg3, 0
+- over_reverse_n_8888_1pixel reg4, 0
++ over_reverse_n_8888_1pixel \reg1, 0
++ over_reverse_n_8888_1pixel \reg2, 0
++ .if \numbytes == 16
++ over_reverse_n_8888_1pixel \reg3, 0
++ over_reverse_n_8888_1pixel \reg4, 0
+ .endif
+ .endif
+- pixst , numbytes, reg1, DST
++ pixst , \numbytes, \reg1, DST
+ 49:
+ .endm
+
+ .macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
+- over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
++ over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
+ .endm
+
+ generate_composite_function \
+ pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+ 3, /* prefetch distance */ \
+ over_reverse_n_8888_init, \
+ over_reverse_n_8888_newline, \
+@@ -789,30 +794,30 @@ generate_composite_function \
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq WK4
+ .endm
+
+ .macro over_white_8888_8888_ca_combine m, d
+ uxtb16 TMP1, TMP0 /* rb_notmask */
+- uxtb16 TMP2, d /* rb_dest; 1 stall follows */
++ uxtb16 TMP2, \d /* rb_dest; 1 stall follows */
+ smlatt TMP3, TMP2, TMP1, HALF /* red */
+ smlabb TMP2, TMP2, TMP1, HALF /* blue */
+ uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
+- uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
+- smlatt d, TMP1, TMP0, HALF /* alpha */
++ uxtb16 TMP1, \d, ror #8 /* ag_dest; 1 stall follows */
++ smlatt \d, TMP1, TMP0, HALF /* alpha */
+ smlabb TMP1, TMP1, TMP0, HALF /* green */
+ pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
+- pkhbt TMP1, TMP1, d, lsl #16 /* ag */
++ pkhbt TMP1, TMP1, \d, lsl #16 /* ag */
+ uxtab16 TMP0, TMP0, TMP0, ror #8
+ uxtab16 TMP1, TMP1, TMP1, ror #8
+ mov TMP0, TMP0, ror #8
+- sel d, TMP0, TMP1
+- uqadd8 d, d, m /* d is a late result */
++ sel \d, TMP0, TMP1
++ uqadd8 \d, \d, \m /* d is a late result */
+ .endm
+
+ .macro over_white_8888_8888_ca_1pixel_head
+ pixld , 4, 1, MASK, 0
+ pixld , 4, 3, DST, 0
+ .endm
+
+ .macro over_white_8888_8888_ca_1pixel_tail
+@@ -848,29 +853,29 @@ 02: mvn TMP0, WK2
+ movcs WK4, WK2
+ b 04f
+ 03: over_white_8888_8888_ca_combine WK2, WK4
+ 04: pixst , 8, 3, DST
+ 05:
+ .endm
+
+ .macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+- .if numbytes == 4
++ .if \numbytes == 4
+ over_white_8888_8888_ca_1pixel_head
+ .else
+- .if numbytes == 16
++ .if \numbytes == 16
+ over_white_8888_8888_ca_2pixels_head
+ over_white_8888_8888_ca_2pixels_tail
+ .endif
+ over_white_8888_8888_ca_2pixels_head
+ .endif
+ .endm
+
+ .macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
+- .if numbytes == 4
++ .if \numbytes == 4
+ over_white_8888_8888_ca_1pixel_tail
+ .else
+ over_white_8888_8888_ca_2pixels_tail
+ .endif
+ .endm
+
+ generate_composite_function \
+ pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
+@@ -999,33 +1004,33 @@ 20: /* No simplifications possible -
+ uqadd8 WK0, WK1, WK2 /* followed by 1 stall */
+ 30: /* The destination buffer is already in the L1 cache, so
+ * there's little point in amalgamating writes */
+ pixst , 4, 0, DST
+ 40:
+ .endm
+
+ .macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+- .rept (numbytes / 4) - 1
++ .rept (\numbytes / 4) - 1
+ over_n_8888_8888_ca_1pixel_head
+ over_n_8888_8888_ca_1pixel_tail
+ .endr
+ over_n_8888_8888_ca_1pixel_head
+ .endm
+
+ .macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg
+ over_n_8888_8888_ca_1pixel_tail
+ .endm
+
+ pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
+ ldr ip, [sp]
+ cmp ip, #-1
+ beq pixman_composite_over_white_8888_8888_ca_asm_armv6
+ /* else drop through... */
+- .endfunc
++ pixman_end_asm_function
+ generate_composite_function \
+ pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
+ 2, /* prefetch distance */ \
+ over_n_8888_8888_ca_init, \
+ nop_macro, /* newline */ \
+ over_n_8888_8888_ca_cleanup, \
+ over_n_8888_8888_ca_process_head, \
+@@ -1040,94 +1045,94 @@ generate_composite_function \
+ uadd8 SCRATCH, MASK, MASK
+ /* Offset the source pointer: we only need the alpha bytes */
+ add SRC, SRC, #3
+ line_saved_regs ORIG_W
+ .endm
+
+ .macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
+ ldrb ORIG_W, [SRC], #4
+- .if numbytes >= 8
+- ldrb WK&reg1, [SRC], #4
+- .if numbytes == 16
+- ldrb WK&reg2, [SRC], #4
+- ldrb WK&reg3, [SRC], #4
++ .if \numbytes >= 8
++ ldrb WK\()\reg1, [SRC], #4
++ .if \numbytes == 16
++ ldrb WK\()\reg2, [SRC], #4
++ ldrb WK\()\reg3, [SRC], #4
+ .endif
+ .endif
+- add DST, DST, #numbytes
++ add DST, DST, #\numbytes
+ .endm
+
+ .macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+- in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
++ in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
+ .endm
+
+ .macro in_reverse_8888_8888_1pixel s, d, offset, is_only
+- .if is_only != 1
+- movs s, ORIG_W
+- .if offset != 0
+- ldrb ORIG_W, [SRC, #offset]
++ .if \is_only != 1
++ movs \s, ORIG_W
++ .if \offset != 0
++ ldrb ORIG_W, [SRC, #\offset]
+ .endif
+ beq 01f
+ teq STRIDE_M, #0xFF
+ beq 02f
+ .endif
+- uxtb16 SCRATCH, d /* rb_dest */
+- uxtb16 d, d, ror #8 /* ag_dest */
+- mla SCRATCH, SCRATCH, s, MASK
+- mla d, d, s, MASK
++ uxtb16 SCRATCH, \d /* rb_dest */
++ uxtb16 \d, \d, ror #8 /* ag_dest */
++ mla SCRATCH, SCRATCH, \s, MASK
++ mla \d, \d, \s, MASK
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+- uxtab16 d, d, d, ror #8
++ uxtab16 \d, \d, \d, ror #8
+ mov SCRATCH, SCRATCH, ror #8
+- sel d, SCRATCH, d
++ sel \d, SCRATCH, \d
+ b 02f
+- .if offset == 0
++ .if \offset == 0
+ 48: /* Last mov d,#0 of the set - used as part of shortcut for
+ * source values all 0 */
+ .endif
+-01: mov d, #0
++01: mov \d, #0
+ 02:
+ .endm
+
+ .macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
+- .if numbytes == 4
++ .if \numbytes == 4
+ teq ORIG_W, ORIG_W, asr #32
+- ldrne WK&reg1, [DST, #-4]
+- .elseif numbytes == 8
+- teq ORIG_W, WK&reg1
++ ldrne WK\()\reg1, [DST, #-4]
++ .elseif \numbytes == 8
++ teq ORIG_W, WK\()\reg1
+ teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
+- ldmnedb DST, {WK&reg1-WK&reg2}
++ ldmnedb DST, {WK\()\reg1-WK\()\reg2}
+ .else
+- teq ORIG_W, WK&reg1
+- teqeq ORIG_W, WK&reg2
+- teqeq ORIG_W, WK&reg3
++ teq ORIG_W, WK\()\reg1
++ teqeq ORIG_W, WK\()\reg2
++ teqeq ORIG_W, WK\()\reg3
+ teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
+- ldmnedb DST, {WK&reg1-WK&reg4}
++ ldmnedb DST, {WK\()\reg1-WK\()\reg4}
+ .endif
+ cmnne DST, #0 /* clear C if NE */
+ bcs 49f /* no writes to dest if source all -1 */
+ beq 48f /* set dest to all 0 if source all 0 */
+- .if numbytes == 4
+- in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
+- str WK&reg1, [DST, #-4]
+- .elseif numbytes == 8
+- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
+- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
+- stmdb DST, {WK&reg1-WK&reg2}
++ .if \numbytes == 4
++ in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
++ str WK\()\reg1, [DST, #-4]
++ .elseif \numbytes == 8
++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
++ stmdb DST, {WK\()\reg1-WK\()\reg2}
+ .else
+- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
+- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
+- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
+- in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
+- stmdb DST, {WK&reg1-WK&reg4}
++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
++ in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
++ stmdb DST, {WK\()\reg1-WK\()\reg4}
+ .endif
+ 49:
+ .endm
+
+ .macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
+- in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
++ in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
+ .endm
+
+ generate_composite_function \
+ pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
+ 2, /* prefetch distance */ \
+ in_reverse_8888_8888_init, \
+ nop_macro, /* newline */ \
+@@ -1144,31 +1149,31 @@ generate_composite_function \
+ /* Hold multiplier for destination in STRIDE_M */
+ mov STRIDE_M, #255
+ sub STRIDE_M, STRIDE_M, SRC, lsr #24
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, MASK, MASK
+ .endm
+
+ .macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+- pixld , numbytes, firstreg, DST, 0
++ pixld , \numbytes, \firstreg, DST, 0
+ .endm
+
+ .macro over_n_8888_1pixel dst
+- mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK
+- uqadd8 WK&dst, WK&dst, SRC
++ mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK
++ uqadd8 WK\()\dst, WK\()\dst, SRC
+ .endm
+
+ .macro over_n_8888_process_tail cond, numbytes, firstreg
+- .set PROCESS_REG, firstreg
+- .rept numbytes / 4
++ .set PROCESS_REG, \firstreg
++ .rept \numbytes / 4
+ over_n_8888_1pixel %(PROCESS_REG)
+ .set PROCESS_REG, PROCESS_REG+1
+ .endr
+- pixst , numbytes, firstreg, DST
++ pixst , \numbytes, \firstreg, DST
+ .endm
+
+ generate_composite_function \
+ pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \
+ 2, /* prefetch distance */ \
+ over_n_8888_init, \
+ nop_macro, /* newline */ \
+diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
+--- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
++++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
+@@ -107,88 +107,120 @@
+ .set PREFETCH_TYPE_NONE, 0
+ .set PREFETCH_TYPE_STANDARD, 1
+
+ /*
+ * Definitions of macros for load/store of pixel data.
+ */
+
+ .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
+- .if numbytes == 16
+- .if unaligned == 1
+- op&r&cond WK&reg0, [base], #4
+- op&r&cond WK&reg1, [base], #4
+- op&r&cond WK&reg2, [base], #4
+- op&r&cond WK&reg3, [base], #4
++ .if \numbytes == 16
++ .if \unaligned == 1
++ \op\()r\()\cond WK\()\reg0, [\base], #4
++ \op\()r\()\cond WK\()\reg1, [\base], #4
++ \op\()r\()\cond WK\()\reg2, [\base], #4
++ \op\()r\()\cond WK\()\reg3, [\base], #4
+ .else
+- op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
++#ifdef __clang__
++ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
++#else
++ \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
++#endif
+ .endif
+- .elseif numbytes == 8
+- .if unaligned == 1
+- op&r&cond WK&reg0, [base], #4
+- op&r&cond WK&reg1, [base], #4
++ .elseif \numbytes == 8
++ .if \unaligned == 1
++ \op\()r\()\cond WK\()\reg0, [\base], #4
++ \op\()r\()\cond WK\()\reg1, [\base], #4
+ .else
+- op&m&cond&ia base!, {WK&reg0,WK&reg1}
++#ifdef __clang__
++ \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1}
++#else
++ \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1}
++#endif
+ .endif
+- .elseif numbytes == 4
+- op&r&cond WK&reg0, [base], #4
+- .elseif numbytes == 2
+- op&r&cond&h WK&reg0, [base], #2
+- .elseif numbytes == 1
+- op&r&cond&b WK&reg0, [base], #1
++ .elseif \numbytes == 4
++ \op\()r\()\cond WK\()\reg0, [\base], #4
++ .elseif \numbytes == 2
++#ifdef __clang__
++ \op\()rh\()\cond WK\()\reg0, [\base], #2
++#else
++ \op\()r\()\cond\()h WK\()\reg0, [\base], #2
++#endif
++ .elseif \numbytes == 1
++#ifdef __clang__
++ \op\()rb\()\cond WK\()\reg0, [\base], #1
++#else
++ \op\()r\()\cond\()b WK\()\reg0, [\base], #1
++#endif
+ .else
+- .error "unsupported size: numbytes"
++ .error "unsupported size: \numbytes"
+ .endif
+ .endm
+
+ .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
+- .if numbytes == 16
+- stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+- .elseif numbytes == 8
+- stm&cond&db base, {WK&reg0,WK&reg1}
+- .elseif numbytes == 4
+- str&cond WK&reg0, [base, #-4]
+- .elseif numbytes == 2
+- str&cond&h WK&reg0, [base, #-2]
+- .elseif numbytes == 1
+- str&cond&b WK&reg0, [base, #-1]
++ .if \numbytes == 16
++#ifdef __clang__
++ stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
++#else
++ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
++#endif
++ .elseif \numbytes == 8
++#ifdef __clang__
++ stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
++#else
++ stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1}
++#endif
++ .elseif \numbytes == 4
++ str\()\cond WK\()\reg0, [\base, #-4]
++ .elseif \numbytes == 2
++#ifdef __clang__
++ strh\()\cond WK\()\reg0, [\base, #-2]
++#else
++ str\()\cond\()h WK\()\reg0, [\base, #-2]
++#endif
++ .elseif \numbytes == 1
++#ifdef __clang__
++ strb\()\cond WK\()\reg0, [\base, #-1]
++#else
++ str\()\cond\()b WK\()\reg0, [\base, #-1]
++#endif
+ .else
+- .error "unsupported size: numbytes"
++ .error "unsupported size: \numbytes"
+ .endif
+ .endm
+
+ .macro pixld cond, numbytes, firstreg, base, unaligned
+- pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
++ pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned
+ .endm
+
+ .macro pixst cond, numbytes, firstreg, base
+ .if (flags) & FLAG_DST_READWRITE
+- pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
++ pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
+ .else
+- pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
++ pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
+ .endif
+ .endm
+
+ .macro PF a, x:vararg
+ .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
+- a x
++ \a \x
+ .endif
+ .endm
+
+
+ .macro preload_leading_step1 bpp, ptr, base
+ /* If the destination is already 16-byte aligned, then we need to preload
+ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
+ * are no gaps when the inner loop starts.
+ */
+- .if bpp > 0
+- PF bic, ptr, base, #31
++ .if \bpp > 0
++ PF bic, \ptr, \base, #31
+ .set OFFSET, 0
+ .rept prefetch_distance+1
+- PF pld, [ptr, #OFFSET]
++ PF pld, [\ptr, #OFFSET]
+ .set OFFSET, OFFSET+32
+ .endr
+ .endif
+ .endm
+
+ .macro preload_leading_step2 bpp, bpp_shift, ptr, base
+ /* However, if the destination is not 16-byte aligned, we may need to
+ * preload more cache lines than that. The question we need to ask is:
+@@ -196,81 +228,81 @@
+ * by which the source pointer will be rounded down for preloading, and if
+ * so, by how many cache lines? Effectively, we want to calculate
+ * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
+ * inner_loop_offset = (src+leading_bytes)&31
+ * extra_needed = leading_bytes - inner_loop_offset
+ * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
+ * possible when there are 4 src bytes for every 1 dst byte).
+ */
+- .if bpp > 0
+- .ifc base,DST
++ .if \bpp > 0
++ .ifc \base,DST
+ /* The test can be simplified further when preloading the destination */
+- PF tst, base, #16
++ PF tst, \base, #16
+ PF beq, 61f
+ .else
+- .if bpp/dst_w_bpp == 4
+- PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
++ .if \bpp/dst_w_bpp == 4
++ PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift
+ PF and, SCRATCH, SCRATCH, #31
+- PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
++ PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift
+ PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
+ PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
+ PF bcs, 61f
+ PF bpl, 60f
+ PF pld, [ptr, #32*(prefetch_distance+2)]
+ .else
+- PF mov, SCRATCH, base, lsl #32-5
+- PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+- PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
++ PF mov, SCRATCH, \base, lsl #32-5
++ PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
++ PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
+ PF bls, 61f
+ .endif
+ .endif
+-60: PF pld, [ptr, #32*(prefetch_distance+1)]
++60: PF pld, [\ptr, #32*(prefetch_distance+1)]
+ 61:
+ .endif
+ .endm
+
+ #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
+ .macro preload_middle bpp, base, scratch_holds_offset
+- .if bpp > 0
++ .if \bpp > 0
+ /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
+- .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
+- .if scratch_holds_offset
+- PF pld, [base, SCRATCH]
++ .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)
++ .if \scratch_holds_offset
++ PF pld, [\base, SCRATCH]
+ .else
+- PF bic, SCRATCH, base, #31
++ PF bic, SCRATCH, \base, #31
+ PF pld, [SCRATCH, #32*prefetch_distance]
+ .endif
+ .endif
+ .endif
+ .endm
+
+ .macro preload_trailing bpp, bpp_shift, base
+- .if bpp > 0
+- .if bpp*pix_per_block > 256
++ .if \bpp > 0
++ .if \bpp*pix_per_block > 256
+ /* Calculations are more complex if more than one fetch per block */
+- PF and, WK1, base, #31
+- PF add, WK1, WK1, WK0, lsl #bpp_shift
+- PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
+- PF bic, SCRATCH, base, #31
++ PF and, WK1, \base, #31
++ PF add, WK1, WK1, WK0, lsl #\bpp_shift
++ PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)
++ PF bic, SCRATCH, \base, #31
+ 80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
+ PF add, SCRATCH, SCRATCH, #32
+ PF subs, WK1, WK1, #32
+ PF bhi, 80b
+ .else
+ /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
+- PF mov, SCRATCH, base, lsl #32-5
+- PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
++ PF mov, SCRATCH, \base, lsl #32-5
++ PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
+ PF adceqs, SCRATCH, SCRATCH, #0
+ /* The instruction above has two effects: ensures Z is only
+ * set if C was clear (so Z indicates that both shifted quantities
+ * were 0), and clears C if Z was set (so C indicates that the sum
+ * of the shifted quantities was greater and not equal to 32) */
+ PF beq, 82f
+- PF bic, SCRATCH, base, #31
++ PF bic, SCRATCH, \base, #31
+ PF bcc, 81f
+ PF pld, [SCRATCH, #32*(prefetch_distance+2)]
+ 81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
+ 82:
+ .endif
+ .endif
+ .endm
+
+@@ -283,97 +315,97 @@ 82:
+ * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
+ * meaning there's no need for a loop.
+ * "bpp" - number of bits per pixel in the channel (source, mask or
+ * destination) that's being preloaded, or 0 if this channel is not used
+ * for reading
+ * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
+ * "base" - base address register of channel to preload (SRC, MASK or DST)
+ */
+- .if bpp > 0
+- .if narrow_case && (bpp <= dst_w_bpp)
++ .if \bpp > 0
++ .if \narrow_case && (\bpp <= dst_w_bpp)
+ /* In these cases, each line for each channel is in either 1 or 2 cache lines */
+- PF bic, WK0, base, #31
++ PF bic, WK0, \base, #31
+ PF pld, [WK0]
+- PF add, WK1, base, X, LSL #bpp_shift
++ PF add, WK1, \base, X, LSL #\bpp_shift
+ PF sub, WK1, WK1, #1
+ PF bic, WK1, WK1, #31
+ PF cmp, WK1, WK0
+ PF beq, 90f
+ PF pld, [WK1]
+ 90:
+ .else
+- PF bic, WK0, base, #31
++ PF bic, WK0, \base, #31
+ PF pld, [WK0]
+- PF add, WK1, base, X, lsl #bpp_shift
++ PF add, WK1, \base, X, lsl #\bpp_shift
+ PF sub, WK1, WK1, #1
+ PF bic, WK1, WK1, #31
+ PF cmp, WK1, WK0
+ PF beq, 92f
+ 91: PF add, WK0, WK0, #32
+ PF cmp, WK0, WK1
+ PF pld, [WK0]
+ PF bne, 91b
+ 92:
+ .endif
+ .endif
+ .endm
+
+
+ .macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+- process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
+- .if decrementx
+- sub&cond X, X, #8*numbytes/dst_w_bpp
++ \process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0
++ .if \decrementx
++ sub\()\cond X, X, #8*\numbytes/dst_w_bpp
+ .endif
+- process_tail cond, numbytes, firstreg
++ \process_tail \cond, \numbytes, \firstreg
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+- pixst cond, numbytes, firstreg, DST
++ pixst \cond, \numbytes, \firstreg, DST
+ .endif
+ .endm
+
+ .macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & FLAG_BRANCH_OVER
+- .ifc cond,mi
++ .ifc \cond,mi
+ bpl 100f
+ .endif
+- .ifc cond,cs
++ .ifc \cond,cs
+ bcc 100f
+ .endif
+- .ifc cond,ne
++ .ifc \cond,ne
+ beq 100f
+ .endif
+- conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
++ conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
+ 100:
+ .else
+- conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
++ conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
+ .endif
+ .endm
+
+ .macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
+ /* Can't interleave reads and writes */
+- test
+- conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
++ \test
++ conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx
+ .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
+- test
++ \test
+ .endif
+- conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
++ conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx
+ .else
+ /* Can interleave reads and writes for better scheduling */
+- test
+- process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
+- process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
+- .if decrementx
+- sub&cond1 X, X, #8*numbytes1/dst_w_bpp
+- sub&cond2 X, X, #8*numbytes2/dst_w_bpp
++ \test
++ \process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0
++ \process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0
++ .if \decrementx
++ sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp
++ sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp
+ .endif
+- process_tail cond1, numbytes1, firstreg1
+- process_tail cond2, numbytes2, firstreg2
+- pixst cond1, numbytes1, firstreg1, DST
+- pixst cond2, numbytes2, firstreg2, DST
++ \process_tail \cond1, \numbytes1, \firstreg1
++ \process_tail \cond2, \numbytes2, \firstreg2
++ pixst \cond1, \numbytes1, \firstreg1, DST
++ pixst \cond2, \numbytes2, \firstreg2, DST
+ .endif
+ .endm
+
+
+ .macro test_bits_1_0_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */
+ .else
+@@ -395,22 +427,22 @@ 100:
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ .set DECREMENT_X, 0
+ sub X, X, WK0, lsr #dst_bpp_shift
+ str X, [sp, #LINE_SAVED_REG_COUNT*4]
+ mov X, WK0
+ .endif
+ /* Use unaligned loads in all cases for simplicity */
+ .if dst_w_bpp == 8
+- conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
++ conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
+ .elseif dst_w_bpp == 16
+ test_bits_1_0_ptr
+- conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
++ conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X
+ .endif
+- conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
++ conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
+ .endif
+ .endm
+
+ .macro test_bits_3_2_pix
+ movs SCRATCH, X, lsl #dst_bpp_shift+32-3
+ .endm
+@@ -419,169 +451,169 @@ 100:
+ .if dst_w_bpp == 8
+ movs SCRATCH, X, lsl #dst_bpp_shift+32-1
+ .else
+ movs SCRATCH, X, lsr #1
+ .endif
+ .endm
+
+ .macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+- conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
++ conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0
+ .if dst_w_bpp == 16
+ test_bits_1_0_pix
+- conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
++ conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0
+ .elseif dst_w_bpp == 8
+- conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
++ conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0
+ .endif
+ .endm
+
+
+ .macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+ 110:
+ .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
+ .rept pix_per_block*dst_w_bpp/128
+- process_head , 16, 0, unaligned_src, unaligned_mask, 1
++ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 1
+ .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ preload_middle src_bpp, SRC, 1
+ .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ preload_middle mask_bpp, MASK, 1
+ .else
+ preload_middle src_bpp, SRC, 0
+ preload_middle mask_bpp, MASK, 0
+ .endif
+ .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
+ /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
+ * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
+ * preloads for, to achieve staggered prefetches for multiple channels, because there are
+ * always two STMs per prefetch, so there is always an opposite STM on which to put the
+ * preload. Note, no need to BIC the base register here */
+- PF pld, [DST, #32*prefetch_distance - dst_alignment]
++ PF pld, [DST, #32*prefetch_distance - \dst_alignment]
+ .endif
+- process_tail , 16, 0
++ \process_tail , 16, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 16, 0, DST
+ .endif
+ .set SUBBLOCK, SUBBLOCK+1
+ .endr
+ subs X, X, #pix_per_block
+ bhs 110b
+ .endm
+
+ .macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
+ /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
+ .if dst_r_bpp > 0
+ tst DST, #16
+ bne 111f
+- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
++ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS
+ b 112f
+ 111:
+ .endif
+- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
++ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS
+ 112:
+ /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
+ .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
+ PF and, WK0, X, #pix_per_block-1
+ .endif
+ preload_trailing src_bpp, src_bpp_shift, SRC
+ preload_trailing mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+ preload_trailing dst_r_bpp, dst_bpp_shift, DST
+ .endif
+ add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
+ /* The remainder of the line is handled identically to the medium case */
+- medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
++ medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask
+ .endm
+
+ .macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+ 120:
+- process_head , 16, 0, unaligned_src, unaligned_mask, 0
+- process_tail , 16, 0
++ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 0
++ \process_tail , 16, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 16, 0, DST
+ .endif
+ subs X, X, #128/dst_w_bpp
+ bhs 120b
+ /* Trailing pixels */
+ tst X, #128/dst_w_bpp - 1
+- beq exit_label
+- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
++ beq \exit_label
++ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
+ .endm
+
+ .macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+ tst X, #16*8/dst_w_bpp
+- conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
++ conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0
+ /* Trailing pixels */
+ /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
+- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
++ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
+ .endm
+
+ .macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
+ /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
+ .if mask_bpp == 8 || mask_bpp == 16
+ tst MASK, #3
+ bne 141f
+ .endif
+ .if src_bpp == 8 || src_bpp == 16
+ tst SRC, #3
+ bne 140f
+ .endif
+- action process_head, process_tail, process_inner_loop, exit_label, 0, 0
++ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0
+ .if src_bpp == 8 || src_bpp == 16
+- b exit_label
++ b \exit_label
+ 140:
+- action process_head, process_tail, process_inner_loop, exit_label, 1, 0
++ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0
+ .endif
+ .if mask_bpp == 8 || mask_bpp == 16
+- b exit_label
++ b \exit_label
+ 141:
+ .if src_bpp == 8 || src_bpp == 16
+ tst SRC, #3
+ bne 142f
+ .endif
+- action process_head, process_tail, process_inner_loop, exit_label, 0, 1
++ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1
+ .if src_bpp == 8 || src_bpp == 16
+- b exit_label
++ b \exit_label
+ 142:
+- action process_head, process_tail, process_inner_loop, exit_label, 1, 1
++ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1
+ .endif
+ .endif
+ .endm
+
+
+ .macro end_of_line restore_x, vars_spilled, loop_label, last_one
+- .if vars_spilled
++ .if \vars_spilled
+ /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */
+ /* This is ldmia sp,{} */
+ .word 0xE89D0000 | LINE_SAVED_REGS
+ .endif
+ subs Y, Y, #1
+- .if vars_spilled
++ .if \vars_spilled
+ .if (LINE_SAVED_REGS) & (1<<1)
+ str Y, [sp]
+ .endif
+ .endif
+ add DST, DST, STRIDE_D
+ .if src_bpp > 0
+ add SRC, SRC, STRIDE_S
+ .endif
+ .if mask_bpp > 0
+ add MASK, MASK, STRIDE_M
+ .endif
+- .if restore_x
++ .if \restore_x
+ mov X, ORIG_W
+ .endif
+- bhs loop_label
+- .ifc "last_one",""
+- .if vars_spilled
++ bhs \loop_label
++ .ifc "\last_one",""
++ .if \vars_spilled
+ b 197f
+ .else
+ b 198f
+ .endif
+ .else
+- .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
++ .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+ b 198f
+ .endif
+ .endif
+ .endm
+
+
+ .macro generate_composite_function fname, \
+ src_bpp_, \
+@@ -591,27 +623,27 @@ 142:
+ prefetch_distance_, \
+ init, \
+ newline, \
+ cleanup, \
+ process_head, \
+ process_tail, \
+ process_inner_loop
+
+- pixman_asm_function fname
++ pixman_asm_function \fname
+
+ /*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+- .set src_bpp, src_bpp_
+- .set mask_bpp, mask_bpp_
+- .set dst_w_bpp, dst_w_bpp_
+- .set flags, flags_
+- .set prefetch_distance, prefetch_distance_
++ .set src_bpp, \src_bpp_
++ .set mask_bpp, \mask_bpp_
++ .set dst_w_bpp, \dst_w_bpp_
++ .set flags, \flags_
++ .set prefetch_distance, \prefetch_distance_
+
+ /*
+ * Select prefetch type for this function.
+ */
+ .if prefetch_distance == 0
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+ .else
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
+@@ -727,17 +759,17 @@ 142:
+ .endif
+
+ #ifdef DEBUG_PARAMS
+ add Y, Y, #1
+ stmia sp, {r0-r7,pc}
+ sub Y, Y, #1
+ #endif
+
+- init
++ \init
+
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ /* Reserve a word in which to store X during leading pixels */
+ sub sp, sp, #4
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
+ .endif
+
+@@ -768,47 +800,47 @@ 142:
+ mov ORIG_W, X
+ .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+ /* This is stmdb sp!,{} */
+ .word 0xE92D0000 | LINE_SAVED_REGS
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .endif
+ 151: /* New line */
+- newline
++ \newline
+ preload_leading_step1 src_bpp, WK1, SRC
+ preload_leading_step1 mask_bpp, WK2, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+ preload_leading_step1 dst_r_bpp, WK3, DST
+ .endif
+
+ ands WK0, DST, #15
+ beq 154f
+ rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */
+
+ preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
+ preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+ preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
+ .endif
+
+- leading_15bytes process_head, process_tail
++ leading_15bytes \process_head, \process_tail
+
+ 154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
+ .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ and SCRATCH, SRC, #31
+ rsb SCRATCH, SCRATCH, #32*prefetch_distance
+ .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+ and SCRATCH, MASK, #31
+ rsb SCRATCH, SCRATCH, #32*prefetch_distance
+ .endif
+- .ifc "process_inner_loop",""
+- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
++ .ifc "\process_inner_loop",""
++ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f
+ .else
+- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
++ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f
+ .endif
+
+ 157: /* Check for another line */
+ end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
+ .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
+@@ -820,80 +852,80 @@ 160: /* Medium case */
+ mov ORIG_W, X
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ /* This is stmdb sp!,{} */
+ .word 0xE92D0000 | LINE_SAVED_REGS
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .endif
+ 161: /* New line */
+- newline
++ \newline
+ preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
+ preload_line 0, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+ preload_line 0, dst_r_bpp, dst_bpp_shift, DST
+ .endif
+
+ sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
+ ands WK0, DST, #15
+ beq 164f
+ rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */
+
+- leading_15bytes process_head, process_tail
++ leading_15bytes \process_head, \process_tail
+
+ 164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
+- switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
++ switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f
+
+ 167: /* Check for another line */
+ end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
+
+ .ltorg
+
+ 170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
+ .if dst_w_bpp < 32
+ mov ORIG_W, X
+ .endif
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ /* This is stmdb sp!,{} */
+ .word 0xE92D0000 | LINE_SAVED_REGS
+ .endif
+ 171: /* New line */
+- newline
++ \newline
+ preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
+ preload_line 1, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
+ preload_line 1, dst_r_bpp, dst_bpp_shift, DST
+ .endif
+
+ .if dst_w_bpp == 8
+ tst DST, #3
+ beq 174f
+ 172: subs X, X, #1
+ blo 177f
+- process_head , 1, 0, 1, 1, 0
+- process_tail , 1, 0
++ \process_head , 1, 0, 1, 1, 0
++ \process_tail , 1, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 1, 0, DST
+ .endif
+ tst DST, #3
+ bne 172b
+ .elseif dst_w_bpp == 16
+ tst DST, #2
+ beq 174f
+ subs X, X, #1
+ blo 177f
+- process_head , 2, 0, 1, 1, 0
+- process_tail , 2, 0
++ \process_head , 2, 0, 1, 1, 0
++ \process_tail , 2, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+ pixst , 2, 0, DST
+ .endif
+ .endif
+
+ 174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
+- switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
++ switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f
+
+ 177: /* Check for another line */
+ end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
+
+@@ -903,17 +935,17 @@ 197:
+ .endif
+ 198:
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
+ add sp, sp, #4
+ .endif
+
+- cleanup
++ \cleanup
+
+ #ifdef DEBUG_PARAMS
+ add sp, sp, #9*4 /* junk the debug copy of arguments */
+ #endif
+ 199:
+ pop {r4-r11, pc} /* exit */
+
+ .ltorg
+@@ -927,23 +959,23 @@ 199:
+ .unreq MASK
+ .unreq STRIDE_M
+ .unreq WK0
+ .unreq WK1
+ .unreq WK2
+ .unreq WK3
+ .unreq SCRATCH
+ .unreq ORIG_W
+- .endfunc
++ pixman_end_asm_function
+ .endm
+
+ .macro line_saved_regs x:vararg
+ .set LINE_SAVED_REGS, 0
+ .set LINE_SAVED_REG_COUNT, 0
+- .irp SAVED_REG,x
++ .irp SAVED_REG,\x
+ .ifc "SAVED_REG","Y"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif
+ .ifc "SAVED_REG","STRIDE_D"
+ .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
+ .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+ .endif