summary refs log tree commit diff stats
path: root/gfx/cairo/pixman-arm64-clang.patch
diff options
context:
space:
mode:
Diffstat (limited to 'gfx/cairo/pixman-arm64-clang.patch')
-rw-r--r-- gfx/cairo/pixman-arm64-clang.patch 3756
1 files changed, 0 insertions, 3756 deletions
diff --git a/gfx/cairo/pixman-arm64-clang.patch b/gfx/cairo/pixman-arm64-clang.patch
deleted file mode 100644
index f059734531..0000000000
--- a/gfx/cairo/pixman-arm64-clang.patch
+++ /dev/null
@@ -1,3756 +0,0 @@
-https://gitlab.freedesktop.org/pixman/pixman/-/merge_requests/71
-
-diff --git a/gfx/cairo/libpixman/src/pixman-arm-asm.h b/gfx/cairo/libpixman/src/pixman-arm-asm.h
---- a/gfx/cairo/libpixman/src/pixman-arm-asm.h
-+++ b/gfx/cairo/libpixman/src/pixman-arm-asm.h
-@@ -21,17 +21,33 @@
- * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- *
- * Author: Jeff Muizelaar (jeff@infidigm.net)
- *
- */
-
- /* Supplementary macro for setting function attributes */
--.macro pixman_asm_function fname
-- .func fname
-- .global fname
-+.macro pixman_asm_function_impl fname
-+#ifdef ASM_HAVE_FUNC_DIRECTIVE
-+ .func \fname
-+#endif
-+ .global \fname
- #ifdef __ELF__
-- .hidden fname
-- .type fname, %function
-+ .hidden \fname
-+ .type \fname, %function
- #endif
--fname:
-+\fname:
- .endm
-+
-+.macro pixman_asm_function fname
-+#ifdef ASM_LEADING_UNDERSCORE
-+ pixman_asm_function_impl _\fname
-+#else
-+ pixman_asm_function_impl \fname
-+#endif
-+.endm
-+
-+.macro pixman_end_asm_function
-+#ifdef ASM_HAVE_FUNC_DIRECTIVE
-+ .endfunc
-+#endif
-+.endm
-diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
---- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
-+++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
-@@ -72,219 +72,219 @@
- * format conversion, and interpolation as separate macros which can be used
- * as the basic building blocks for constructing bilinear scanline functions.
- */
-
- .macro bilinear_load_8888 reg1, reg2, tmp
- asr WTMP1, X, #16
- add X, X, UX
- add TMP1, TOP, TMP1, lsl #2
-- ld1 {&reg1&.2s}, [TMP1], STRIDE
-- ld1 {&reg2&.2s}, [TMP1]
-+ ld1 {\()\reg1\().2s}, [TMP1], STRIDE
-+ ld1 {\()\reg2\().2s}, [TMP1]
- .endm
-
- .macro bilinear_load_0565 reg1, reg2, tmp
- asr WTMP1, X, #16
- add X, X, UX
- add TMP1, TOP, TMP1, lsl #1
-- ld1 {&reg2&.s}[0], [TMP1], STRIDE
-- ld1 {&reg2&.s}[1], [TMP1]
-- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
-+ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE
-+ ld1 {\()\reg2\().s}[1], [TMP1]
-+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
- .endm
-
- .macro bilinear_load_and_vertical_interpolate_two_8888 \
- acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
-
-- bilinear_load_8888 reg1, reg2, tmp1
-- umull &acc1&.8h, &reg1&.8b, v28.8b
-- umlal &acc1&.8h, &reg2&.8b, v29.8b
-- bilinear_load_8888 reg3, reg4, tmp2
-- umull &acc2&.8h, &reg3&.8b, v28.8b
-- umlal &acc2&.8h, &reg4&.8b, v29.8b
-+ bilinear_load_8888 \reg1, \reg2, \tmp1
-+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
-+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
-+ bilinear_load_8888 \reg3, \reg4, \tmp2
-+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
-+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
- .endm
-
- .macro bilinear_load_and_vertical_interpolate_four_8888 \
-- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
-+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-
- bilinear_load_and_vertical_interpolate_two_8888 \
-- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
-+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
- bilinear_load_and_vertical_interpolate_two_8888 \
-- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
- .endm
-
- .macro vzip reg1, reg2
-- zip1 v24.8b, reg1, reg2
-- zip2 reg2, reg1, reg2
-- mov reg1, v24.8b
-+ zip1 v24.8b, \reg1, \reg2
-+ zip2 \reg2, \reg1, \reg2
-+ mov \reg1, v24.8b
- .endm
-
- .macro vuzp reg1, reg2
-- uzp1 v24.8b, reg1, reg2
-- uzp2 reg2, reg1, reg2
-- mov reg1, v24.8b
-+ uzp1 v24.8b, \reg1, \reg2
-+ uzp2 \reg2, \reg1, \reg2
-+ mov \reg1, v24.8b
- .endm
-
- .macro bilinear_load_and_vertical_interpolate_two_0565 \
- acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
- asr WTMP1, X, #16
- add X, X, UX
- add TMP1, TOP, TMP1, lsl #1
- asr WTMP2, X, #16
- add X, X, UX
- add TMP2, TOP, TMP2, lsl #1
-- ld1 {&acc2&.s}[0], [TMP1], STRIDE
-- ld1 {&acc2&.s}[2], [TMP2], STRIDE
-- ld1 {&acc2&.s}[1], [TMP1]
-- ld1 {&acc2&.s}[3], [TMP2]
-- convert_0565_to_x888 acc2, reg3, reg2, reg1
-- vzip &reg1&.8b, &reg3&.8b
-- vzip &reg2&.8b, &reg4&.8b
-- vzip &reg3&.8b, &reg4&.8b
-- vzip &reg1&.8b, &reg2&.8b
-- umull &acc1&.8h, &reg1&.8b, v28.8b
-- umlal &acc1&.8h, &reg2&.8b, v29.8b
-- umull &acc2&.8h, &reg3&.8b, v28.8b
-- umlal &acc2&.8h, &reg4&.8b, v29.8b
-+ ld1 {\()\acc2\().s}[0], [TMP1], STRIDE
-+ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE
-+ ld1 {\()\acc2\().s}[1], [TMP1]
-+ ld1 {\()\acc2\().s}[3], [TMP2]
-+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
-+ vzip \()\reg1\().8b, \()\reg3\().8b
-+ vzip \()\reg2\().8b, \()\reg4\().8b
-+ vzip \()\reg3\().8b, \()\reg4\().8b
-+ vzip \()\reg1\().8b, \()\reg2\().8b
-+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
-+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
-+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
-+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
- .endm
-
- .macro bilinear_load_and_vertical_interpolate_four_0565 \
-- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
-+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-
- asr WTMP1, X, #16
- add X, X, UX
- add TMP1, TOP, TMP1, lsl #1
- asr WTMP2, X, #16
- add X, X, UX
- add TMP2, TOP, TMP2, lsl #1
-- ld1 {&xacc2&.s}[0], [TMP1], STRIDE
-- ld1 {&xacc2&.s}[2], [TMP2], STRIDE
-- ld1 {&xacc2&.s}[1], [TMP1]
-- ld1 {&xacc2&.s}[3], [TMP2]
-- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
-+ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE
-+ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE
-+ ld1 {\()\xacc2\().s}[1], [TMP1]
-+ ld1 {\()\xacc2\().s}[3], [TMP2]
-+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
- asr WTMP1, X, #16
- add X, X, UX
- add TMP1, TOP, TMP1, lsl #1
- asr WTMP2, X, #16
- add X, X, UX
- add TMP2, TOP, TMP2, lsl #1
-- ld1 {&yacc2&.s}[0], [TMP1], STRIDE
-- vzip &xreg1&.8b, &xreg3&.8b
-- ld1 {&yacc2&.s}[2], [TMP2], STRIDE
-- vzip &xreg2&.8b, &xreg4&.8b
-- ld1 {&yacc2&.s}[1], [TMP1]
-- vzip &xreg3&.8b, &xreg4&.8b
-- ld1 {&yacc2&.s}[3], [TMP2]
-- vzip &xreg1&.8b, &xreg2&.8b
-- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
-- umull &xacc1&.8h, &xreg1&.8b, v28.8b
-- vzip &yreg1&.8b, &yreg3&.8b
-- umlal &xacc1&.8h, &xreg2&.8b, v29.8b
-- vzip &yreg2&.8b, &yreg4&.8b
-- umull &xacc2&.8h, &xreg3&.8b, v28.8b
-- vzip &yreg3&.8b, &yreg4&.8b
-- umlal &xacc2&.8h, &xreg4&.8b, v29.8b
-- vzip &yreg1&.8b, &yreg2&.8b
-- umull &yacc1&.8h, &yreg1&.8b, v28.8b
-- umlal &yacc1&.8h, &yreg2&.8b, v29.8b
-- umull &yacc2&.8h, &yreg3&.8b, v28.8b
-- umlal &yacc2&.8h, &yreg4&.8b, v29.8b
-+ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE
-+ vzip \()\xreg1\().8b, \()\xreg3\().8b
-+ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE
-+ vzip \()\xreg2\().8b, \()\xreg4\().8b
-+ ld1 {\()\yacc2\().s}[1], [TMP1]
-+ vzip \()\xreg3\().8b, \()\xreg4\().8b
-+ ld1 {\()\yacc2\().s}[3], [TMP2]
-+ vzip \()\xreg1\().8b, \()\xreg2\().8b
-+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
-+ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
-+ vzip \()\yreg1\().8b, \()\yreg3\().8b
-+ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
-+ vzip \()\yreg2\().8b, \()\yreg4\().8b
-+ umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
-+ vzip \()\yreg3\().8b, \()\yreg4\().8b
-+ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
-+ vzip \()\yreg1\().8b, \()\yreg2\().8b
-+ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
-+ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
-+ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
-+ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
- .endm
-
- .macro bilinear_store_8888 numpix, tmp1, tmp2
--.if numpix == 4
-+.if \numpix == 4
- st1 {v0.2s, v1.2s}, [OUT], #16
--.elseif numpix == 2
-+.elseif \numpix == 2
- st1 {v0.2s}, [OUT], #8
--.elseif numpix == 1
-+.elseif \numpix == 1
- st1 {v0.s}[0], [OUT], #4
- .else
-- .error bilinear_store_8888 numpix is unsupported
-+ .error bilinear_store_8888 \numpix is unsupported
- .endif
- .endm
-
- .macro bilinear_store_0565 numpix, tmp1, tmp2
- vuzp v0.8b, v1.8b
- vuzp v2.8b, v3.8b
- vuzp v1.8b, v3.8b
- vuzp v0.8b, v2.8b
-- convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
--.if numpix == 4
-+ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
-+.if \numpix == 4
- st1 {v1.4h}, [OUT], #8
--.elseif numpix == 2
-+.elseif \numpix == 2
- st1 {v1.s}[0], [OUT], #4
--.elseif numpix == 1
-+.elseif \numpix == 1
- st1 {v1.h}[0], [OUT], #2
- .else
-- .error bilinear_store_0565 numpix is unsupported
-+ .error bilinear_store_0565 \numpix is unsupported
- .endif
- .endm
-
-
- /*
- * Macros for loading mask pixels into register 'mask'.
- * dup must be done in somewhere else.
- */
- .macro bilinear_load_mask_x numpix, mask
- .endm
-
- .macro bilinear_load_mask_8 numpix, mask
--.if numpix == 4
-- ld1 {&mask&.s}[0], [MASK], #4
--.elseif numpix == 2
-- ld1 {&mask&.h}[0], [MASK], #2
--.elseif numpix == 1
-- ld1 {&mask&.b}[0], [MASK], #1
-+.if \numpix == 4
-+ ld1 {\()\mask\().s}[0], [MASK], #4
-+.elseif \numpix == 2
-+ ld1 {\()\mask\().h}[0], [MASK], #2
-+.elseif \numpix == 1
-+ ld1 {\()\mask\().b}[0], [MASK], #1
- .else
-- .error bilinear_load_mask_8 numpix is unsupported
-+ .error bilinear_load_mask_8 \numpix is unsupported
- .endif
-- prfm PREFETCH_MODE, [MASK, #prefetch_offset]
-+ prfum PREFETCH_MODE, [MASK, #(prefetch_offset)]
- .endm
-
- .macro bilinear_load_mask mask_fmt, numpix, mask
-- bilinear_load_mask_&mask_fmt numpix, mask
-+ bilinear_load_mask_\mask_fmt \numpix, \mask
- .endm
-
-
- /*
- * Macros for loading destination pixels into register 'dst0' and 'dst1'.
- * Interleave should be done somewhere else.
- */
- .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
- .endm
-
- .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
- .endm
-
- .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
--.if numpix == 4
-- ld1 {&dst0&.2s, &dst1&.2s}, [OUT]
--.elseif numpix == 2
-- ld1 {&dst0&.2s}, [OUT]
--.elseif numpix == 1
-- ld1 {&dst0&.s}[0], [OUT]
-+.if \numpix == 4
-+ ld1 {\()\dst0\().2s, \()\dst1\().2s}, [OUT]
-+.elseif \numpix == 2
-+ ld1 {\()\dst0\().2s}, [OUT]
-+.elseif \numpix == 1
-+ ld1 {\()\dst0\().s}[0], [OUT]
- .else
-- .error bilinear_load_dst_8888 numpix is unsupported
-+ .error bilinear_load_dst_8888 \numpix is unsupported
- .endif
-- mov &dst01&.d[0], &dst0&.d[0]
-- mov &dst01&.d[1], &dst1&.d[0]
-+ mov \()\dst01\().d[0], \()\dst0\().d[0]
-+ mov \()\dst01\().d[1], \()\dst1\().d[0]
- prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
- .endm
-
- .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
-- bilinear_load_dst_8888 numpix, dst0, dst1, dst01
-+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
-- bilinear_load_dst_8888 numpix, dst0, dst1, dst01
-+ bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
-- bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
-+ bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
- .endm
-
- /*
- * Macros for duplicating partially loaded mask to fill entire register.
- * We will apply mask to interleaved source pixels, that is
- * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
- * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
- * So, we need to duplicate loaded mask into whole register.
-@@ -293,84 +293,85 @@
- * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
- * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
- * We can do some optimizations for this including last pixel cases.
- */
- .macro bilinear_duplicate_mask_x numpix, mask
- .endm
-
- .macro bilinear_duplicate_mask_8 numpix, mask
--.if numpix == 4
-- dup &mask&.2s, &mask&.s[0]
--.elseif numpix == 2
-- dup &mask&.4h, &mask&.h[0]
--.elseif numpix == 1
-- dup &mask&.8b, &mask&.b[0]
-+.if \numpix == 4
-+ dup \()\mask\().2s, \()\mask\().s[0]
-+.elseif \numpix == 2
-+ dup \()\mask\().4h, \()\mask\().h[0]
-+.elseif \numpix == 1
-+ dup \()\mask\().8b, \()\mask\().b[0]
- .else
-- .error bilinear_duplicate_mask_8 is unsupported
-+ .error bilinear_duplicate_mask_8 \numpix is unsupported
- .endif
- .endm
-
- .macro bilinear_duplicate_mask mask_fmt, numpix, mask
-- bilinear_duplicate_mask_&mask_fmt numpix, mask
-+ bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
- .endm
-
- /*
- * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
- * Interleave should be done when maks is enabled or operator is 'over'.
- */
- .macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01
-- vuzp &src0&.8b, &src1&.8b
-- vuzp &dst0&.8b, &dst1&.8b
-- vuzp &src0&.8b, &src1&.8b
-- vuzp &dst0&.8b, &dst1&.8b
-- mov &src01&.d[1], &src1&.d[0]
-- mov &src01&.d[0], &src0&.d[0]
-- mov &dst01&.d[1], &dst1&.d[0]
-- mov &dst01&.d[0], &dst0&.d[0]
-+ vuzp \()\src0\().8b, \()\src1\().8b
-+ vuzp \()\dst0\().8b, \()\dst1\().8b
-+ vuzp \()\src0\().8b, \()\src1\().8b
-+ vuzp \()\dst0\().8b, \()\dst1\().8b
-+ mov \()\src01\().d[1], \()\src1\().d[0]
-+ mov \()\src01\().d[0], \()\src0\().d[0]
-+ mov \()\dst01\().d[1], \()\dst1\().d[0]
-+ mov \()\dst01\().d[0], \()\dst0\().d[0]
- .endm
-
- .macro bilinear_interleave_src_dst_x_src \
- numpix, src0, src1, src01, dst0, dst1, dst01
- .endm
-
- .macro bilinear_interleave_src_dst_x_over \
- numpix, src0, src1, src01, dst0, dst1, dst01
-
-- bilinear_interleave src0, src1, src01, dst0, dst1, dst01
-+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_interleave_src_dst_x_add \
- numpix, src0, src1, src01, dst0, dst1, dst01
-- bilinear_interleave src0, src1, src01, dst0, dst1, dst01
-+
-+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_interleave_src_dst_8_src \
- numpix, src0, src1, src01, dst0, dst1, dst01
-
-- bilinear_interleave src0, src1, src01, dst0, dst1, dst01
-+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_interleave_src_dst_8_over \
- numpix, src0, src1, src01, dst0, dst1, dst01
-
-- bilinear_interleave src0, src1, src01, dst0, dst1, dst01
-+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_interleave_src_dst_8_add \
- numpix, src0, src1, src01, dst0, dst1, dst01
-
-- bilinear_interleave src0, src1, src01, dst0, dst1, dst01
-+ bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_interleave_src_dst \
- mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
-
-- bilinear_interleave_src_dst_&mask_fmt&_&op \
-- numpix, src0, src1, src01, dst0, dst1, dst01
-+ bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
-+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
- .endm
-
-
- /*
- * Macros for applying masks to src pixels. (see combine_mask_u() function)
- * src, dst should be in interleaved form.
- * mask register should be in form (m0, m1, m2, m3).
- */
-@@ -378,191 +379,191 @@
- numpix, src0, src1, src01, mask, \
- tmp01, tmp23, tmp45, tmp67
- .endm
-
- .macro bilinear_apply_mask_to_src_8 \
- numpix, src0, src1, src01, mask, \
- tmp01, tmp23, tmp45, tmp67
-
-- umull &tmp01&.8h, &src0&.8b, &mask&.8b
-- umull &tmp23&.8h, &src1&.8b, &mask&.8b
-+ umull \()\tmp01\().8h, \()\src0\().8b, \()\mask\().8b
-+ umull \()\tmp23\().8h, \()\src1\().8b, \()\mask\().8b
- /* bubbles */
-- urshr &tmp45&.8h, &tmp01&.8h, #8
-- urshr &tmp67&.8h, &tmp23&.8h, #8
-+ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8
-+ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8
- /* bubbles */
-- raddhn &src0&.8b, &tmp45&.8h, &tmp01&.8h
-- raddhn &src1&.8b, &tmp67&.8h, &tmp23&.8h
-- mov &src01&.d[0], &src0&.d[0]
-- mov &src01&.d[1], &src1&.d[0]
-+ raddhn \()\src0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
-+ raddhn \()\src1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
-+ mov \()\src01\().d[0], \()\src0\().d[0]
-+ mov \()\src01\().d[1], \()\src1\().d[0]
- .endm
-
- .macro bilinear_apply_mask_to_src \
- mask_fmt, numpix, src0, src1, src01, mask, \
- tmp01, tmp23, tmp45, tmp67
-
-- bilinear_apply_mask_to_src_&mask_fmt \
-- numpix, src0, src1, src01, mask, \
-- tmp01, tmp23, tmp45, tmp67
-+ bilinear_apply_mask_to_src_\()\mask_fmt \
-+ \numpix, \src0, \src1, \src01, \mask, \
-+ \tmp01, \tmp23, \tmp45, \tmp67
- .endm
-
-
- /*
- * Macros for combining src and destination pixels.
- * Interleave or not is depending on operator 'op'.
- */
- .macro bilinear_combine_src \
- numpix, src0, src1, src01, dst0, dst1, dst01, \
- tmp01, tmp23, tmp45, tmp67, tmp8
- .endm
-
- .macro bilinear_combine_over \
- numpix, src0, src1, src01, dst0, dst1, dst01, \
- tmp01, tmp23, tmp45, tmp67, tmp8
-
-- dup &tmp8&.2s, &src1&.s[1]
-+ dup \()\tmp8\().2s, \()\src1\().s[1]
- /* bubbles */
-- mvn &tmp8&.8b, &tmp8&.8b
-+ mvn \()\tmp8\().8b, \()\tmp8\().8b
- /* bubbles */
-- umull &tmp01&.8h, &dst0&.8b, &tmp8&.8b
-+ umull \()\tmp01\().8h, \()\dst0\().8b, \()\tmp8\().8b
- /* bubbles */
-- umull &tmp23&.8h, &dst1&.8b, &tmp8&.8b
-+ umull \()\tmp23\().8h, \()\dst1\().8b, \()\tmp8\().8b
- /* bubbles */
-- urshr &tmp45&.8h, &tmp01&.8h, #8
-- urshr &tmp67&.8h, &tmp23&.8h, #8
-+ urshr \()\tmp45\().8h, \()\tmp01\().8h, #8
-+ urshr \()\tmp67\().8h, \()\tmp23\().8h, #8
- /* bubbles */
-- raddhn &dst0&.8b, &tmp45&.8h, &tmp01&.8h
-- raddhn &dst1&.8b, &tmp67&.8h, &tmp23&.8h
-- mov &dst01&.d[0], &dst0&.d[0]
-- mov &dst01&.d[1], &dst1&.d[0]
-+ raddhn \()\dst0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
-+ raddhn \()\dst1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
-+ mov \()\dst01\().d[0], \()\dst0\().d[0]
-+ mov \()\dst01\().d[1], \()\dst1\().d[0]
- /* bubbles */
-- uqadd &src0&.8b, &dst0&.8b, &src0&.8b
-- uqadd &src1&.8b, &dst1&.8b, &src1&.8b
-- mov &src01&.d[0], &src0&.d[0]
-- mov &src01&.d[1], &src1&.d[0]
-+ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
-+ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
-+ mov \()\src01\().d[0], \()\src0\().d[0]
-+ mov \()\src01\().d[1], \()\src1\().d[0]
- .endm
-
- .macro bilinear_combine_add \
- numpix, src0, src1, src01, dst0, dst1, dst01, \
- tmp01, tmp23, tmp45, tmp67, tmp8
-
-- uqadd &src0&.8b, &dst0&.8b, &src0&.8b
-- uqadd &src1&.8b, &dst1&.8b, &src1&.8b
-- mov &src01&.d[0], &src0&.d[0]
-- mov &src01&.d[1], &src1&.d[0]
-+ uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
-+ uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
-+ mov \()\src01\().d[0], \()\src0\().d[0]
-+ mov \()\src01\().d[1], \()\src1\().d[0]
- .endm
-
- .macro bilinear_combine \
- op, numpix, src0, src1, src01, dst0, dst1, dst01, \
- tmp01, tmp23, tmp45, tmp67, tmp8
-
-- bilinear_combine_&op \
-- numpix, src0, src1, src01, dst0, dst1, dst01, \
-- tmp01, tmp23, tmp45, tmp67, tmp8
-+ bilinear_combine_\()\op \
-+ \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
-+ \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
- .endm
-
- /*
- * Macros for final deinterleaving of destination pixels if needed.
- */
- .macro bilinear_deinterleave numpix, dst0, dst1, dst01
-- vuzp &dst0&.8b, &dst1&.8b
-+ vuzp \()\dst0\().8b, \()\dst1\().8b
- /* bubbles */
-- vuzp &dst0&.8b, &dst1&.8b
-- mov &dst01&.d[0], &dst0&.d[0]
-- mov &dst01&.d[1], &dst1&.d[0]
-+ vuzp \()\dst0\().8b, \()\dst1\().8b
-+ mov \()\dst01\().d[0], \()\dst0\().d[0]
-+ mov \()\dst01\().d[1], \()\dst1\().d[0]
- .endm
-
- .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
- .endm
-
- .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
-- bilinear_deinterleave numpix, dst0, dst1, dst01
-+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
-- bilinear_deinterleave numpix, dst0, dst1, dst01
-+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
-- bilinear_deinterleave numpix, dst0, dst1, dst01
-+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
-- bilinear_deinterleave numpix, dst0, dst1, dst01
-+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
-- bilinear_deinterleave numpix, dst0, dst1, dst01
-+ bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
- .endm
-
- .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
-- bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
-+ bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
- .endm
-
-
- .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
-- bilinear_load_&src_fmt v0, v1, v2
-- bilinear_load_mask mask_fmt, 1, v4
-- bilinear_load_dst dst_fmt, op, 1, v18, v19, v9
-+ bilinear_load_\()\src_fmt v0, v1, v2
-+ bilinear_load_mask \mask_fmt, 1, v4
-+ bilinear_load_dst \dst_fmt, \op, 1, v18, v19, v9
- umull v2.8h, v0.8b, v28.8b
- umlal v2.8h, v1.8b, v29.8b
- /* 5 cycles bubble */
- ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
- umlsl v0.4s, v2.4h, v15.h[0]
- umlal2 v0.4s, v2.8h, v15.h[0]
- /* 5 cycles bubble */
-- bilinear_duplicate_mask mask_fmt, 1, v4
-+ bilinear_duplicate_mask \mask_fmt, 1, v4
- shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- /* 3 cycles bubble */
- xtn v0.8b, v0.8h
- /* 1 cycle bubble */
- bilinear_interleave_src_dst \
-- mask_fmt, op, 1, v0, v1, v0, v18, v19, v9
-+ \mask_fmt, \op, 1, v0, v1, v0, v18, v19, v9
- bilinear_apply_mask_to_src \
-- mask_fmt, 1, v0, v1, v0, v4, \
-+ \mask_fmt, 1, v0, v1, v0, v4, \
- v3, v8, v10, v11
- bilinear_combine \
-- op, 1, v0, v1, v0, v18, v19, v9, \
-+ \op, 1, v0, v1, v0, v18, v19, v9, \
- v3, v8, v10, v11, v5
-- bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0
-- bilinear_store_&dst_fmt 1, v17, v18
-+ bilinear_deinterleave_dst \mask_fmt, \op, 1, v0, v1, v0
-+ bilinear_store_\()\dst_fmt 1, v17, v18
- .endm
-
- .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
-- bilinear_load_and_vertical_interpolate_two_&src_fmt \
-+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
- v1, v11, v18, v19, v20, v21, v22, v23
-- bilinear_load_mask mask_fmt, 2, v4
-- bilinear_load_dst dst_fmt, op, 2, v18, v19, v9
-+ bilinear_load_mask \mask_fmt, 2, v4
-+ bilinear_load_dst \dst_fmt, \op, 2, v18, v19, v9
- ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
- umlsl v0.4s, v1.4h, v15.h[0]
- umlal2 v0.4s, v1.8h, v15.h[0]
- ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
- umlsl v10.4s, v11.4h, v15.h[4]
- umlal2 v10.4s, v11.8h, v15.h[4]
- shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
-- bilinear_duplicate_mask mask_fmt, 2, v4
-+ bilinear_duplicate_mask \mask_fmt, 2, v4
- ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
- add v12.8h, v12.8h, v13.8h
- xtn v0.8b, v0.8h
- bilinear_interleave_src_dst \
-- mask_fmt, op, 2, v0, v1, v0, v18, v19, v9
-+ \mask_fmt, \op, 2, v0, v1, v0, v18, v19, v9
- bilinear_apply_mask_to_src \
-- mask_fmt, 2, v0, v1, v0, v4, \
-+ \mask_fmt, 2, v0, v1, v0, v4, \
- v3, v8, v10, v11
- bilinear_combine \
-- op, 2, v0, v1, v0, v18, v19, v9, \
-+ \op, 2, v0, v1, v0, v18, v19, v9, \
- v3, v8, v10, v11, v5
-- bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0
-- bilinear_store_&dst_fmt 2, v16, v17
-+ bilinear_deinterleave_dst \mask_fmt, \op, 2, v0, v1, v0
-+ bilinear_store_\()\dst_fmt 2, v16, v17
- .endm
-
- .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
-- bilinear_load_and_vertical_interpolate_four_&src_fmt \
-- v1, v11, v4, v5, v6, v7, v22, v23 \
-+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
-+ v1, v11, v4, v5, v6, v7, v22, v23, \
- v3, v9, v16, v17, v20, v21, v18, v19
- prfm PREFETCH_MODE, [TMP1, PF_OFFS]
- sub TMP1, TMP1, STRIDE
- prfm PREFETCH_MODE, [TMP1, PF_OFFS]
- ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
- umlsl v0.4s, v1.4h, v15.h[0]
- umlal2 v0.4s, v1.8h, v15.h[0]
- ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
-@@ -575,33 +576,33 @@
- ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
- umlsl v8.4s, v9.4h, v15.h[4]
- umlal2 v8.4s, v9.8h, v15.h[4]
- add v12.8h, v12.8h, v13.8h
- shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
-- bilinear_load_mask mask_fmt, 4, v4
-- bilinear_duplicate_mask mask_fmt, 4, v4
-+ bilinear_load_mask \mask_fmt, 4, v4
-+ bilinear_duplicate_mask \mask_fmt, 4, v4
- ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
- xtn v0.8b, v0.8h
- xtn v1.8b, v2.8h
- add v12.8h, v12.8h, v13.8h
-- bilinear_load_dst dst_fmt, op, 4, v2, v3, v21
-+ bilinear_load_dst \dst_fmt, \op, 4, v2, v3, v21
- bilinear_interleave_src_dst \
-- mask_fmt, op, 4, v0, v1, v0, v2, v3, v11
-+ \mask_fmt, \op, 4, v0, v1, v0, v2, v3, v11
- bilinear_apply_mask_to_src \
-- mask_fmt, 4, v0, v1, v0, v4, \
-+ \mask_fmt, 4, v0, v1, v0, v4, \
- v6, v8, v9, v10
- bilinear_combine \
-- op, 4, v0, v1, v0, v2, v3, v1, \
-+ \op, 4, v0, v1, v0, v2, v3, v1, \
- v6, v8, v9, v10, v23
-- bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0
-- bilinear_store_&dst_fmt 4, v6, v7
-+ bilinear_deinterleave_dst \mask_fmt, \op, 4, v0, v1, v0
-+ bilinear_store_\()\dst_fmt 4, v6, v7
- .endm
-
- .set BILINEAR_FLAG_USE_MASK, 1
- .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
-
- /*
- * Main template macro for generating NEON optimized bilinear scanline functions.
- *
-@@ -631,24 +632,24 @@
- bilinear_process_four_pixels, \
- bilinear_process_pixblock_head, \
- bilinear_process_pixblock_tail, \
- bilinear_process_pixblock_tail_head, \
- pixblock_size, \
- prefetch_distance, \
- flags
-
--pixman_asm_function fname
--.if pixblock_size == 8
--.elseif pixblock_size == 4
-+pixman_asm_function \fname
-+.if \pixblock_size == 8
-+.elseif \pixblock_size == 4
- .else
- .error unsupported pixblock size
- .endif
-
--.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
-+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
- OUT .req x0
- TOP .req x1
- BOTTOM .req x2
- WT .req x3
- WWT .req w3
- WB .req x4
- WWB .req w4
- X .req w5
-@@ -694,32 +695,32 @@ pixman_asm_function fname
- PF_OFFS .req x12
- TMP3 .req x13
- WTMP3 .req w13
- TMP4 .req x14
- WTMP4 .req w14
- STRIDE .req x15
- DUMMY .req x30
-
-- .set prefetch_offset, prefetch_distance
-+ .set prefetch_offset, \prefetch_distance
-
- stp x29, x30, [sp, -16]!
- mov x29, sp
- sub x29, x29, 64
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
- stp x10, x11, [x29, -80]
- stp x12, x13, [x29, -96]
- stp x14, x15, [x29, -112]
- str x8, [x29, -120]
- ldr w8, [x29, 16]
- sub sp, sp, 120
- .endif
-
-- mov WTMP1, #prefetch_distance
-+ mov WTMP1, #\prefetch_distance
- umull PF_OFFS, WTMP1, UX
-
- sub STRIDE, BOTTOM, TOP
- .unreq BOTTOM
-
- cmp WIDTH, #0
- ble 300f
-
-@@ -730,73 +731,73 @@ pixman_asm_function fname
- mov v25.d[0], v12.d[1]
- mov v26.d[0], v13.d[0]
- add v25.4h, v25.4h, v26.4h
- mov v12.d[1], v25.d[0]
-
- /* ensure good destination alignment */
- cmp WIDTH, #1
- blt 100f
-- tst OUT, #(1 << dst_bpp_shift)
-+ tst OUT, #(1 << \dst_bpp_shift)
- beq 100f
- ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
- add v12.8h, v12.8h, v13.8h
-- bilinear_process_last_pixel
-+ \bilinear_process_last_pixel
- sub WIDTH, WIDTH, #1
- 100:
- add v13.8h, v13.8h, v13.8h
- ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
- add v12.8h, v12.8h, v13.8h
-
- cmp WIDTH, #2
- blt 100f
-- tst OUT, #(1 << (dst_bpp_shift + 1))
-+ tst OUT, #(1 << (\dst_bpp_shift + 1))
- beq 100f
-- bilinear_process_two_pixels
-+ \bilinear_process_two_pixels
- sub WIDTH, WIDTH, #2
- 100:
--.if pixblock_size == 8
-+.if \pixblock_size == 8
- cmp WIDTH, #4
- blt 100f
-- tst OUT, #(1 << (dst_bpp_shift + 2))
-+ tst OUT, #(1 << (\dst_bpp_shift + 2))
- beq 100f
-- bilinear_process_four_pixels
-+ \bilinear_process_four_pixels
- sub WIDTH, WIDTH, #4
- 100:
- .endif
-- subs WIDTH, WIDTH, #pixblock_size
-+ subs WIDTH, WIDTH, #\pixblock_size
- blt 100f
-- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
-- bilinear_process_pixblock_head
-- subs WIDTH, WIDTH, #pixblock_size
-+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
-+ \bilinear_process_pixblock_head
-+ subs WIDTH, WIDTH, #\pixblock_size
- blt 500f
- 0:
-- bilinear_process_pixblock_tail_head
-- subs WIDTH, WIDTH, #pixblock_size
-+ \bilinear_process_pixblock_tail_head
-+ subs WIDTH, WIDTH, #\pixblock_size
- bge 0b
- 500:
-- bilinear_process_pixblock_tail
-+ \bilinear_process_pixblock_tail
- 100:
--.if pixblock_size == 8
-+.if \pixblock_size == 8
- tst WIDTH, #4
- beq 200f
-- bilinear_process_four_pixels
-+ \bilinear_process_four_pixels
- 200:
- .endif
- /* handle the remaining trailing pixels */
- tst WIDTH, #2
- beq 200f
-- bilinear_process_two_pixels
-+ \bilinear_process_two_pixels
- 200:
- tst WIDTH, #1
- beq 300f
-- bilinear_process_last_pixel
-+ \bilinear_process_last_pixel
- 300:
-
--.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
-+.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
- sub x29, x29, 64
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
- ldp x10, x11, [x29, -80]
- ldp x12, x13, [x29, -96]
- ldp x14, x15, [x29, -112]
- mov sp, x29
- ldp x29, x30, [sp], 16
-@@ -824,21 +825,21 @@ 300:
- .unreq WIDTH
- .unreq TMP1
- .unreq WTMP1
- .unreq TMP2
- .unreq PF_OFFS
- .unreq TMP3
- .unreq TMP4
- .unreq STRIDE
--.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
-+.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
- .unreq MASK
- .endif
-
--.endfunc
-+pixman_end_asm_function
-
- .endm
-
- /* src_8888_8_8888 */
- .macro bilinear_src_8888_8_8888_process_last_pixel
- bilinear_interpolate_last_pixel 8888, 8, 8888, src
- .endm
-
-diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S
---- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S
-+++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S
-@@ -262,64 +262,64 @@
- uqadd v18.8b, v0.8b, v22.8b
- uqadd v19.8b, v1.8b, v23.8b
- shrn v6.8b, v4.8h, #8
- fetch_src_pixblock
- shrn v7.8b, v4.8h, #3
- sli v4.8h, v4.8h, #5
- ushll v14.8h, v17.8b, #7
- sli v14.8h, v14.8h, #1
-- PF add PF_X, PF_X, #8
-+ PF add, PF_X, PF_X, #8
- ushll v8.8h, v19.8b, #7
- sli v8.8h, v8.8h, #1
-- PF tst PF_CTL, #0xF
-+ PF tst, PF_CTL, #0xF
- sri v6.8b, v6.8b, #5
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
- 10:
- mvn v3.8b, v3.8b
-- PF beq 10f
-- PF sub PF_CTL, PF_CTL, #1
-+ PF beq, 10f
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- sri v7.8b, v7.8b, #6
- shrn v30.8b, v4.8h, #2
- umull v10.8h, v3.8b, v6.8b
-- PF lsl DUMMY, PF_X, #src_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
-+ PF lsl, DUMMY, PF_X, #src_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
- umull v11.8h, v3.8b, v7.8b
- umull v12.8h, v3.8b, v30.8b
-- PF lsl DUMMY, PF_X, #dst_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
-+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
- sri v14.8h, v8.8h, #5
-- PF cmp PF_X, ORIG_W
-+ PF cmp, PF_X, ORIG_W
- ushll v9.8h, v18.8b, #7
- sli v9.8h, v9.8h, #1
- urshr v17.8h, v10.8h, #8
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
- 10:
- urshr v19.8h, v11.8h, #8
- urshr v18.8h, v12.8h, #8
-- PF ble 10f
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF ble, 10f
-+ PF subs, PF_CTL, PF_CTL, #0x10
- 10:
- sri v14.8h, v9.8h, #11
- mov v28.d[0], v14.d[0]
- mov v29.d[0], v14.d[1]
-- PF ble 10f
-- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
-- PF ldrsb DUMMY, [PF_SRC, DUMMY]
-- PF add PF_SRC, PF_SRC, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-+ PF add, PF_SRC, PF_SRC, #1
- 10:
- raddhn v20.8b, v10.8h, v17.8h
- raddhn v23.8b, v11.8h, v19.8h
-- PF ble 10f
-- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
-- PF ldrsb DUMMY, [PF_DST, DUMMY]
-- PF add PF_DST, PF_SRC, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
-+ PF add, PF_DST, PF_SRC, #1
- 10:
- raddhn v22.8b, v12.8h, v18.8h
- st1 {v14.8h}, [DST_W], #16
- .endm
-
- #else
-
- /* If we did not care much about the performance, we would just use this... */
-@@ -469,42 +469,42 @@ generate_composite_function \
- sri v14.8h, v8.8h, #5
- sri v14.8h, v9.8h, #11
- mov v28.d[0], v14.d[0]
- mov v29.d[0], v14.d[1]
- .endm
-
- .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
- sri v14.8h, v8.8h, #5
-- PF add PF_X, PF_X, #8
-- PF tst PF_CTL, #0xF
-+ PF add, PF_X, PF_X, #8
-+ PF tst, PF_CTL, #0xF
- fetch_src_pixblock
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-- PF sub PF_CTL, PF_CTL, #1
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- sri v14.8h, v9.8h, #11
- mov v28.d[0], v14.d[0]
- mov v29.d[0], v14.d[1]
-- PF cmp PF_X, ORIG_W
-- PF lsl DUMMY, PF_X, #src_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
-+ PF cmp, PF_X, ORIG_W
-+ PF lsl, DUMMY, PF_X, #src_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
- ushll v8.8h, v1.8b, #7
- sli v8.8h, v8.8h, #1
- st1 {v14.8h}, [DST_W], #16
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
-+ PF subs, PF_CTL, PF_CTL, #0x10
- 10:
- ushll v14.8h, v2.8b, #7
- sli v14.8h, v14.8h, #1
-- PF ble 10f
-- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
-- PF ldrsb DUMMY, [PF_SRC, DUMMY]
-- PF add PF_SRC, PF_SRC, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-+ PF add, PF_SRC, PF_SRC, #1
- 10:
- ushll v9.8h, v0.8b, #7
- sli v9.8h, v9.8h, #1
- .endm
-
- generate_composite_function \
- pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
-@@ -561,41 +561,41 @@ generate_composite_function \
- uqadd v31.8b, v3.8b, v7.8b
- .endm
-
- .macro pixman_composite_add_8_8_process_pixblock_tail
- .endm
-
- .macro pixman_composite_add_8_8_process_pixblock_tail_head
- fetch_src_pixblock
-- PF add PF_X, PF_X, #32
-- PF tst PF_CTL, #0xF
-+ PF add, PF_X, PF_X, #32
-+ PF tst, PF_CTL, #0xF
- ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
-- PF beq 10f
-- PF add PF_X, PF_X, #32
-- PF sub PF_CTL, PF_CTL, #1
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #32
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
-- PF cmp PF_X, ORIG_W
-- PF lsl DUMMY, PF_X, #src_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
-- PF lsl DUMMY, PF_X, #dst_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF cmp, PF_X, ORIG_W
-+ PF lsl, DUMMY, PF_X, #src_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
-+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
-+ PF subs, PF_CTL, PF_CTL, #0x10
- 10:
- uqadd v28.8b, v0.8b, v4.8b
-- PF ble 10f
-- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
-- PF ldrsb DUMMY, [PF_SRC, DUMMY]
-- PF add PF_SRC, PF_SRC, #1
-- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
-- PF ldrsb DUMMY, [PF_DST, DUMMY]
-- PF add PF_DST, PF_DST, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-+ PF add, PF_SRC, PF_SRC, #1
-+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
-+ PF add, PF_DST, PF_DST, #1
- 10:
- uqadd v29.8b, v1.8b, v5.8b
- uqadd v30.8b, v2.8b, v6.8b
- uqadd v31.8b, v3.8b, v7.8b
- .endm
-
- generate_composite_function \
- pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
-@@ -607,41 +607,41 @@ generate_composite_function \
- pixman_composite_add_8_8_process_pixblock_head, \
- pixman_composite_add_8_8_process_pixblock_tail, \
- pixman_composite_add_8_8_process_pixblock_tail_head
-
- /******************************************************************************/
-
- .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
- fetch_src_pixblock
-- PF add PF_X, PF_X, #8
-- PF tst PF_CTL, #0xF
-+ PF add, PF_X, PF_X, #8
-+ PF tst, PF_CTL, #0xF
- ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-- PF sub PF_CTL, PF_CTL, #1
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
-- PF cmp PF_X, ORIG_W
-- PF lsl DUMMY, PF_X, #src_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
-- PF lsl DUMMY, PF_X, #dst_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF cmp, PF_X, ORIG_W
-+ PF lsl, DUMMY, PF_X, #src_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
-+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
-+ PF subs, PF_CTL, PF_CTL, #0x10
- 10:
- uqadd v28.8b, v0.8b, v4.8b
-- PF ble 10f
-- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
-- PF ldrsb DUMMY, [PF_SRC, DUMMY]
-- PF add PF_SRC, PF_SRC, #1
-- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
-- PF ldrsb DUMMY, [PF_DST, DUMMY]
-- PF add PF_DST, PF_DST, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-+ PF add, PF_SRC, PF_SRC, #1
-+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
-+ PF add, PF_DST, PF_DST, #1
- 10:
- uqadd v29.8b, v1.8b, v5.8b
- uqadd v30.8b, v2.8b, v6.8b
- uqadd v31.8b, v3.8b, v7.8b
- .endm
-
- generate_composite_function \
- pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
-@@ -684,55 +684,55 @@ generate_composite_function_single_scanl
- raddhn v29.8b, v15.8h, v9.8h
- raddhn v30.8b, v16.8h, v10.8h
- raddhn v31.8b, v17.8h, v11.8h
- .endm
-
- .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
- ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
- urshr v14.8h, v8.8h, #8
-- PF add PF_X, PF_X, #8
-- PF tst PF_CTL, #0xF
-+ PF add, PF_X, PF_X, #8
-+ PF tst, PF_CTL, #0xF
- urshr v15.8h, v9.8h, #8
- urshr v16.8h, v10.8h, #8
- urshr v17.8h, v11.8h, #8
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-- PF sub PF_CTL, PF_CTL, #1
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- raddhn v28.8b, v14.8h, v8.8h
- raddhn v29.8b, v15.8h, v9.8h
-- PF cmp PF_X, ORIG_W
-+ PF cmp, PF_X, ORIG_W
- raddhn v30.8b, v16.8h, v10.8h
- raddhn v31.8b, v17.8h, v11.8h
- fetch_src_pixblock
-- PF lsl DUMMY, PF_X, #src_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
-+ PF lsl, DUMMY, PF_X, #src_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
- mvn v22.8b, v3.8b
-- PF lsl DUMMY, PF_X, #dst_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
-+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
- st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
- 10:
- umull v8.8h, v22.8b, v4.8b
-- PF ble 10f
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF ble, 10f
-+ PF subs, PF_CTL, PF_CTL, #0x10
- 10:
- umull v9.8h, v22.8b, v5.8b
-- PF ble 10f
-- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
-- PF ldrsb DUMMY, [PF_SRC, DUMMY]
-- PF add PF_SRC, PF_SRC, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-+ PF add, PF_SRC, PF_SRC, #1
- 10:
- umull v10.8h, v22.8b, v6.8b
-- PF ble 10f
-- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
-- PF ldrsb DUMMY, [PF_DST, DUMMY]
-- PF add PF_DST, PF_DST, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
-+ PF add, PF_DST, PF_DST, #1
- 10:
- umull v11.8h, v22.8b, v7.8b
- .endm
-
- generate_composite_function_single_scanline \
- pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
-@@ -754,59 +754,59 @@ generate_composite_function_single_scanl
- uqadd v29.8b, v1.8b, v29.8b
- uqadd v30.8b, v2.8b, v30.8b
- uqadd v31.8b, v3.8b, v31.8b
- .endm
-
- .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
- ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
- urshr v14.8h, v8.8h, #8
-- PF add PF_X, PF_X, #8
-- PF tst PF_CTL, #0xF
-+ PF add, PF_X, PF_X, #8
-+ PF tst, PF_CTL, #0xF
- urshr v15.8h, v9.8h, #8
- urshr v16.8h, v10.8h, #8
- urshr v17.8h, v11.8h, #8
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-- PF sub PF_CTL, PF_CTL, #1
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- raddhn v28.8b, v14.8h, v8.8h
- raddhn v29.8b, v15.8h, v9.8h
-- PF cmp PF_X, ORIG_W
-+ PF cmp, PF_X, ORIG_W
- raddhn v30.8b, v16.8h, v10.8h
- raddhn v31.8b, v17.8h, v11.8h
- uqadd v28.8b, v0.8b, v28.8b
- uqadd v29.8b, v1.8b, v29.8b
- uqadd v30.8b, v2.8b, v30.8b
- uqadd v31.8b, v3.8b, v31.8b
- fetch_src_pixblock
-- PF lsl DUMMY, PF_X, #src_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
-+ PF lsl, DUMMY, PF_X, #src_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
- mvn v22.8b, v3.8b
-- PF lsl DUMMY, PF_X, #dst_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
-+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
- st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
- 10:
- umull v8.8h, v22.8b, v4.8b
-- PF ble 10f
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF ble, 10f
-+ PF subs, PF_CTL, PF_CTL, #0x10
- 10:
- umull v9.8h, v22.8b, v5.8b
-- PF ble 10f
-- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
-- PF ldrsb DUMMY, [PF_SRC, DUMMY]
-- PF add PF_SRC, PF_SRC, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-+ PF add, PF_SRC, PF_SRC, #1
- 10:
- umull v10.8h, v22.8b, v6.8b
-- PF ble 10f
-- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
-- PF ldrsb DUMMY, [PF_DST, DUMMY]
-- PF add PF_DST, PF_DST, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
-+ PF add, PF_DST, PF_DST, #1
- 10:
- umull v11.8h, v22.8b, v7.8b
- .endm
-
- generate_composite_function \
- pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
-@@ -860,40 +860,40 @@ generate_composite_function_single_scanl
- urshr v16.8h, v10.8h, #8
- urshr v17.8h, v11.8h, #8
- raddhn v28.8b, v14.8h, v8.8h
- raddhn v29.8b, v15.8h, v9.8h
- raddhn v30.8b, v16.8h, v10.8h
- raddhn v31.8b, v17.8h, v11.8h
- ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
- uqadd v28.8b, v0.8b, v28.8b
-- PF add PF_X, PF_X, #8
-- PF tst PF_CTL, #0x0F
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-- PF sub PF_CTL, PF_CTL, #1
-+ PF add, PF_X, PF_X, #8
-+ PF tst, PF_CTL, #0x0F
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- uqadd v29.8b, v1.8b, v29.8b
- uqadd v30.8b, v2.8b, v30.8b
- uqadd v31.8b, v3.8b, v31.8b
-- PF cmp PF_X, ORIG_W
-+ PF cmp, PF_X, ORIG_W
- umull v8.8h, v24.8b, v4.8b
-- PF lsl DUMMY, PF_X, #dst_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
-+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
- umull v9.8h, v24.8b, v5.8b
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
- 10:
- umull v10.8h, v24.8b, v6.8b
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF subs, PF_CTL, PF_CTL, #0x10
- umull v11.8h, v24.8b, v7.8b
-- PF ble 10f
-- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
-- PF ldrsb DUMMY, [PF_DST, DUMMY]
-- PF add PF_DST, PF_DST, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
-+ PF add, PF_DST, PF_DST, #1
- 10:
- st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
- .endm
-
- .macro pixman_composite_over_n_8888_init
- mov v3.s[0], w4
- dup v0.8b, v3.b[0]
- dup v1.8b, v3.b[1]
-@@ -912,52 +912,52 @@ generate_composite_function \
- pixman_composite_over_8888_8888_process_pixblock_head, \
- pixman_composite_over_8888_8888_process_pixblock_tail, \
- pixman_composite_over_n_8888_process_pixblock_tail_head
-
- /******************************************************************************/
-
- .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
- urshr v14.8h, v8.8h, #8
-- PF add PF_X, PF_X, #8
-- PF tst PF_CTL, #0xF
-+ PF add, PF_X, PF_X, #8
-+ PF tst, PF_CTL, #0xF
- urshr v15.8h, v9.8h, #8
- urshr v12.8h, v10.8h, #8
- urshr v13.8h, v11.8h, #8
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-- PF sub PF_CTL, PF_CTL, #1
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- raddhn v28.8b, v14.8h, v8.8h
- raddhn v29.8b, v15.8h, v9.8h
-- PF cmp PF_X, ORIG_W
-+ PF cmp, PF_X, ORIG_W
- raddhn v30.8b, v12.8h, v10.8h
- raddhn v31.8b, v13.8h, v11.8h
- uqadd v28.8b, v0.8b, v28.8b
- uqadd v29.8b, v1.8b, v29.8b
- uqadd v30.8b, v2.8b, v30.8b
- uqadd v31.8b, v3.8b, v31.8b
- ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32
- mvn v22.8b, v3.8b
-- PF lsl DUMMY, PF_X, #dst_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
-+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
- st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
-- PF blt 10f
-- PF sub PF_X, PF_X, ORIG_W
-+ PF blt, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
- 10:
- umull v8.8h, v22.8b, v4.8b
-- PF blt 10f
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF blt, 10f
-+ PF subs, PF_CTL, PF_CTL, #0x10
- 10:
- umull v9.8h, v22.8b, v5.8b
- umull v10.8h, v22.8b, v6.8b
-- PF blt 10f
-- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
-- PF ldrsb DUMMY, [PF_DST, DUMMY]
-- PF add PF_DST, PF_DST, #1
-+ PF blt, 10f
-+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
-+ PF add, PF_DST, PF_DST, #1
- 10:
- umull v11.8h, v22.8b, v7.8b
- .endm
-
- .macro pixman_composite_over_reverse_n_8888_init
- mov v7.s[0], w4
- dup v4.8b, v7.b[0]
- dup v5.8b, v7.b[1]
-@@ -1405,45 +1405,45 @@ generate_composite_function \
- rshrn v28.8b, v8.8h, #8
- rshrn v29.8b, v9.8h, #8
- rshrn v30.8b, v10.8h, #8
- rshrn v31.8b, v11.8h, #8
- .endm
-
- .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
- fetch_mask_pixblock
-- PF add PF_X, PF_X, #8
-+ PF add, PF_X, PF_X, #8
- rshrn v28.8b, v8.8h, #8
-- PF tst PF_CTL, #0x0F
-+ PF tst, PF_CTL, #0x0F
- rshrn v29.8b, v9.8h, #8
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
- 10:
- rshrn v30.8b, v10.8h, #8
-- PF beq 10f
-- PF sub PF_CTL, PF_CTL, #1
-+ PF beq, 10f
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- rshrn v31.8b, v11.8h, #8
-- PF cmp PF_X, ORIG_W
-+ PF cmp, PF_X, ORIG_W
- umull v8.8h, v24.8b, v0.8b
-- PF lsl DUMMY, PF_X, #mask_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
-+ PF lsl, DUMMY, PF_X, #mask_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
- umull v9.8h, v24.8b, v1.8b
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
- 10:
- umull v10.8h, v24.8b, v2.8b
-- PF ble 10f
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF ble, 10f
-+ PF subs, PF_CTL, PF_CTL, #0x10
- 10:
- umull v11.8h, v24.8b, v3.8b
-- PF ble 10f
-- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
-- PF ldrsb DUMMY, [PF_MASK, DUMMY]
-- PF add PF_MASK, PF_MASK, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
-+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
-+ PF add, PF_MASK, PF_MASK, #1
- 10:
- st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
- ursra v8.8h, v8.8h, #8
- ursra v9.8h, v9.8h, #8
- ursra v10.8h, v10.8h, #8
- ursra v11.8h, v11.8h, #8
- .endm
-
-@@ -1486,45 +1486,45 @@ generate_composite_function \
- rshrn v28.8b, v0.8h, #8
- rshrn v29.8b, v1.8h, #8
- rshrn v30.8b, v2.8h, #8
- rshrn v31.8b, v3.8h, #8
- .endm
-
- .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
- fetch_mask_pixblock
-- PF add PF_X, PF_X, #8
-+ PF add, PF_X, PF_X, #8
- rshrn v28.8b, v0.8h, #8
-- PF tst PF_CTL, #0x0F
-+ PF tst, PF_CTL, #0x0F
- rshrn v29.8b, v1.8h, #8
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
- 10:
- rshrn v30.8b, v2.8h, #8
-- PF beq 10f
-- PF sub PF_CTL, PF_CTL, #1
-+ PF beq, 10f
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- rshrn v31.8b, v3.8h, #8
-- PF cmp PF_X, ORIG_W
-+ PF cmp, PF_X, ORIG_W
- umull v0.8h, v24.8b, v16.8b
-- PF lsl DUMMY, PF_X, mask_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
-+ PF lsl, DUMMY, PF_X, mask_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
- umull v1.8h, v25.8b, v16.8b
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
- 10:
- umull v2.8h, v26.8b, v16.8b
-- PF ble 10f
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF ble, 10f
-+ PF subs, PF_CTL, PF_CTL, #0x10
- 10:
- umull v3.8h, v27.8b, v16.8b
-- PF ble 10f
-- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
-- PF ldrsb DUMMY, [PF_MASK, DUMMY]
-- PF add PF_MASK, PF_MASK, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
-+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
-+ PF add, PF_MASK, PF_MASK, #1
- 10:
- st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
- ursra v0.8h, v0.8h, #8
- ursra v1.8h, v1.8h, #8
- ursra v2.8h, v2.8h, #8
- ursra v3.8h, v3.8h, #8
- .endm
-
-@@ -1594,54 +1594,54 @@ generate_composite_function \
- .endm
-
- .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
- urshr v16.8h, v12.8h, #8
- ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
- urshr v17.8h, v13.8h, #8
- fetch_mask_pixblock
- urshr v18.8h, v14.8h, #8
-- PF add PF_X, PF_X, #8
-+ PF add, PF_X, PF_X, #8
- urshr v19.8h, v15.8h, #8
-- PF tst PF_CTL, #0x0F
-+ PF tst, PF_CTL, #0x0F
- raddhn v28.8b, v16.8h, v12.8h
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
- 10:
- raddhn v29.8b, v17.8h, v13.8h
-- PF beq 10f
-- PF sub PF_CTL, PF_CTL, #1
-+ PF beq, 10f
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- raddhn v30.8b, v18.8h, v14.8h
-- PF cmp PF_X, ORIG_W
-+ PF cmp, PF_X, ORIG_W
- raddhn v31.8b, v19.8h, v15.8h
-- PF lsl DUMMY, PF_X, #dst_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
-+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
- umull v16.8h, v24.8b, v8.8b
-- PF lsl DUMMY, PF_X, #mask_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
-+ PF lsl, DUMMY, PF_X, #mask_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
- umull v17.8h, v24.8b, v9.8b
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
- 10:
- umull v18.8h, v24.8b, v10.8b
-- PF ble 10f
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF ble, 10f
-+ PF subs, PF_CTL, PF_CTL, #0x10
- 10:
- umull v19.8h, v24.8b, v11.8b
-- PF ble 10f
-- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
-- PF ldrsb DUMMY, [PF_DST, DUMMY]
-- PF add PF_DST, PF_DST, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
-+ PF add, PF_DST, PF_DST, #1
- 10:
- uqadd v28.8b, v0.8b, v28.8b
-- PF ble 10f
-- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
-- PF ldrsb DUMMY, [PF_MASK, DUMMY]
-- PF add PF_MASK, PF_MASK, #1
-+ PF ble, 10f
-+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
-+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
-+ PF add, PF_MASK, PF_MASK, #1
- 10:
- uqadd v29.8b, v1.8b, v29.8b
- uqadd v30.8b, v2.8b, v30.8b
- uqadd v31.8b, v3.8b, v31.8b
- urshr v12.8h, v16.8h, #8
- urshr v13.8h, v17.8h, #8
- urshr v14.8h, v18.8h, #8
- urshr v15.8h, v19.8h, #8
-@@ -2407,17 +2407,17 @@ generate_composite_function \
- generate_composite_function_single_scanline \
- pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
- pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
-- pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
-+ pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 12 /* mask_basereg */
-
- /******************************************************************************/
-
- .macro pixman_composite_over_8888_n_8888_process_pixblock_head
-@@ -2482,31 +2482,31 @@ generate_composite_function \
- pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_8888_n_8888_process_pixblock_head, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail, \
-- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
-+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 12 /* mask_basereg */
-
- generate_composite_function_single_scanline \
- pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_8888_n_8888_process_pixblock_head, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail, \
-- pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
-+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 12 /* mask_basereg */
-
- /******************************************************************************/
-
- /* TODO: expand macros and do better instructions scheduling */
-@@ -2524,17 +2524,17 @@ generate_composite_function \
- pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_8888_n_8888_process_pixblock_head, \
- pixman_composite_over_8888_n_8888_process_pixblock_tail, \
-- pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
-+ pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 0, /* src_basereg */ \
- 15 /* mask_basereg */
-
- /******************************************************************************/
-
- .macro pixman_composite_src_0888_0888_process_pixblock_head
-@@ -2675,38 +2675,38 @@ generate_composite_function \
- urshr v11.8h, v8.8h, #8
- mov v30.8b, v31.8b
- mov v31.8b, v3.8b
- mov v3.8b, v31.8b
- urshr v12.8h, v9.8h, #8
- urshr v13.8h, v10.8h, #8
- fetch_src_pixblock
- raddhn v30.8b, v11.8h, v8.8h
-- PF add PF_X, PF_X, #8
-- PF tst PF_CTL, #0xF
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-- PF sub PF_CTL, PF_CTL, #1
-+ PF add, PF_X, PF_X, #8
-+ PF tst, PF_CTL, #0xF
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- raddhn v29.8b, v12.8h, v9.8h
- raddhn v28.8b, v13.8h, v10.8h
- umull v8.8h, v3.8b, v0.8b
- umull v9.8h, v3.8b, v1.8b
- umull v10.8h, v3.8b, v2.8b
- st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
-- PF cmp PF_X, ORIG_W
-- PF lsl DUMMY, PF_X, src_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-- PF subs PF_CTL, PF_CTL, #0x10
-- PF ble 10f
-- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
-- PF ldrsb DUMMY, [PF_SRC, DUMMY]
-- PF add PF_SRC, PF_SRC, #1
-+ PF cmp, PF_X, ORIG_W
-+ PF lsl, DUMMY, PF_X, src_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
-+ PF subs, PF_CTL, PF_CTL, #0x10
-+ PF ble, 10f
-+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-+ PF add, PF_SRC, PF_SRC, #1
- 10:
- .endm
-
- generate_composite_function \
- pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
-@@ -2744,38 +2744,38 @@ generate_composite_function \
- urshr v11.8h, v8.8h, #8
- mov v30.8b, v31.8b
- mov v31.8b, v3.8b
- mov v3.8b, v30.8b
- urshr v12.8h, v9.8h, #8
- urshr v13.8h, v10.8h, #8
- fetch_src_pixblock
- raddhn v28.8b, v11.8h, v8.8h
-- PF add PF_X, PF_X, #8
-- PF tst PF_CTL, #0xF
-- PF beq 10f
-- PF add PF_X, PF_X, #8
-- PF sub PF_CTL, PF_CTL, #1
-+ PF add, PF_X, PF_X, #8
-+ PF tst, PF_CTL, #0xF
-+ PF beq, 10f
-+ PF add, PF_X, PF_X, #8
-+ PF sub, PF_CTL, PF_CTL, #1
- 10:
- raddhn v29.8b, v12.8h, v9.8h
- raddhn v30.8b, v13.8h, v10.8h
- umull v8.8h, v3.8b, v0.8b
- umull v9.8h, v3.8b, v1.8b
- umull v10.8h, v3.8b, v2.8b
- st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
-- PF cmp PF_X, ORIG_W
-- PF lsl DUMMY, PF_X, src_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
-- PF ble 10f
-- PF sub PF_X, PF_X, ORIG_W
-- PF subs PF_CTL, PF_CTL, #0x10
-- PF ble 10f
-- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
-- PF ldrsb DUMMY, [PF_SRC, DUMMY]
-- PF add PF_SRC, PF_SRC, #1
-+ PF cmp, PF_X, ORIG_W
-+ PF lsl, DUMMY, PF_X, src_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
-+ PF ble, 10f
-+ PF sub, PF_X, PF_X, ORIG_W
-+ PF subs, PF_CTL, PF_CTL, #0x10
-+ PF ble, 10f
-+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-+ PF add, PF_SRC, PF_SRC, #1
- 10:
- .endm
-
- generate_composite_function \
- pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
- FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 10, /* prefetch distance */ \
-@@ -3126,197 +3126,197 @@ generate_composite_function_nearest_scan
- * format conversion, and interpolation as separate macros which can be used
- * as the basic building blocks for constructing bilinear scanline functions.
- */
-
- .macro bilinear_load_8888 reg1, reg2, tmp
- asr TMP1, X, #16
- add X, X, UX
- add TMP1, TOP, TMP1, lsl #2
-- ld1 {&reg1&.2s}, [TMP1], STRIDE
-- ld1 {&reg2&.2s}, [TMP1]
-+ ld1 {\()\reg1\().2s}, [TMP1], STRIDE
-+ ld1 {\()\reg2\().2s}, [TMP1]
- .endm
-
- .macro bilinear_load_0565 reg1, reg2, tmp
- asr TMP1, X, #16
- add X, X, UX
- add TMP1, TOP, TMP1, lsl #1
-- ld1 {&reg2&.s}[0], [TMP1], STRIDE
-- ld1 {&reg2&.s}[1], [TMP1]
-- convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
-+ ld1 {\()\reg2\().s}[0], [TMP1], STRIDE
-+ ld1 {\()\reg2\().s}[1], [TMP1]
-+ convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
- .endm
-
- .macro bilinear_load_and_vertical_interpolate_two_8888 \
- acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
-
-- bilinear_load_8888 reg1, reg2, tmp1
-- umull &acc1&.8h, &reg1&.8b, v28.8b
-- umlal &acc1&.8h, &reg2&.8b, v29.8b
-- bilinear_load_8888 reg3, reg4, tmp2
-- umull &acc2&.8h, &reg3&.8b, v28.8b
-- umlal &acc2&.8h, &reg4&.8b, v29.8b
-+ bilinear_load_8888 \reg1, \reg2, \tmp1
-+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
-+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
-+ bilinear_load_8888 \reg3, \reg4, \tmp2
-+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
-+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
- .endm
-
- .macro bilinear_load_and_vertical_interpolate_four_8888 \
-- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
-+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-
- bilinear_load_and_vertical_interpolate_two_8888 \
-- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
-+ \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
- bilinear_load_and_vertical_interpolate_two_8888 \
-- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
-+ \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
- .endm
-
- .macro vzip reg1, reg2
- umov TMP4, v31.d[0]
-- zip1 v31.8b, reg1, reg2
-- zip2 reg2, reg1, reg2
-- mov reg1, v31.8b
-+ zip1 v31.8b, \reg1, \reg2
-+ zip2 \reg2, \reg1, \reg2
-+ mov \reg1, v31.8b
- mov v31.d[0], TMP4
- .endm
-
- .macro vuzp reg1, reg2
- umov TMP4, v31.d[0]
-- uzp1 v31.8b, reg1, reg2
-- uzp2 reg2, reg1, reg2
-- mov reg1, v31.8b
-+ uzp1 v31.8b, \reg1, \reg2
-+ uzp2 \reg2, \reg1, \reg2
-+ mov \reg1, v31.8b
- mov v31.d[0], TMP4
- .endm
-
- .macro bilinear_load_and_vertical_interpolate_two_0565 \
- acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
- asr TMP1, X, #16
- add X, X, UX
- add TMP1, TOP, TMP1, lsl #1
- asr TMP2, X, #16
- add X, X, UX
- add TMP2, TOP, TMP2, lsl #1
-- ld1 {&acc2&.s}[0], [TMP1], STRIDE
-- ld1 {&acc2&.s}[2], [TMP2], STRIDE
-- ld1 {&acc2&.s}[1], [TMP1]
-- ld1 {&acc2&.s}[3], [TMP2]
-- convert_0565_to_x888 acc2, reg3, reg2, reg1
-- vzip &reg1&.8b, &reg3&.8b
-- vzip &reg2&.8b, &reg4&.8b
-- vzip &reg3&.8b, &reg4&.8b
-- vzip &reg1&.8b, &reg2&.8b
-- umull &acc1&.8h, &reg1&.8b, v28.8b
-- umlal &acc1&.8h, &reg2&.8b, v29.8b
-- umull &acc2&.8h, &reg3&.8b, v28.8b
-- umlal &acc2&.8h, &reg4&.8b, v29.8b
-+ ld1 {\()\acc2\().s}[0], [TMP1], STRIDE
-+ ld1 {\()\acc2\().s}[2], [TMP2], STRIDE
-+ ld1 {\()\acc2\().s}[1], [TMP1]
-+ ld1 {\()\acc2\().s}[3], [TMP2]
-+ convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
-+ vzip \()\reg1\().8b, \()\reg3\().8b
-+ vzip \()\reg2\().8b, \()\reg4\().8b
-+ vzip \()\reg3\().8b, \()\reg4\().8b
-+ vzip \()\reg1\().8b, \()\reg2\().8b
-+ umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
-+ umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
-+ umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
-+ umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
- .endm
-
- .macro bilinear_load_and_vertical_interpolate_four_0565 \
-- xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
-+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
- yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
- asr TMP1, X, #16
- add X, X, UX
- add TMP1, TOP, TMP1, lsl #1
- asr TMP2, X, #16
- add X, X, UX
- add TMP2, TOP, TMP2, lsl #1
-- ld1 {&xacc2&.s}[0], [TMP1], STRIDE
-- ld1 {&xacc2&.s}[2], [TMP2], STRIDE
-- ld1 {&xacc2&.s}[1], [TMP1]
-- ld1 {&xacc2&.s}[3], [TMP2]
-- convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
-+ ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE
-+ ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE
-+ ld1 {\()\xacc2\().s}[1], [TMP1]
-+ ld1 {\()\xacc2\().s}[3], [TMP2]
-+ convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
- asr TMP1, X, #16
- add X, X, UX
- add TMP1, TOP, TMP1, lsl #1
- asr TMP2, X, #16
- add X, X, UX
- add TMP2, TOP, TMP2, lsl #1
-- ld1 {&yacc2&.s}[0], [TMP1], STRIDE
-- vzip &xreg1&.8b, &xreg3&.8b
-- ld1 {&yacc2&.s}[2], [TMP2], STRIDE
-- vzip &xreg2&.8b, &xreg4&.8b
-- ld1 {&yacc2&.s}[1], [TMP1]
-- vzip &xreg3&.8b, &xreg4&.8b
-- ld1 {&yacc2&.s}[3], [TMP2]
-- vzip &xreg1&.8b, &xreg2&.8b
-- convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
-- umull &xacc1&.8h, &xreg1&.8b, v28.8b
-- vzip &yreg1&.8b, &yreg3&.8b
-- umlal &xacc1&.8h, &xreg2&.8b, v29.8b
-- vzip &yreg2&.8b, &yreg4&.8b
-- umull &xacc2&.8h, &xreg3&.8b, v28.8b
-- vzip &yreg3&.8b, &yreg4&.8b
-- umlal &xacc2&.8h, &xreg4&.8b, v29.8b
-- vzip &yreg1&.8b, &yreg2&.8b
-- umull &yacc1&.8h, &yreg1&.8b, v28.8b
-- umlal &yacc1&.8h, &yreg2&.8b, v29.8b
-- umull &yacc2&.8h, &yreg3&.8b, v28.8b
-- umlal &yacc2&.8h, &yreg4&.8b, v29.8b
-+ ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE
-+ vzip \()\xreg1\().8b, \()\xreg3\().8b
-+ ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE
-+ vzip \()\xreg2\().8b, \()\xreg4\().8b
-+ ld1 {\()\yacc2\().s}[1], [TMP1]
-+ vzip \()\xreg3\().8b, \()\xreg4\().8b
-+ ld1 {\()\yacc2\().s}[3], [TMP2]
-+ vzip \()\xreg1\().8b, \()\xreg2\().8b
-+ convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
-+ umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
-+ vzip \()\yreg1\().8b, \()\yreg3\().8b
-+ umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
-+ vzip \()\yreg2\().8b, \()\yreg4\().8b
-+ umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
-+ vzip \()\yreg3\().8b, \()\yreg4\().8b
-+ umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
-+ vzip \()\yreg1\().8b, \()\yreg2\().8b
-+ umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
-+ umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
-+ umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
-+ umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
- .endm
-
- .macro bilinear_store_8888 numpix, tmp1, tmp2
--.if numpix == 4
-+.if \numpix == 4
- st1 {v0.2s, v1.2s}, [OUT], #16
--.elseif numpix == 2
-+.elseif \numpix == 2
- st1 {v0.2s}, [OUT], #8
--.elseif numpix == 1
-+.elseif \numpix == 1
- st1 {v0.s}[0], [OUT], #4
- .else
-- .error bilinear_store_8888 numpix is unsupported
-+ .error bilinear_store_8888 \numpix is unsupported
- .endif
- .endm
-
- .macro bilinear_store_0565 numpix, tmp1, tmp2
- vuzp v0.8b, v1.8b
- vuzp v2.8b, v3.8b
- vuzp v1.8b, v3.8b
- vuzp v0.8b, v2.8b
-- convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
--.if numpix == 4
-+ convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
-+.if \numpix == 4
- st1 {v1.4h}, [OUT], #8
--.elseif numpix == 2
-+.elseif \numpix == 2
- st1 {v1.s}[0], [OUT], #4
--.elseif numpix == 1
-+.elseif \numpix == 1
- st1 {v1.h}[0], [OUT], #2
- .else
-- .error bilinear_store_0565 numpix is unsupported
-+ .error bilinear_store_0565 \numpix is unsupported
- .endif
- .endm
-
- .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
-- bilinear_load_&src_fmt v0, v1, v2
-+ bilinear_load_\()\src_fmt v0, v1, v2
- umull v2.8h, v0.8b, v28.8b
- umlal v2.8h, v1.8b, v29.8b
- /* 5 cycles bubble */
- ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
- umlsl v0.4s, v2.4h, v15.h[0]
- umlal2 v0.4s, v2.8h, v15.h[0]
- /* 5 cycles bubble */
- shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- /* 3 cycles bubble */
- xtn v0.8b, v0.8h
- /* 1 cycle bubble */
-- bilinear_store_&dst_fmt 1, v3, v4
-+ bilinear_store_\()\dst_fmt 1, v3, v4
- .endm
-
- .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
-- bilinear_load_and_vertical_interpolate_two_&src_fmt \
-+ bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
- v1, v11, v2, v3, v20, v21, v22, v23
- ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
- umlsl v0.4s, v1.4h, v15.h[0]
- umlal2 v0.4s, v1.8h, v15.h[0]
- ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
- umlsl v10.4s, v11.4h, v15.h[4]
- umlal2 v10.4s, v11.8h, v15.h[4]
- shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
- add v12.8h, v12.8h, v13.8h
- xtn v0.8b, v0.8h
-- bilinear_store_&dst_fmt 2, v3, v4
-+ bilinear_store_\()\dst_fmt 2, v3, v4
- .endm
-
- .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
-- bilinear_load_and_vertical_interpolate_four_&src_fmt \
-- v1, v11, v14, v20, v16, v17, v22, v23 \
-+ bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
-+ v1, v11, v14, v20, v16, v17, v22, v23, \
- v3, v9, v24, v25, v26, v27, v18, v19
- prfm PREFETCH_MODE, [TMP1, PF_OFFS]
- sub TMP1, TMP1, STRIDE
- ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
- umlsl v0.4s, v1.4h, v15.h[0]
- umlal2 v0.4s, v1.8h, v15.h[0]
- ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
- umlsl v10.4s, v11.4h, v15.h[4]
-@@ -3333,64 +3333,64 @@ generate_composite_function_nearest_scan
- shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
- ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
- xtn v0.8b, v0.8h
- xtn v1.8b, v2.8h
- add v12.8h, v12.8h, v13.8h
-- bilinear_store_&dst_fmt 4, v3, v4
-+ bilinear_store_\()\dst_fmt 4, v3, v4
- .endm
-
- .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
--.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
-- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
-+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
-+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
- .else
-- bilinear_interpolate_four_pixels src_fmt, dst_fmt
-+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
- .endif
- .endm
-
- .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
--.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
-- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
-+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
-+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
- .endif
- .endm
-
- .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
--.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
-- bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
-+.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
-+ bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
- .else
-- bilinear_interpolate_four_pixels src_fmt, dst_fmt
-+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
- .endif
- .endm
-
- .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
--.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
-- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
-+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
-+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
- .else
-- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
-- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
-+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
- .endif
- .endm
-
- .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
--.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
-- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
-+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
-+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
- .else
-- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
-+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
- .endif
- .endm
-
- .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
--.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
-- bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
-+.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
-+ bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
- .else
-- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
-+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
- .endif
- .endm
-
- .set BILINEAR_FLAG_UNROLL_4, 0
- .set BILINEAR_FLAG_UNROLL_8, 1
- .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
-
- /*
-@@ -3405,17 +3405,17 @@ generate_composite_function_nearest_scan
- * prefetch_distance - prefetch in the source image by that many
- * pixels ahead
- */
-
- .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
- src_bpp_shift, dst_bpp_shift, \
- prefetch_distance, flags
-
--pixman_asm_function fname
-+pixman_asm_function \fname
- OUT .req x0
- TOP .req x1
- BOTTOM .req x2
- WT .req x3
- WB .req x4
- X .req x5
- UX .req x6
- WIDTH .req x7
-@@ -3437,17 +3437,17 @@ pixman_asm_function fname
- sub sp, sp, 112 /* push all registers */
- sub x29, x29, 64
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
- stp x8, x9, [x29, -80]
- stp x10, x11, [x29, -96]
- stp x12, x13, [x29, -112]
-
-- mov PF_OFFS, #prefetch_distance
-+ mov PF_OFFS, #\prefetch_distance
- mul PF_OFFS, PF_OFFS, UX
-
- subs STRIDE, BOTTOM, TOP
- .unreq BOTTOM
-
- cmp WIDTH, #0
- ble 300f
-
-@@ -3458,85 +3458,85 @@ pixman_asm_function fname
- mov v25.d[0], v12.d[1]
- mov v26.d[0], v13.d[0]
- add v25.4h, v25.4h, v26.4h
- mov v12.d[1], v25.d[0]
-
- /* ensure good destination alignment */
- cmp WIDTH, #1
- blt 100f
-- tst OUT, #(1 << dst_bpp_shift)
-+ tst OUT, #(1 << \dst_bpp_shift)
- beq 100f
- ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
- add v12.8h, v12.8h, v13.8h
-- bilinear_interpolate_last_pixel src_fmt, dst_fmt
-+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
- sub WIDTH, WIDTH, #1
- 100:
- add v13.8h, v13.8h, v13.8h
- ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
- add v12.8h, v12.8h, v13.8h
-
- cmp WIDTH, #2
- blt 100f
-- tst OUT, #(1 << (dst_bpp_shift + 1))
-+ tst OUT, #(1 << (\dst_bpp_shift + 1))
- beq 100f
-- bilinear_interpolate_two_pixels src_fmt, dst_fmt
-+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
- sub WIDTH, WIDTH, #2
- 100:
--.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
-+.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
- /*********** 8 pixels per iteration *****************/
- cmp WIDTH, #4
- blt 100f
-- tst OUT, #(1 << (dst_bpp_shift + 2))
-+ tst OUT, #(1 << (\dst_bpp_shift + 2))
- beq 100f
-- bilinear_interpolate_four_pixels src_fmt, dst_fmt
-+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
- sub WIDTH, WIDTH, #4
- 100:
- subs WIDTH, WIDTH, #8
- blt 100f
-- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
-- bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
-+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
-+ bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
- subs WIDTH, WIDTH, #8
- blt 500f
- 1000:
-- bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
-+ bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
- subs WIDTH, WIDTH, #8
- bge 1000b
- 500:
-- bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
-+ bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
- 100:
- tst WIDTH, #4
- beq 200f
-- bilinear_interpolate_four_pixels src_fmt, dst_fmt
-+ bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
- 200:
- .else
- /*********** 4 pixels per iteration *****************/
- subs WIDTH, WIDTH, #4
- blt 100f
-- asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
-- bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
-+ asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
-+ bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
- subs WIDTH, WIDTH, #4
- blt 500f
- 1000:
-- bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
-+ bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
- subs WIDTH, WIDTH, #4
- bge 1000b
- 500:
-- bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
-+ bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
- 100:
- /****************************************************/
- .endif
- /* handle the remaining trailing pixels */
- tst WIDTH, #2
- beq 200f
-- bilinear_interpolate_two_pixels src_fmt, dst_fmt
-+ bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
- 200:
- tst WIDTH, #1
- beq 300f
-- bilinear_interpolate_last_pixel src_fmt, dst_fmt
-+ bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
- 300:
- sub x29, x29, 64
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
- ldp x8, x9, [x29, -80]
- ldp x10, x11, [x29, -96]
- ldp x12, x13, [x29, -104]
- mov sp, x29
-@@ -3551,17 +3551,17 @@ 300:
- .unreq UX
- .unreq WIDTH
- .unreq TMP1
- .unreq TMP2
- .unreq PF_OFFS
- .unreq TMP3
- .unreq TMP4
- .unreq STRIDE
--.endfunc
-+pixman_end_asm_function
-
- .endm
-
- /*****************************************************************************/
-
- .set have_bilinear_interpolate_four_pixels_8888_8888, 1
-
- .macro bilinear_interpolate_four_pixels_8888_8888_head
-diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h
---- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h
-+++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h
-@@ -75,340 +75,340 @@
- #define PREFETCH_MODE pldl1keep
-
- /*
- * Definitions of supplementary pixld/pixst macros (for partial load/store of
- * pixel data).
- */
-
- .macro pixldst1 op, elem_size, reg1, mem_operand, abits
-- op {v&reg1&.&elem_size}, [&mem_operand&], #8
-+ \op {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8
- .endm
-
- .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
-- op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16
-+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size}, [\()\mem_operand\()], #16
- .endm
-
- .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
-- op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32
-+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size, v\()\reg4\().\()\elem_size}, [\()\mem_operand\()], #32
- .endm
-
- .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
-- op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes&
-+ \op {v\()\reg1\().\()\elem_size}[\idx], [\()\mem_operand\()], #\()\bytes\()
- .endm
-
- .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
-- op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24
-+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}, [\()\mem_operand\()], #24
- .endm
-
- .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
-- op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3
-+ \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}[\idx], [\()\mem_operand\()], #3
- .endm
-
- .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
--.if numbytes == 32
-- .if elem_size==32
-- pixldst4 op, 2s, %(basereg+4), %(basereg+5), \
-- %(basereg+6), %(basereg+7), mem_operand, abits
-- .elseif elem_size==16
-- pixldst4 op, 4h, %(basereg+4), %(basereg+5), \
-- %(basereg+6), %(basereg+7), mem_operand, abits
-+.if \numbytes == 32
-+ .if \elem_size==32
-+ pixldst4 \op, 2s, %(\basereg+4), %(\basereg+5), \
-+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
-+ .elseif \elem_size==16
-+ pixldst4 \op, 4h, %(\basereg+4), %(\basereg+5), \
-+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
- .else
-- pixldst4 op, 8b, %(basereg+4), %(basereg+5), \
-- %(basereg+6), %(basereg+7), mem_operand, abits
-+ pixldst4 \op, 8b, %(\basereg+4), %(\basereg+5), \
-+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
- .endif
--.elseif numbytes == 16
-- .if elem_size==32
-- pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits
-- .elseif elem_size==16
-- pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits
-+.elseif \numbytes == 16
-+ .if \elem_size==32
-+ pixldst2 \op, 2s, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
-+ .elseif \elem_size==16
-+ pixldst2 \op, 4h, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
- .else
-- pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits
-+ pixldst2 \op, 8b, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
- .endif
--.elseif numbytes == 8
-- .if elem_size==32
-- pixldst1 op, 2s, %(basereg+1), mem_operand, abits
-- .elseif elem_size==16
-- pixldst1 op, 4h, %(basereg+1), mem_operand, abits
-+.elseif \numbytes == 8
-+ .if \elem_size==32
-+ pixldst1 \op, 2s, %(\basereg+1), \mem_operand, \abits
-+ .elseif \elem_size==16
-+ pixldst1 \op, 4h, %(\basereg+1), \mem_operand, \abits
- .else
-- pixldst1 op, 8b, %(basereg+1), mem_operand, abits
-+ pixldst1 \op, 8b, %(\basereg+1), \mem_operand, \abits
- .endif
--.elseif numbytes == 4
-- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
-- pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4
-- .elseif elem_size == 16
-- pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2
-- pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2
-+.elseif \numbytes == 4
-+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
-+ pixldst0 \op, s, %(\basereg+0), 1, \mem_operand, \abits, 4
-+ .elseif \elem_size == 16
-+ pixldst0 \op, h, %(\basereg+0), 2, \mem_operand, \abits, 2
-+ pixldst0 \op, h, %(\basereg+0), 3, \mem_operand, \abits, 2
- .else
-- pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1
-- pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1
-- pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1
-- pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1
-+ pixldst0 \op, b, %(\basereg+0), 4, \mem_operand, \abits, 1
-+ pixldst0 \op, b, %(\basereg+0), 5, \mem_operand, \abits, 1
-+ pixldst0 \op, b, %(\basereg+0), 6, \mem_operand, \abits, 1
-+ pixldst0 \op, b, %(\basereg+0), 7, \mem_operand, \abits, 1
- .endif
--.elseif numbytes == 2
-- .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
-- pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2
-+.elseif \numbytes == 2
-+ .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
-+ pixldst0 \op, h, %(\basereg+0), 1, \mem_operand, \abits, 2
- .else
-- pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1
-- pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1
-+ pixldst0 \op, b, %(\basereg+0), 2, \mem_operand, \abits, 1
-+ pixldst0 \op, b, %(\basereg+0), 3, \mem_operand, \abits, 1
- .endif
--.elseif numbytes == 1
-- pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1
-+.elseif \numbytes == 1
-+ pixldst0 \op, b, %(\basereg+0), 1, \mem_operand, \abits, 1
- .else
-- .error "unsupported size: numbytes"
-+ .error "unsupported size: \numbytes"
- .endif
- .endm
-
- .macro pixld numpix, bpp, basereg, mem_operand, abits=0
--.if bpp > 0
--.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-- pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \
-- %(basereg+6), %(basereg+7), mem_operand, abits
--.elseif (bpp == 24) && (numpix == 8)
-- pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
--.elseif (bpp == 24) && (numpix == 4)
-- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
-- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
-- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
-- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
--.elseif (bpp == 24) && (numpix == 2)
-- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
-- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
--.elseif (bpp == 24) && (numpix == 1)
-- pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
-+.if \bpp > 0
-+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-+ pixldst4 ld4, 8b, %(\basereg+4), %(\basereg+5), \
-+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
-+.elseif (\bpp == 24) && (\numpix == 8)
-+ pixldst3 ld3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
-+.elseif (\bpp == 24) && (\numpix == 4)
-+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
-+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
-+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
-+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
-+.elseif (\bpp == 24) && (\numpix == 2)
-+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
-+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
-+.elseif (\bpp == 24) && (\numpix == 1)
-+ pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
- .else
-- pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits
-+ pixldst %(\numpix * \bpp / 8), ld1, %(\bpp), \basereg, \mem_operand, \abits
- .endif
- .endif
- .endm
-
- .macro pixst numpix, bpp, basereg, mem_operand, abits=0
--.if bpp > 0
--.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-- pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \
-- %(basereg+6), %(basereg+7), mem_operand, abits
--.elseif (bpp == 24) && (numpix == 8)
-- pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
--.elseif (bpp == 24) && (numpix == 4)
-- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
-- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
-- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
-- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
--.elseif (bpp == 24) && (numpix == 2)
-- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
-- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
--.elseif (bpp == 24) && (numpix == 1)
-- pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
--.elseif numpix * bpp == 32 && abits == 32
-- pixldst 4, st1, 32, basereg, mem_operand, abits
--.elseif numpix * bpp == 16 && abits == 16
-- pixldst 2, st1, 16, basereg, mem_operand, abits
-+.if \bpp > 0
-+.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-+ pixldst4 st4, 8b, %(\basereg+4), %(\basereg+5), \
-+ %(\basereg+6), %(\basereg+7), \mem_operand, \abits
-+.elseif (\bpp == 24) && (\numpix == 8)
-+ pixldst3 st3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
-+.elseif (\bpp == 24) && (\numpix == 4)
-+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
-+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
-+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
-+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
-+.elseif (\bpp == 24) && (\numpix == 2)
-+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
-+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
-+.elseif (\bpp == 24) && (\numpix == 1)
-+ pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
-+.elseif \numpix * \bpp == 32 && \abits == 32
-+ pixldst 4, st1, 32, \basereg, \mem_operand, \abits
-+.elseif \numpix * \bpp == 16 && \abits == 16
-+ pixldst 2, st1, 16, \basereg, \mem_operand, \abits
- .else
-- pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits
-+ pixldst %(\numpix * \bpp / 8), st1, %(\bpp), \basereg, \mem_operand, \abits
- .endif
- .endif
- .endm
-
- .macro pixld_a numpix, bpp, basereg, mem_operand
--.if (bpp * numpix) <= 128
-- pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
-+.if (\bpp * \numpix) <= 128
-+ pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
- .else
-- pixld numpix, bpp, basereg, mem_operand, 128
-+ pixld \numpix, \bpp, \basereg, \mem_operand, 128
- .endif
- .endm
-
- .macro pixst_a numpix, bpp, basereg, mem_operand
--.if (bpp * numpix) <= 128
-- pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
-+.if (\bpp * \numpix) <= 128
-+ pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
- .else
-- pixst numpix, bpp, basereg, mem_operand, 128
-+ pixst \numpix, \bpp, \basereg, \mem_operand, 128
- .endif
- .endm
-
- /*
- * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
- * aliases to be defined)
- */
- .macro pixld1_s elem_size, reg1, mem_operand
--.if elem_size == 16
-+.if \elem_size == 16
- asr TMP1, VX, #16
- adds VX, VX, UNIT_X
- bmi 55f
- 5: subs VX, VX, SRC_WIDTH_FIXED
- bpl 5b
- 55:
-- add TMP1, mem_operand, TMP1, lsl #1
-+ add TMP1, \mem_operand, TMP1, lsl #1
- asr TMP2, VX, #16
- adds VX, VX, UNIT_X
- bmi 55f
- 5: subs VX, VX, SRC_WIDTH_FIXED
- bpl 5b
- 55:
-- add TMP2, mem_operand, TMP2, lsl #1
-- ld1 {v&reg1&.h}[0], [TMP1]
-+ add TMP2, \mem_operand, TMP2, lsl #1
-+ ld1 {v\()\reg1\().h}[0], [TMP1]
- asr TMP1, VX, #16
- adds VX, VX, UNIT_X
- bmi 55f
- 5: subs VX, VX, SRC_WIDTH_FIXED
- bpl 5b
- 55:
-- add TMP1, mem_operand, TMP1, lsl #1
-- ld1 {v&reg1&.h}[1], [TMP2]
-+ add TMP1, \mem_operand, TMP1, lsl #1
-+ ld1 {v\()\reg1\().h}[1], [TMP2]
- asr TMP2, VX, #16
- adds VX, VX, UNIT_X
- bmi 55f
- 5: subs VX, VX, SRC_WIDTH_FIXED
- bpl 5b
- 55:
-- add TMP2, mem_operand, TMP2, lsl #1
-- ld1 {v&reg1&.h}[2], [TMP1]
-- ld1 {v&reg1&.h}[3], [TMP2]
--.elseif elem_size == 32
-+ add TMP2, \mem_operand, TMP2, lsl #1
-+ ld1 {v\()\reg1\().h}[2], [TMP1]
-+ ld1 {v\()\reg1\().h}[3], [TMP2]
-+.elseif \elem_size == 32
- asr TMP1, VX, #16
- adds VX, VX, UNIT_X
- bmi 55f
- 5: subs VX, VX, SRC_WIDTH_FIXED
- bpl 5b
- 55:
-- add TMP1, mem_operand, TMP1, lsl #2
-+ add TMP1, \mem_operand, TMP1, lsl #2
- asr TMP2, VX, #16
- adds VX, VX, UNIT_X
- bmi 55f
- 5: subs VX, VX, SRC_WIDTH_FIXED
- bpl 5b
- 55:
-- add TMP2, mem_operand, TMP2, lsl #2
-- ld1 {v&reg1&.s}[0], [TMP1]
-- ld1 {v&reg1&.s}[1], [TMP2]
-+ add TMP2, \mem_operand, TMP2, lsl #2
-+ ld1 {v\()\reg1\().s}[0], [TMP1]
-+ ld1 {v\()\reg1\().s}[1], [TMP2]
- .else
- .error "unsupported"
- .endif
- .endm
-
- .macro pixld2_s elem_size, reg1, reg2, mem_operand
--.if 0 /* elem_size == 32 */
-+.if 0 /* \elem_size == 32 */
- mov TMP1, VX, asr #16
- add VX, VX, UNIT_X, asl #1
-- add TMP1, mem_operand, TMP1, asl #2
-+ add TMP1, \mem_operand, TMP1, asl #2
- mov TMP2, VX, asr #16
- sub VX, VX, UNIT_X
-- add TMP2, mem_operand, TMP2, asl #2
-- ld1 {v&reg1&.s}[0], [TMP1]
-+ add TMP2, \mem_operand, TMP2, asl #2
-+ ld1 {v\()\reg1\().s}[0], [TMP1]
- mov TMP1, VX, asr #16
- add VX, VX, UNIT_X, asl #1
-- add TMP1, mem_operand, TMP1, asl #2
-- ld1 {v&reg2&.s}[0], [TMP2, :32]
-+ add TMP1, \mem_operand, TMP1, asl #2
-+ ld1 {v\()\reg2\().s}[0], [TMP2, :32]
- mov TMP2, VX, asr #16
- add VX, VX, UNIT_X
-- add TMP2, mem_operand, TMP2, asl #2
-- ld1 {v&reg1&.s}[1], [TMP1]
-- ld1 {v&reg2&.s}[1], [TMP2]
-+ add TMP2, \mem_operand, TMP2, asl #2
-+ ld1 {v\()\reg1\().s}[1], [TMP1]
-+ ld1 {v\()\reg2\().s}[1], [TMP2]
- .else
-- pixld1_s elem_size, reg1, mem_operand
-- pixld1_s elem_size, reg2, mem_operand
-+ pixld1_s \elem_size, \reg1, \mem_operand
-+ pixld1_s \elem_size, \reg2, \mem_operand
- .endif
- .endm
-
- .macro pixld0_s elem_size, reg1, idx, mem_operand
--.if elem_size == 16
-+.if \elem_size == 16
- asr TMP1, VX, #16
- adds VX, VX, UNIT_X
- bmi 55f
- 5: subs VX, VX, SRC_WIDTH_FIXED
- bpl 5b
- 55:
-- add TMP1, mem_operand, TMP1, lsl #1
-- ld1 {v&reg1&.h}[idx], [TMP1]
--.elseif elem_size == 32
-+ add TMP1, \mem_operand, TMP1, lsl #1
-+ ld1 {v\()\reg1\().h}[\idx], [TMP1]
-+.elseif \elem_size == 32
- asr DUMMY, VX, #16
- mov TMP1, DUMMY
- adds VX, VX, UNIT_X
- bmi 55f
- 5: subs VX, VX, SRC_WIDTH_FIXED
- bpl 5b
- 55:
-- add TMP1, mem_operand, TMP1, lsl #2
-- ld1 {v&reg1&.s}[idx], [TMP1]
-+ add TMP1, \mem_operand, TMP1, lsl #2
-+ ld1 {v\()\reg1\().s}[\idx], [TMP1]
- .endif
- .endm
-
- .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
--.if numbytes == 32
-- pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
-- pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
-- pixdeinterleave elem_size, %(basereg+4)
--.elseif numbytes == 16
-- pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
--.elseif numbytes == 8
-- pixld1_s elem_size, %(basereg+1), mem_operand
--.elseif numbytes == 4
-- .if elem_size == 32
-- pixld0_s elem_size, %(basereg+0), 1, mem_operand
-- .elseif elem_size == 16
-- pixld0_s elem_size, %(basereg+0), 2, mem_operand
-- pixld0_s elem_size, %(basereg+0), 3, mem_operand
-+.if \numbytes == 32
-+ pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
-+ pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
-+ pixdeinterleave \elem_size, %(\basereg+4)
-+.elseif \numbytes == 16
-+ pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
-+.elseif \numbytes == 8
-+ pixld1_s \elem_size, %(\basereg+1), \mem_operand
-+.elseif \numbytes == 4
-+ .if \elem_size == 32
-+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
-+ .elseif \elem_size == 16
-+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
- .else
-- pixld0_s elem_size, %(basereg+0), 4, mem_operand
-- pixld0_s elem_size, %(basereg+0), 5, mem_operand
-- pixld0_s elem_size, %(basereg+0), 6, mem_operand
-- pixld0_s elem_size, %(basereg+0), 7, mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
- .endif
--.elseif numbytes == 2
-- .if elem_size == 16
-- pixld0_s elem_size, %(basereg+0), 1, mem_operand
-+.elseif \numbytes == 2
-+ .if \elem_size == 16
-+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
- .else
-- pixld0_s elem_size, %(basereg+0), 2, mem_operand
-- pixld0_s elem_size, %(basereg+0), 3, mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
-+ pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
- .endif
--.elseif numbytes == 1
-- pixld0_s elem_size, %(basereg+0), 1, mem_operand
-+.elseif \numbytes == 1
-+ pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
- .else
-- .error "unsupported size: numbytes"
-+ .error "unsupported size: \numbytes"
- .endif
- .endm
-
- .macro pixld_s numpix, bpp, basereg, mem_operand
--.if bpp > 0
-- pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
-+.if \bpp > 0
-+ pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
- .endif
- .endm
-
- .macro vuzp8 reg1, reg2
- umov DUMMY, v16.d[0]
-- uzp1 v16.8b, v&reg1&.8b, v&reg2&.8b
-- uzp2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
-- mov v&reg1&.8b, v16.8b
-+ uzp1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
-+ uzp2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
-+ mov v\()\reg1\().8b, v16.8b
- mov v16.d[0], DUMMY
- .endm
-
- .macro vzip8 reg1, reg2
- umov DUMMY, v16.d[0]
-- zip1 v16.8b, v&reg1&.8b, v&reg2&.8b
-- zip2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
-- mov v&reg1&.8b, v16.8b
-+ zip1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
-+ zip2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
-+ mov v\()\reg1\().8b, v16.8b
- mov v16.d[0], DUMMY
- .endm
-
- /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
- .macro pixdeinterleave bpp, basereg
--.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-- vuzp8 %(basereg+0), %(basereg+1)
-- vuzp8 %(basereg+2), %(basereg+3)
-- vuzp8 %(basereg+1), %(basereg+3)
-- vuzp8 %(basereg+0), %(basereg+2)
-+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-+ vuzp8 %(\basereg+0), %(\basereg+1)
-+ vuzp8 %(\basereg+2), %(\basereg+3)
-+ vuzp8 %(\basereg+1), %(\basereg+3)
-+ vuzp8 %(\basereg+0), %(\basereg+2)
- .endif
- .endm
-
- /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
- .macro pixinterleave bpp, basereg
--.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-- vzip8 %(basereg+0), %(basereg+2)
-- vzip8 %(basereg+1), %(basereg+3)
-- vzip8 %(basereg+2), %(basereg+3)
-- vzip8 %(basereg+0), %(basereg+1)
-+.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
-+ vzip8 %(\basereg+0), %(\basereg+2)
-+ vzip8 %(\basereg+1), %(\basereg+3)
-+ vzip8 %(\basereg+2), %(\basereg+3)
-+ vzip8 %(\basereg+0), %(\basereg+1)
- .endif
- .endm
-
- /*
- * This is a macro for implementing cache preload. The main idea is that
- * cache preload logic is mostly independent from the rest of pixels
- * processing code. It starts at the top left pixel and moves forward
- * across pixels and can jump across scanlines. Prefetch distance is
-@@ -432,62 +432,62 @@ 55:
- * for almost zero cost!
- *
- * (*) The overhead of the prefetcher is visible when running some trivial
- * pixels processing like simple copy. Anyway, having prefetch is a must
- * when working with the graphics data.
- */
- .macro PF a, x:vararg
- .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
-- a x
-+ \a \x
- .endif
- .endm
-
- .macro cache_preload std_increment, boost_increment
- .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
--.if std_increment != 0
-- PF add PF_X, PF_X, #std_increment
-+.if \std_increment != 0
-+ PF add, PF_X, PF_X, #\std_increment
- .endif
-- PF tst PF_CTL, #0xF
-- PF beq 71f
-- PF add PF_X, PF_X, #boost_increment
-- PF sub PF_CTL, PF_CTL, #1
-+ PF tst, PF_CTL, #0xF
-+ PF beq, 71f
-+ PF add, PF_X, PF_X, #\boost_increment
-+ PF sub, PF_CTL, PF_CTL, #1
- 71:
-- PF cmp PF_X, ORIG_W
-+ PF cmp, PF_X, ORIG_W
- .if src_bpp_shift >= 0
-- PF lsl DUMMY, PF_X, #src_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
-+ PF lsl, DUMMY, PF_X, #src_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
- .endif
- .if dst_r_bpp != 0
-- PF lsl DUMMY, PF_X, #dst_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
-+ PF lsl, DUMMY, PF_X, #dst_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
- .endif
- .if mask_bpp_shift >= 0
-- PF lsl DUMMY, PF_X, #mask_bpp_shift
-- PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
-+ PF lsl, DUMMY, PF_X, #mask_bpp_shift
-+ PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
- .endif
-- PF ble 71f
-- PF sub PF_X, PF_X, ORIG_W
-- PF subs PF_CTL, PF_CTL, #0x10
-+ PF ble, 71f
-+ PF sub, PF_X, PF_X, ORIG_W
-+ PF subs, PF_CTL, PF_CTL, #0x10
- 71:
-- PF ble 72f
-+ PF ble, 72f
- .if src_bpp_shift >= 0
-- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
-- PF ldrsb DUMMY, [PF_SRC, DUMMY]
-- PF add PF_SRC, PF_SRC, #1
-+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-+ PF ldrsb, DUMMY, [PF_SRC, DUMMY]
-+ PF add, PF_SRC, PF_SRC, #1
- .endif
- .if dst_r_bpp != 0
-- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
-- PF ldrsb DUMMY, [PF_DST, DUMMY]
-- PF add PF_DST, PF_DST, #1
-+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-+ PF ldrsb, DUMMY, [PF_DST, DUMMY]
-+ PF add, PF_DST, PF_DST, #1
- .endif
- .if mask_bpp_shift >= 0
-- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
-- PF ldrsb DUMMY, [PF_MASK, DUMMY]
-- PF add PF_MASK, PF_MASK, #1
-+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
-+ PF ldrsb, DUMMY, [PF_MASK, DUMMY]
-+ PF add, PF_MASK, PF_MASK, #1
- .endif
- 72:
- .endif
- .endm
-
- .macro cache_preload_simple
- .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
- .if src_bpp > 0
-@@ -516,56 +516,56 @@ 72:
- process_pixblock_tail, \
- process_pixblock_tail_head
- .if dst_w_bpp != 24
- tst DST_R, #0xF
- beq 52f
-
- .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
- .irp lowbit, 1, 2, 4, 8, 16
--local skip1
--.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
--.if lowbit < 16 /* we don't need more than 16-byte alignment */
-- tst DST_R, #lowbit
-+
-+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
-+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
-+ tst DST_R, #\lowbit
- beq 51f
- .endif
-- pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
-- pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
-+ pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
-+ pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
- .if dst_r_bpp > 0
-- pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
-+ pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
- .else
-- add DST_R, DST_R, #lowbit
-+ add DST_R, DST_R, #\lowbit
- .endif
-- PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
-- sub W, W, #(lowbit * 8 / dst_w_bpp)
-+ PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
-+ sub W, W, #(\lowbit * 8 / dst_w_bpp)
- 51:
- .endif
- .endr
- .endif
- pixdeinterleave src_bpp, src_basereg
- pixdeinterleave mask_bpp, mask_basereg
- pixdeinterleave dst_r_bpp, dst_r_basereg
-
-- process_pixblock_head
-+ \process_pixblock_head
- cache_preload 0, pixblock_size
- cache_preload_simple
-- process_pixblock_tail
-+ \process_pixblock_tail
-
- pixinterleave dst_w_bpp, dst_w_basereg
-
- .irp lowbit, 1, 2, 4, 8, 16
--.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
--.if lowbit < 16 /* we don't need more than 16-byte alignment */
-- tst DST_W, #lowbit
-+.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
-+.if \lowbit < 16 /* we don't need more than 16-byte alignment */
-+ tst DST_W, #\lowbit
- beq 51f
- .endif
- .if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0
-- sub W, W, #(lowbit * 8 / dst_w_bpp)
-+ sub W, W, #(\lowbit * 8 / dst_w_bpp)
- .endif
-- pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
-+ pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
- 51:
- .endif
- .endr
- .endif
- 52:
- .endm
-
- /*
-@@ -587,52 +587,52 @@ 52:
- dst_aligned_flag, \
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head
- tst W, #(pixblock_size - 1)
- beq 52f
- .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
- .irp chunk_size, 16, 8, 4, 2, 1
--.if pixblock_size > chunk_size
-- tst W, #chunk_size
-+.if pixblock_size > \chunk_size
-+ tst W, #\chunk_size
- beq 51f
-- pixld_src chunk_size, src_bpp, src_basereg, SRC
-- pixld chunk_size, mask_bpp, mask_basereg, MASK
--.if dst_aligned_flag != 0
-- pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
-+ pixld_src \chunk_size, src_bpp, src_basereg, SRC
-+ pixld \chunk_size, mask_bpp, mask_basereg, MASK
-+.if \dst_aligned_flag != 0
-+ pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
- .else
-- pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
-+ pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
- .endif
--.if cache_preload_flag != 0
-- PF add PF_X, PF_X, #chunk_size
-+.if \cache_preload_flag != 0
-+ PF add, PF_X, PF_X, #\chunk_size
- .endif
- 51:
- .endif
- .endr
- .endif
- pixdeinterleave src_bpp, src_basereg
- pixdeinterleave mask_bpp, mask_basereg
- pixdeinterleave dst_r_bpp, dst_r_basereg
-
-- process_pixblock_head
--.if cache_preload_flag != 0
-+ \process_pixblock_head
-+.if \cache_preload_flag != 0
- cache_preload 0, pixblock_size
- cache_preload_simple
- .endif
-- process_pixblock_tail
-+ \process_pixblock_tail
- pixinterleave dst_w_bpp, dst_w_basereg
- .irp chunk_size, 16, 8, 4, 2, 1
--.if pixblock_size > chunk_size
-- tst W, #chunk_size
-+.if pixblock_size > \chunk_size
-+ tst W, #\chunk_size
- beq 51f
--.if dst_aligned_flag != 0
-- pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
-+.if \dst_aligned_flag != 0
-+ pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
- .else
-- pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
-+ pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
- .endif
- 51:
- .endif
- .endr
- 52:
- .endm
-
- /*
-@@ -655,17 +655,17 @@ 52:
- .if (src_bpp != 24) && (src_bpp != 0)
- sub SRC, SRC, W, lsl #src_bpp_shift
- .endif
- .if (mask_bpp != 24) && (mask_bpp != 0)
- sub MASK, MASK, W, lsl #mask_bpp_shift
- .endif
- subs H, H, #1
- mov DST_R, DST_W
-- bge start_of_loop_label
-+ bge \start_of_loop_label
- .endm
-
- /*
- * Registers are allocated in the following way by default:
- * v0, v1, v2, v3 - reserved for loading source pixel data
- * v4, v5, v6, v7 - reserved for loading destination pixel data
- * v24, v25, v26, v27 - reserved for loading mask pixel data
- * v28, v29, v30, v31 - final destination pixel data for writeback to memory
-@@ -682,17 +682,17 @@ 52:
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head, \
- dst_w_basereg_ = 28, \
- dst_r_basereg_ = 4, \
- src_basereg_ = 0, \
- mask_basereg_ = 24
-
-- pixman_asm_function fname
-+ pixman_asm_function \fname
- stp x29, x30, [sp, -16]!
- mov x29, sp
- sub sp, sp, 232 /* push all registers */
- sub x29, x29, 64
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
- stp x8, x9, [x29, -80]
- stp x10, x11, [x29, -96]
-@@ -707,38 +707,38 @@ 52:
- str x28, [x29, -232]
-
- /*
- * Select prefetch type for this function. If prefetch distance is
- * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
- * has to be used instead of ADVANCED.
- */
- .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
--.if prefetch_distance == 0
-+.if \prefetch_distance == 0
- .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
- .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
-- ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
-+ ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
- .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
- .endif
-
- /*
- * Make some macro arguments globally visible and accessible
- * from other macros
- */
-- .set src_bpp, src_bpp_
-- .set mask_bpp, mask_bpp_
-- .set dst_w_bpp, dst_w_bpp_
-- .set pixblock_size, pixblock_size_
-- .set dst_w_basereg, dst_w_basereg_
-- .set dst_r_basereg, dst_r_basereg_
-- .set src_basereg, src_basereg_
-- .set mask_basereg, mask_basereg_
-+ .set src_bpp, \src_bpp_
-+ .set mask_bpp, \mask_bpp_
-+ .set dst_w_bpp, \dst_w_bpp_
-+ .set pixblock_size, \pixblock_size_
-+ .set dst_w_basereg, \dst_w_basereg_
-+ .set dst_r_basereg, \dst_r_basereg_
-+ .set src_basereg, \src_basereg_
-+ .set mask_basereg, \mask_basereg_
-
- .macro pixld_src x:vararg
-- pixld x
-+ pixld \x
- .endm
- .macro fetch_src_pixblock
- pixld_src pixblock_size, src_bpp, \
- (src_basereg - pixblock_size * src_bpp / 64), SRC
- .endm
- /*
- * Assign symbolic names to registers
- */
-@@ -805,32 +805,32 @@ 52:
- .elseif dst_w_bpp == 16
- .set dst_bpp_shift, 1
- .elseif dst_w_bpp == 8
- .set dst_bpp_shift, 0
- .else
- .error "requested dst bpp (dst_w_bpp) is not supported"
- .endif
-
--.if (((flags) & FLAG_DST_READWRITE) != 0)
-+.if (((\flags) & FLAG_DST_READWRITE) != 0)
- .set dst_r_bpp, dst_w_bpp
- .else
- .set dst_r_bpp, 0
- .endif
--.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
-+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
- .set DEINTERLEAVE_32BPP_ENABLED, 1
- .else
- .set DEINTERLEAVE_32BPP_ENABLED, 0
- .endif
-
--.if prefetch_distance < 0 || prefetch_distance > 15
-- .error "invalid prefetch distance (prefetch_distance)"
-+.if \prefetch_distance < 0 || \prefetch_distance > 15
-+ .error "invalid prefetch distance (\prefetch_distance)"
- .endif
-
-- PF mov PF_X, #0
-+ PF mov, PF_X, #0
- mov DST_R, DST_W
-
- .if src_bpp == 24
- sub SRC_STRIDE, SRC_STRIDE, W
- sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
- .endif
- .if mask_bpp == 24
- sub MASK_STRIDE, MASK_STRIDE, W
-@@ -839,71 +839,71 @@ 52:
- .if dst_w_bpp == 24
- sub DST_STRIDE, DST_STRIDE, W
- sub DST_STRIDE, DST_STRIDE, W, lsl #1
- .endif
-
- /*
- * Setup advanced prefetcher initial state
- */
-- PF mov PF_SRC, SRC
-- PF mov PF_DST, DST_R
-- PF mov PF_MASK, MASK
-- /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
-- PF lsl DUMMY, H, #4
-- PF mov PF_CTL, DUMMY
-- PF add PF_CTL, PF_CTL, #(prefetch_distance - 0x10)
-+ PF mov, PF_SRC, SRC
-+ PF mov, PF_DST, DST_R
-+ PF mov, PF_MASK, MASK
-+ /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
-+ PF lsl, DUMMY, H, #4
-+ PF mov, PF_CTL, DUMMY
-+ PF add, PF_CTL, PF_CTL, #(\prefetch_distance - 0x10)
-
-- init
-+ \init
- subs H, H, #1
- mov ORIG_W, W
- blt 9f
- cmp W, #(pixblock_size * 2)
- blt 800f
- /*
- * This is the start of the pipelined loop, which if optimized for
- * long scanlines
- */
- 0:
-- ensure_destination_ptr_alignment process_pixblock_head, \
-- process_pixblock_tail, \
-- process_pixblock_tail_head
-+ ensure_destination_ptr_alignment \process_pixblock_head, \
-+ \process_pixblock_tail, \
-+ \process_pixblock_tail_head
-
- /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
- pixld_a pixblock_size, dst_r_bpp, \
- (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
- fetch_src_pixblock
- pixld pixblock_size, mask_bpp, \
- (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-- PF add PF_X, PF_X, #pixblock_size
-- process_pixblock_head
-+ PF add, PF_X, PF_X, #pixblock_size
-+ \process_pixblock_head
- cache_preload 0, pixblock_size
- cache_preload_simple
- subs W, W, #(pixblock_size * 2)
- blt 200f
-
- 100:
-- process_pixblock_tail_head
-+ \process_pixblock_tail_head
- cache_preload_simple
- subs W, W, #pixblock_size
- bge 100b
-
- 200:
-- process_pixblock_tail
-+ \process_pixblock_tail
- pixst_a pixblock_size, dst_w_bpp, \
- (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
-
- /* Process the remaining trailing pixels in the scanline */
- process_trailing_pixels 1, 1, \
-- process_pixblock_head, \
-- process_pixblock_tail, \
-- process_pixblock_tail_head
-+ \process_pixblock_head, \
-+ \process_pixblock_tail, \
-+ \process_pixblock_tail_head
- advance_to_next_scanline 0b
-
-- cleanup
-+ \cleanup
- 1000:
- /* pop all registers */
- sub x29, x29, 64
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
- ldp x8, x9, [x29, -80]
- ldp x10, x11, [x29, -96]
- ldp x12, x13, [x29, -112]
-@@ -920,48 +920,48 @@ 1000:
- ret /* exit */
- /*
- * This is the start of the loop, designed to process images with small width
- * (less than pixblock_size * 2 pixels). In this case neither pipelining
- * nor prefetch are used.
- */
- 800:
- .if src_bpp_shift >= 0
-- PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
-- PF prfm PREFETCH_MODE, [SRC, DUMMY]
-+ PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
-+ PF prfm, PREFETCH_MODE, [SRC, DUMMY]
- .endif
- .if dst_r_bpp != 0
-- PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
-- PF prfm PREFETCH_MODE, [DST_R, DUMMY]
-+ PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
-+ PF prfm, PREFETCH_MODE, [DST_R, DUMMY]
- .endif
- .if mask_bpp_shift >= 0
-- PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
-- PF prfm PREFETCH_MODE, [MASK, DUMMY]
-+ PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
-+ PF prfm, PREFETCH_MODE, [MASK, DUMMY]
- .endif
- /* Process exactly pixblock_size pixels if needed */
- tst W, #pixblock_size
- beq 100f
- pixld pixblock_size, dst_r_bpp, \
- (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
- fetch_src_pixblock
- pixld pixblock_size, mask_bpp, \
- (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-- process_pixblock_head
-- process_pixblock_tail
-+ \process_pixblock_head
-+ \process_pixblock_tail
- pixst pixblock_size, dst_w_bpp, \
- (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
- 100:
- /* Process the remaining trailing pixels in the scanline */
- process_trailing_pixels 0, 0, \
-- process_pixblock_head, \
-- process_pixblock_tail, \
-- process_pixblock_tail_head
-+ \process_pixblock_head, \
-+ \process_pixblock_tail, \
-+ \process_pixblock_tail_head
- advance_to_next_scanline 800b
- 9:
-- cleanup
-+ \cleanup
- /* pop all registers */
- sub x29, x29, 64
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
- ldp x8, x9, [x29, -80]
- ldp x10, x11, [x29, -96]
- ldp x12, x13, [x29, -112]
- ldp x14, x15, [x29, -128]
-@@ -990,17 +990,17 @@ 9:
- .unreq DST_STRIDE
- .unreq MASK_STRIDE
- .unreq PF_CTL
- .unreq PF_X
- .unreq PF_SRC
- .unreq PF_DST
- .unreq PF_MASK
- .unreq DUMMY
-- .endfunc
-+ pixman_end_asm_function
- .endm
-
- /*
- * A simplified variant of function generation template for a single
- * scanline processing (for implementing pixman combine functions)
- */
- .macro generate_composite_function_scanline use_nearest_scaling, \
- fname, \
-@@ -1014,50 +1014,50 @@ 9:
- process_pixblock_head, \
- process_pixblock_tail, \
- process_pixblock_tail_head, \
- dst_w_basereg_ = 28, \
- dst_r_basereg_ = 4, \
- src_basereg_ = 0, \
- mask_basereg_ = 24
-
-- pixman_asm_function fname
-+ pixman_asm_function \fname
- .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
-
- /*
- * Make some macro arguments globally visible and accessible
- * from other macros
- */
-- .set src_bpp, src_bpp_
-- .set mask_bpp, mask_bpp_
-- .set dst_w_bpp, dst_w_bpp_
-- .set pixblock_size, pixblock_size_
-- .set dst_w_basereg, dst_w_basereg_
-- .set dst_r_basereg, dst_r_basereg_
-- .set src_basereg, src_basereg_
-- .set mask_basereg, mask_basereg_
-+ .set src_bpp, \src_bpp_
-+ .set mask_bpp, \mask_bpp_
-+ .set dst_w_bpp, \dst_w_bpp_
-+ .set pixblock_size, \pixblock_size_
-+ .set dst_w_basereg, \dst_w_basereg_
-+ .set dst_r_basereg, \dst_r_basereg_
-+ .set src_basereg, \src_basereg_
-+ .set mask_basereg, \mask_basereg_
-
--.if use_nearest_scaling != 0
-+.if \use_nearest_scaling != 0
- /*
- * Assign symbolic names to registers for nearest scaling
- */
- W .req x0
- DST_W .req x1
- SRC .req x2
- VX .req x3
- UNIT_X .req x4
- SRC_WIDTH_FIXED .req x5
- MASK .req x6
- TMP1 .req x8
- TMP2 .req x9
- DST_R .req x10
- DUMMY .req x30
-
- .macro pixld_src x:vararg
-- pixld_s x
-+ pixld_s \x
- .endm
-
- sxtw x0, w0
- sxtw x3, w3
- sxtw x4, w4
- sxtw x5, w5
-
- stp x29, x30, [sp, -16]!
-@@ -1075,84 +1075,84 @@ 9:
- W .req x0 /* width (is updated during processing) */
- DST_W .req x1 /* destination buffer pointer for writes */
- SRC .req x2 /* source buffer pointer */
- MASK .req x3 /* mask pointer */
- DST_R .req x4 /* destination buffer pointer for reads */
- DUMMY .req x30
-
- .macro pixld_src x:vararg
-- pixld x
-+ pixld \x
- .endm
-
- sxtw x0, w0
-
- stp x29, x30, [sp, -16]!
- mov x29, sp
- sub sp, sp, 64
- sub x29, x29, 64
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
- .endif
-
--.if (((flags) & FLAG_DST_READWRITE) != 0)
-+.if (((\flags) & FLAG_DST_READWRITE) != 0)
- .set dst_r_bpp, dst_w_bpp
- .else
- .set dst_r_bpp, 0
- .endif
--.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
-+.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
- .set DEINTERLEAVE_32BPP_ENABLED, 1
- .else
- .set DEINTERLEAVE_32BPP_ENABLED, 0
- .endif
-
- .macro fetch_src_pixblock
- pixld_src pixblock_size, src_bpp, \
- (src_basereg - pixblock_size * src_bpp / 64), SRC
- .endm
-
-- init
-+ \init
- mov DST_R, DST_W
-
- cmp W, #pixblock_size
- blt 800f
-
-- ensure_destination_ptr_alignment process_pixblock_head, \
-- process_pixblock_tail, \
-- process_pixblock_tail_head
-+ ensure_destination_ptr_alignment \process_pixblock_head, \
-+ \process_pixblock_tail, \
-+ \process_pixblock_tail_head
-
- subs W, W, #pixblock_size
- blt 700f
-
- /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
- pixld_a pixblock_size, dst_r_bpp, \
- (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
- fetch_src_pixblock
- pixld pixblock_size, mask_bpp, \
- (mask_basereg - pixblock_size * mask_bpp / 64), MASK
-- process_pixblock_head
-+ \process_pixblock_head
- subs W, W, #pixblock_size
- blt 200f
- 100:
-- process_pixblock_tail_head
-+ \process_pixblock_tail_head
- subs W, W, #pixblock_size
- bge 100b
- 200:
-- process_pixblock_tail
-+ \process_pixblock_tail
- pixst_a pixblock_size, dst_w_bpp, \
- (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
- 700:
- /* Process the remaining trailing pixels in the scanline (dst aligned) */
- process_trailing_pixels 0, 1, \
-- process_pixblock_head, \
-- process_pixblock_tail, \
-- process_pixblock_tail_head
-+ \process_pixblock_head, \
-+ \process_pixblock_tail, \
-+ \process_pixblock_tail_head
-
-- cleanup
--.if use_nearest_scaling != 0
-+ \cleanup
-+.if \use_nearest_scaling != 0
- sub x29, x29, 64
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
- ldp x8, x9, [x29, -80]
- ldr x10, [x29, -96]
- mov sp, x29
- ldp x29, x30, [sp], 16
- ret /* exit */
-@@ -1162,22 +1162,22 @@ 700:
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
- mov sp, x29
- ldp x29, x30, [sp], 16
- ret /* exit */
- .endif
- 800:
- /* Process the remaining trailing pixels in the scanline (dst unaligned) */
- process_trailing_pixels 0, 0, \
-- process_pixblock_head, \
-- process_pixblock_tail, \
-- process_pixblock_tail_head
-+ \process_pixblock_head, \
-+ \process_pixblock_tail, \
-+ \process_pixblock_tail_head
-
-- cleanup
--.if use_nearest_scaling != 0
-+ \cleanup
-+.if \use_nearest_scaling != 0
- sub x29, x29, 64
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
- ldp x8, x9, [x29, -80]
- ldr x10, [x29, -88]
- mov sp, x29
- ldp x29, x30, [sp], 16
- ret /* exit */
-@@ -1208,25 +1208,25 @@ 800:
- .unreq DST_R
- .unreq DST_W
- .unreq W
- .endif
-
- .purgem fetch_src_pixblock
- .purgem pixld_src
-
-- .endfunc
-+ pixman_end_asm_function
- .endm
-
- .macro generate_composite_function_single_scanline x:vararg
-- generate_composite_function_scanline 0, x
-+ generate_composite_function_scanline 0, \x
- .endm
-
- .macro generate_composite_function_nearest_scanline x:vararg
-- generate_composite_function_scanline 1, x
-+ generate_composite_function_scanline 1, \x
- .endm
-
- /* Default prologue/epilogue, nothing special needs to be done */
-
- .macro default_init
- .endm
-
- .macro default_cleanup
-@@ -1250,61 +1250,61 @@ 800:
- * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in)
- * into a planar a8r8g8b8 format (with a, r, g, b color components
- * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
- *
- * Warning: the conversion is destructive and the original
- * value (in) is lost.
- */
- .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
-- shrn &out_r&.8b, &in&.8h, #8
-- shrn &out_g&.8b, &in&.8h, #3
-- sli &in&.8h, &in&.8h, #5
-- movi &out_a&.8b, #255
-- sri &out_r&.8b, &out_r&.8b, #5
-- sri &out_g&.8b, &out_g&.8b, #6
-- shrn &out_b&.8b, &in&.8h, #2
-+ shrn \()\out_r\().8b, \()\in\().8h, #8
-+ shrn \()\out_g\().8b, \()\in\().8h, #3
-+ sli \()\in\().8h, \()\in\().8h, #5
-+ movi \()\out_a\().8b, #255
-+ sri \()\out_r\().8b, \()\out_r\().8b, #5
-+ sri \()\out_g\().8b, \()\out_g\().8b, #6
-+ shrn \()\out_b\().8b, \()\in\().8h, #2
- .endm
-
- .macro convert_0565_to_x888 in, out_r, out_g, out_b
-- shrn &out_r&.8b, &in&.8h, #8
-- shrn &out_g&.8b, &in&.8h, #3
-- sli &in&.8h, &in&.8h, #5
-- sri &out_r&.8b, &out_r&.8b, #5
-- sri &out_g&.8b, &out_g&.8b, #6
-- shrn &out_b&.8b, &in&.8h, #2
-+ shrn \()\out_r\().8b, \()\in\().8h, #8
-+ shrn \()\out_g\().8b, \()\in\().8h, #3
-+ sli \()\in\().8h, \()\in\().8h, #5
-+ sri \()\out_r\().8b, \()\out_r\().8b, #5
-+ sri \()\out_g\().8b, \()\out_g\().8b, #6
-+ shrn \()\out_b\().8b, \()\in\().8h, #2
- .endm
-
- /*
- * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
- * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6
- * pixels packed in 128-bit register (out). Requires two temporary 128-bit
- * registers (tmp1, tmp2)
- */
- .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
-- ushll &tmp1&.8h, &in_g&.8b, #7
-- shl &tmp1&.8h, &tmp1&.8h, #1
-- ushll &out&.8h, &in_r&.8b, #7
-- shl &out&.8h, &out&.8h, #1
-- ushll &tmp2&.8h, &in_b&.8b, #7
-- shl &tmp2&.8h, &tmp2&.8h, #1
-- sri &out&.8h, &tmp1&.8h, #5
-- sri &out&.8h, &tmp2&.8h, #11
-+ ushll \()\tmp1\().8h, \()\in_g\().8b, #7
-+ shl \()\tmp1\().8h, \()\tmp1\().8h, #1
-+ ushll \()\out\().8h, \()\in_r\().8b, #7
-+ shl \()\out\().8h, \()\out\().8h, #1
-+ ushll \()\tmp2\().8h, \()\in_b\().8b, #7
-+ shl \()\tmp2\().8h, \()\tmp2\().8h, #1
-+ sri \()\out\().8h, \()\tmp1\().8h, #5
-+ sri \()\out\().8h, \()\tmp2\().8h, #11
- .endm
-
- /*
- * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
- * returned in (out0, out1) registers pair. Requires one temporary
- * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
- * value from 'in' is lost
- */
- .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
-- shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */
-- shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */
-- sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */
-- sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */
-- sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */
-- ushr &out1&.4h, &in&.4h, #8 /* R is in place */
-- sri &out0&.4h, &tmp&.4h, #8 /* G & B is in place */
-- zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */
-- zip2 &out1&.4h, &out0&.4h, &out1&.4h
-- mov &out0&.d[0], &tmp&.d[0]
-+ shl \()\out0\().4h, \()\in\().4h, #5 /* G top 6 bits */
-+ shl \()\tmp\().4h, \()\in\().4h, #11 /* B top 5 bits */
-+ sri \()\in\().4h, \()\in\().4h, #5 /* R is ready in top bits */
-+ sri \()\out0\().4h, \()\out0\().4h, #6 /* G is ready in top bits */
-+ sri \()\tmp\().4h, \()\tmp\().4h, #5 /* B is ready in top bits */
-+ ushr \()\out1\().4h, \()\in\().4h, #8 /* R is in place */
-+ sri \()\out0\().4h, \()\tmp\().4h, #8 /* G & B is in place */
-+ zip1 \()\tmp\().4h, \()\out0\().4h, \()\out1\().4h /* everything is in place */
-+ zip2 \()\out1\().4h, \()\out0\().4h, \()\out1\().4h
-+ mov \()\out0\().d[0], \()\tmp\().d[0]
- .endm