https://gitlab.freedesktop.org/pixman/pixman/-/issues/74 diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S --- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S @@ -77,206 +77,206 @@ * format conversion, and interpolation as separate macros which can be used * as the basic building blocks for constructing bilinear scanline functions. */ .macro bilinear_load_8888 reg1, reg2, tmp mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #2 - vld1.32 {reg1}, [TMP1], STRIDE - vld1.32 {reg2}, [TMP1] + vld1.32 {\reg1}, [TMP1], STRIDE + vld1.32 {\reg2}, [TMP1] .endm .macro bilinear_load_0565 reg1, reg2, tmp mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #1 - vld1.32 {reg2[0]}, [TMP1], STRIDE - vld1.32 {reg2[1]}, [TMP1] - convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp + vld1.32 {\reg2[0]}, [TMP1], STRIDE + vld1.32 {\reg2[1]}, [TMP1] + convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp .endm .macro bilinear_load_and_vertical_interpolate_two_8888 \ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 - bilinear_load_8888 reg1, reg2, tmp1 - vmull.u8 acc1, reg1, d28 - vmlal.u8 acc1, reg2, d29 - bilinear_load_8888 reg3, reg4, tmp2 - vmull.u8 acc2, reg3, d28 - vmlal.u8 acc2, reg4, d29 + bilinear_load_8888 \reg1, \reg2, \tmp1 + vmull.u8 \acc1, \reg1, d28 + vmlal.u8 \acc1, \reg2, d29 + bilinear_load_8888 \reg3, \reg4, \tmp2 + vmull.u8 \acc2, \reg3, d28 + vmlal.u8 \acc2, \reg4, d29 .endm .macro bilinear_load_and_vertical_interpolate_four_8888 \ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi bilinear_load_and_vertical_interpolate_two_8888 \ - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi + \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi bilinear_load_and_vertical_interpolate_two_8888 \ - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi .endm .macro bilinear_load_and_vertical_interpolate_two_0565 \ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #1 mov TMP2, X, asr #16 add X, X, UX add TMP2, TOP, TMP2, asl #1 - vld1.32 {acc2lo[0]}, [TMP1], STRIDE - vld1.32 {acc2hi[0]}, [TMP2], STRIDE - vld1.32 {acc2lo[1]}, [TMP1] - vld1.32 {acc2hi[1]}, [TMP2] - convert_0565_to_x888 acc2, reg3, reg2, reg1 - vzip.u8 reg1, reg3 - vzip.u8 reg2, reg4 - vzip.u8 reg3, reg4 - vzip.u8 reg1, reg2 - vmull.u8 acc1, reg1, d28 - vmlal.u8 acc1, reg2, d29 - vmull.u8 acc2, reg3, d28 - vmlal.u8 acc2, reg4, d29 + vld1.32 {\acc2lo[0]}, [TMP1], STRIDE + vld1.32 {\acc2hi[0]}, [TMP2], STRIDE + vld1.32 {\acc2lo[1]}, [TMP1] + vld1.32 {\acc2hi[1]}, [TMP2] + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 + vzip.u8 \reg1, \reg3 + vzip.u8 \reg2, \reg4 + vzip.u8 \reg3, \reg4 + vzip.u8 \reg1, \reg2 + vmull.u8 \acc1, \reg1, d28 + vmlal.u8 \acc1, \reg2, d29 + vmull.u8 \acc2, \reg3, d28 + vmlal.u8 \acc2, \reg4, d29 .endm .macro bilinear_load_and_vertical_interpolate_four_0565 \ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #1 mov TMP2, X, asr #16 add X, X, UX add TMP2, TOP, TMP2, asl #1 - vld1.32 {xacc2lo[0]}, [TMP1], STRIDE - vld1.32 {xacc2hi[0]}, [TMP2], STRIDE - vld1.32 {xacc2lo[1]}, [TMP1] - vld1.32 {xacc2hi[1]}, 
[TMP2] - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 + vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE + vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE + vld1.32 {\xacc2lo[1]}, [TMP1] + vld1.32 {\xacc2hi[1]}, [TMP2] + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #1 mov TMP2, X, asr #16 add X, X, UX add TMP2, TOP, TMP2, asl #1 - vld1.32 {yacc2lo[0]}, [TMP1], STRIDE - vzip.u8 xreg1, xreg3 - vld1.32 {yacc2hi[0]}, [TMP2], STRIDE - vzip.u8 xreg2, xreg4 - vld1.32 {yacc2lo[1]}, [TMP1] - vzip.u8 xreg3, xreg4 - vld1.32 {yacc2hi[1]}, [TMP2] - vzip.u8 xreg1, xreg2 - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 - vmull.u8 xacc1, xreg1, d28 - vzip.u8 yreg1, yreg3 - vmlal.u8 xacc1, xreg2, d29 - vzip.u8 yreg2, yreg4 - vmull.u8 xacc2, xreg3, d28 - vzip.u8 yreg3, yreg4 - vmlal.u8 xacc2, xreg4, d29 - vzip.u8 yreg1, yreg2 - vmull.u8 yacc1, yreg1, d28 - vmlal.u8 yacc1, yreg2, d29 - vmull.u8 yacc2, yreg3, d28 - vmlal.u8 yacc2, yreg4, d29 + vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE + vzip.u8 \xreg1, \xreg3 + vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE + vzip.u8 \xreg2, \xreg4 + vld1.32 {\yacc2lo[1]}, [TMP1] + vzip.u8 \xreg3, \xreg4 + vld1.32 {\yacc2hi[1]}, [TMP2] + vzip.u8 \xreg1, \xreg2 + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 + vmull.u8 \xacc1, \xreg1, d28 + vzip.u8 \yreg1, \yreg3 + vmlal.u8 \xacc1, \xreg2, d29 + vzip.u8 \yreg2, \yreg4 + vmull.u8 \xacc2, \xreg3, d28 + vzip.u8 \yreg3, \yreg4 + vmlal.u8 \xacc2, \xreg4, d29 + vzip.u8 \yreg1, \yreg2 + vmull.u8 \yacc1, \yreg1, d28 + vmlal.u8 \yacc1, \yreg2, d29 + vmull.u8 \yacc2, \yreg3, d28 + vmlal.u8 \yacc2, \yreg4, d29 .endm .macro bilinear_store_8888 numpix, tmp1, tmp2 -.if numpix == 4 +.if \numpix == 4 vst1.32 {d0, d1}, [OUT]! -.elseif numpix == 2 +.elseif \numpix == 2 vst1.32 {d0}, [OUT]! -.elseif numpix == 1 +.elseif \numpix == 1 vst1.32 {d0[0]}, [OUT, :32]! .else .error bilinear_store_8888 numpix is unsupported .endif .endm .macro bilinear_store_0565 numpix, tmp1, tmp2 vuzp.u8 d0, d1 vuzp.u8 d2, d3 vuzp.u8 d1, d3 vuzp.u8 d0, d2 - convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 -.if numpix == 4 + convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2 +.if \numpix == 4 vst1.16 {d2}, [OUT]! -.elseif numpix == 2 +.elseif \numpix == 2 vst1.32 {d2[0]}, [OUT]! -.elseif numpix == 1 +.elseif \numpix == 1 vst1.16 {d2[0]}, [OUT]! .else .error bilinear_store_0565 numpix is unsupported .endif .endm /* * Macros for loading mask pixels into register 'mask'. * vdup must be done in somewhere else. */ .macro bilinear_load_mask_x numpix, mask .endm .macro bilinear_load_mask_8 numpix, mask -.if numpix == 4 - vld1.32 {mask[0]}, [MASK]! -.elseif numpix == 2 - vld1.16 {mask[0]}, [MASK]! -.elseif numpix == 1 - vld1.8 {mask[0]}, [MASK]! +.if \numpix == 4 + vld1.32 {\mask[0]}, [MASK]! +.elseif \numpix == 2 + vld1.16 {\mask[0]}, [MASK]! +.elseif \numpix == 1 + vld1.8 {\mask[0]}, [MASK]! .else - .error bilinear_load_mask_8 numpix is unsupported + .error bilinear_load_mask_8 \numpix is unsupported .endif pld [MASK, #prefetch_offset] .endm .macro bilinear_load_mask mask_fmt, numpix, mask - bilinear_load_mask_&mask_fmt numpix, mask + bilinear_load_mask_\()\mask_fmt \numpix, \mask .endm /* * Macros for loading destination pixels into register 'dst0' and 'dst1'. * Interleave should be done somewhere else. 
*/ .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01 .endm .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01 .endm .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 -.if numpix == 4 - vld1.32 {dst0, dst1}, [OUT] -.elseif numpix == 2 - vld1.32 {dst0}, [OUT] -.elseif numpix == 1 - vld1.32 {dst0[0]}, [OUT] +.if \numpix == 4 + vld1.32 {\dst0, \dst1}, [OUT] +.elseif \numpix == 2 + vld1.32 {\dst0}, [OUT] +.elseif \numpix == 1 + vld1.32 {\dst0[0]}, [OUT] .else - .error bilinear_load_dst_8888 numpix is unsupported + .error bilinear_load_dst_8888 \numpix is unsupported .endif pld [OUT, #(prefetch_offset * 4)] .endm .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01 - bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01 + bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 .endm /* * Macros for duplicating partially loaded mask to fill entire register. * We will apply mask to interleaved source pixels, that is * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3) * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3) * So, we need to duplicate loaded mask into whole register. @@ -285,79 +285,79 @@ * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) * We can do some optimizations for this including last pixel cases. */ .macro bilinear_duplicate_mask_x numpix, mask .endm .macro bilinear_duplicate_mask_8 numpix, mask -.if numpix == 4 - vdup.32 mask, mask[0] -.elseif numpix == 2 - vdup.16 mask, mask[0] -.elseif numpix == 1 - vdup.8 mask, mask[0] +.if \numpix == 4 + vdup.32 \mask, \mask[0] +.elseif \numpix == 2 + vdup.16 \mask, \mask[0] +.elseif \numpix == 1 + vdup.8 \mask, \mask[0] .else .error bilinear_duplicate_mask_8 is unsupported .endif .endm .macro bilinear_duplicate_mask mask_fmt, numpix, mask - bilinear_duplicate_mask_&mask_fmt numpix, mask + bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask .endm /* * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form. * Interleave should be done when maks is enabled or operator is 'over'. 
*/ .macro bilinear_interleave src0, src1, dst0, dst1 - vuzp.8 src0, src1 - vuzp.8 dst0, dst1 - vuzp.8 src0, src1 - vuzp.8 dst0, dst1 + vuzp.8 \src0, \src1 + vuzp.8 \dst0, \dst1 + vuzp.8 \src0, \src1 + vuzp.8 \dst0, \dst1 .endm .macro bilinear_interleave_src_dst_x_src \ numpix, src0, src1, src01, dst0, dst1, dst01 .endm .macro bilinear_interleave_src_dst_x_over \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, dst0, dst1 + bilinear_interleave \src0, \src1, \dst0, \dst1 .endm .macro bilinear_interleave_src_dst_x_add \ numpix, src0, src1, src01, dst0, dst1, dst01 .endm .macro bilinear_interleave_src_dst_8_src \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, dst0, dst1 + bilinear_interleave \src0, \src1, \dst0, \dst1 .endm .macro bilinear_interleave_src_dst_8_over \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, dst0, dst1 + bilinear_interleave \src0, \src1, \dst0, \dst1 .endm .macro bilinear_interleave_src_dst_8_add \ numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave src0, src1, dst0, dst1 + bilinear_interleave \src0, \src1, \dst0, \dst1 .endm .macro bilinear_interleave_src_dst \ mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01 - bilinear_interleave_src_dst_&mask_fmt&_&op \ - numpix, src0, src1, src01, dst0, dst1, dst01 + bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \ + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01 .endm /* * Macros for applying masks to src pixels. (see combine_mask_u() function) * src, dst should be in interleaved form. * mask register should be in form (m0, m1, m2, m3). */ @@ -365,217 +365,217 @@ numpix, src0, src1, src01, mask, \ tmp01, tmp23, tmp45, tmp67 .endm .macro bilinear_apply_mask_to_src_8 \ numpix, src0, src1, src01, mask, \ tmp01, tmp23, tmp45, tmp67 - vmull.u8 tmp01, src0, mask - vmull.u8 tmp23, src1, mask + vmull.u8 \tmp01, \src0, \mask + vmull.u8 \tmp23, \src1, \mask /* bubbles */ - vrshr.u16 tmp45, tmp01, #8 - vrshr.u16 tmp67, tmp23, #8 + vrshr.u16 \tmp45, \tmp01, #8 + vrshr.u16 \tmp67, \tmp23, #8 /* bubbles */ - vraddhn.u16 src0, tmp45, tmp01 - vraddhn.u16 src1, tmp67, tmp23 + vraddhn.u16 \src0, \tmp45, \tmp01 + vraddhn.u16 \src1, \tmp67, \tmp23 .endm .macro bilinear_apply_mask_to_src \ mask_fmt, numpix, src0, src1, src01, mask, \ tmp01, tmp23, tmp45, tmp67 - bilinear_apply_mask_to_src_&mask_fmt \ - numpix, src0, src1, src01, mask, \ - tmp01, tmp23, tmp45, tmp67 + bilinear_apply_mask_to_src_\()\mask_fmt \ + \numpix, \src0, \src1, \src01, \mask, \ + \tmp01, \tmp23, \tmp45, \tmp67 .endm /* * Macros for combining src and destination pixels. * Interleave or not is depending on operator 'op'. 
*/ .macro bilinear_combine_src \ numpix, src0, src1, src01, dst0, dst1, dst01, \ tmp01, tmp23, tmp45, tmp67, tmp8 .endm .macro bilinear_combine_over \ numpix, src0, src1, src01, dst0, dst1, dst01, \ tmp01, tmp23, tmp45, tmp67, tmp8 - vdup.32 tmp8, src1[1] + vdup.32 \tmp8, \src1[1] /* bubbles */ - vmvn.8 tmp8, tmp8 + vmvn.8 \tmp8, \tmp8 /* bubbles */ - vmull.u8 tmp01, dst0, tmp8 + vmull.u8 \tmp01, \dst0, \tmp8 /* bubbles */ - vmull.u8 tmp23, dst1, tmp8 + vmull.u8 \tmp23, \dst1, \tmp8 /* bubbles */ - vrshr.u16 tmp45, tmp01, #8 - vrshr.u16 tmp67, tmp23, #8 + vrshr.u16 \tmp45, \tmp01, #8 + vrshr.u16 \tmp67, \tmp23, #8 /* bubbles */ - vraddhn.u16 dst0, tmp45, tmp01 - vraddhn.u16 dst1, tmp67, tmp23 + vraddhn.u16 \dst0, \tmp45, \tmp01 + vraddhn.u16 \dst1, \tmp67, \tmp23 /* bubbles */ - vqadd.u8 src01, dst01, src01 + vqadd.u8 \src01, \dst01, \src01 .endm .macro bilinear_combine_add \ numpix, src0, src1, src01, dst0, dst1, dst01, \ tmp01, tmp23, tmp45, tmp67, tmp8 - vqadd.u8 src01, dst01, src01 + vqadd.u8 \src01, \dst01, \src01 .endm .macro bilinear_combine \ op, numpix, src0, src1, src01, dst0, dst1, dst01, \ tmp01, tmp23, tmp45, tmp67, tmp8 - bilinear_combine_&op \ - numpix, src0, src1, src01, dst0, dst1, dst01, \ - tmp01, tmp23, tmp45, tmp67, tmp8 + bilinear_combine_\()\op \ + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \ + \tmp01, \tmp23, \tmp45, \tmp67, \tmp8 .endm /* * Macros for final deinterleaving of destination pixels if needed. */ .macro bilinear_deinterleave numpix, dst0, dst1, dst01 - vuzp.8 dst0, dst1 + vuzp.8 \dst0, \dst1 /* bubbles */ - vuzp.8 dst0, dst1 + vuzp.8 \dst0, \dst1 .endm .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 .endm .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01 - bilinear_deinterleave numpix, dst0, dst1, dst01 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 .endm .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 - bilinear_deinterleave numpix, dst0, dst1, dst01 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01 - bilinear_deinterleave numpix, dst0, dst1, dst01 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01 - bilinear_deinterleave numpix, dst0, dst1, dst01 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01 - bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01 + bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01 .endm .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op - bilinear_load_&src_fmt d0, d1, d2 - bilinear_load_mask mask_fmt, 1, d4 - bilinear_load_dst dst_fmt, op, 1, d18, d19, q9 + bilinear_load_\()\src_fmt d0, d1, d2 + bilinear_load_mask \mask_fmt, 1, d4 + bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9 vmull.u8 q1, d0, d28 vmlal.u8 q1, d1, d29 /* 5 cycles bubble */ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 /* 5 cycles bubble */ - bilinear_duplicate_mask mask_fmt, 1, d4 + bilinear_duplicate_mask \mask_fmt, 1, d4 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) /* 3 cycles bubble */ vmovn.u16 d0, q0 /* 1 cycle bubble */ bilinear_interleave_src_dst \ - mask_fmt, op, 1, d0, d1, q0, d18, d19, q9 + \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9 bilinear_apply_mask_to_src \ - mask_fmt, 
1, d0, d1, q0, d4, \ + \mask_fmt, 1, d0, d1, q0, d4, \ q3, q8, q10, q11 bilinear_combine \ - op, 1, d0, d1, q0, d18, d19, q9, \ + \op, 1, d0, d1, q0, d18, d19, q9, \ q3, q8, q10, q11, d5 - bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0 - bilinear_store_&dst_fmt 1, q2, q3 + bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0 + bilinear_store_\()\dst_fmt 1, q2, q3 .endm .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op - bilinear_load_and_vertical_interpolate_two_&src_fmt \ + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ q1, q11, d0, d1, d20, d21, d22, d23 - bilinear_load_mask mask_fmt, 2, d4 - bilinear_load_dst dst_fmt, op, 2, d18, d19, q9 + bilinear_load_mask \mask_fmt, 2, d4 + bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q10, d22, d31 vmlal.u16 q10, d23, d31 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) - bilinear_duplicate_mask mask_fmt, 2, d4 + bilinear_duplicate_mask \mask_fmt, 2, d4 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 vmovn.u16 d0, q0 bilinear_interleave_src_dst \ - mask_fmt, op, 2, d0, d1, q0, d18, d19, q9 + \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9 bilinear_apply_mask_to_src \ - mask_fmt, 2, d0, d1, q0, d4, \ + \mask_fmt, 2, d0, d1, q0, d4, \ q3, q8, q10, q11 bilinear_combine \ - op, 2, d0, d1, q0, d18, d19, q9, \ + \op, 2, d0, d1, q0, d18, d19, q9, \ q3, q8, q10, q11, d5 - bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0 - bilinear_store_&dst_fmt 2, q2, q3 + bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0 + bilinear_store_\()\dst_fmt 2, q2, q3 .endm .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op - bilinear_load_and_vertical_interpolate_four_&src_fmt \ + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ q1, q11, d0, d1, d20, d21, d22, d23 \ q3, q9, d4, d5, d16, d17, d18, d19 pld [TMP1, PF_OFFS] sub TMP1, TMP1, STRIDE vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q10, d22, d31 vmlal.u16 q10, d23, d31 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q2, d6, d30 vmlal.u16 q2, d7, d30 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS - bilinear_load_mask mask_fmt, 4, d22 - bilinear_load_dst dst_fmt, op, 4, d2, d3, q1 + bilinear_load_mask \mask_fmt, 4, d22 + bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1 pld [TMP1, PF_OFFS] vmlsl.u16 q8, d18, d31 vmlal.u16 q8, d19, d31 vadd.u16 q12, q12, q13 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) - bilinear_duplicate_mask mask_fmt, 4, d22 + bilinear_duplicate_mask \mask_fmt, 4, d22 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vmovn.u16 d0, q0 vmovn.u16 d1, q2 vadd.u16 q12, q12, q13 bilinear_interleave_src_dst \ - mask_fmt, op, 4, d0, d1, q0, d2, d3, q1 + \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1 bilinear_apply_mask_to_src \ - mask_fmt, 4, d0, d1, q0, d22, \ + \mask_fmt, 4, d0, d1, q0, d22, \ q3, q8, q9, q10 bilinear_combine \ - op, 4, d0, d1, q0, d2, d3, q1, \ + \op, 4, d0, d1, q0, d2, d3, q1, \ q3, q8, q9, q10, d23 - bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0 - 
bilinear_store_&dst_fmt 4, q2, q3 + bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0 + bilinear_store_\()\dst_fmt 4, q2, q3 .endm .set BILINEAR_FLAG_USE_MASK, 1 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 /* * Main template macro for generating NEON optimized bilinear scanline functions. * @@ -605,24 +605,24 @@ bilinear_process_four_pixels, \ bilinear_process_pixblock_head, \ bilinear_process_pixblock_tail, \ bilinear_process_pixblock_tail_head, \ pixblock_size, \ prefetch_distance, \ flags -pixman_asm_function fname -.if pixblock_size == 8 -.elseif pixblock_size == 4 +pixman_asm_function \fname +.if \pixblock_size == 8 +.elseif \pixblock_size == 4 .else .error unsupported pixblock size .endif -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0 OUT .req r0 TOP .req r1 BOTTOM .req r2 WT .req r3 WB .req r4 X .req r5 UX .req r6 WIDTH .req ip @@ -630,17 +630,17 @@ pixman_asm_function fname TMP2 .req r4 PF_OFFS .req r7 TMP3 .req r8 TMP4 .req r9 STRIDE .req r2 mov ip, sp push {r4, r5, r6, r7, r8, r9} - mov PF_OFFS, #prefetch_distance + mov PF_OFFS, #\prefetch_distance ldmia ip, {WB, X, UX, WIDTH} .else OUT .req r0 MASK .req r1 TOP .req r2 BOTTOM .req r3 WT .req r4 WB .req r5 @@ -649,27 +649,27 @@ pixman_asm_function fname WIDTH .req ip TMP1 .req r4 TMP2 .req r5 PF_OFFS .req r8 TMP3 .req r9 TMP4 .req r10 STRIDE .req r3 - .set prefetch_offset, prefetch_distance + .set prefetch_offset, \prefetch_distance mov ip, sp push {r4, r5, r6, r7, r8, r9, r10, ip} - mov PF_OFFS, #prefetch_distance + mov PF_OFFS, #\prefetch_distance ldmia ip, {WT, WB, X, UX, WIDTH} .endif mul PF_OFFS, PF_OFFS, UX -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 vpush {d8-d15} .endif sub STRIDE, BOTTOM, TOP .unreq BOTTOM cmp WIDTH, #0 ble 3f @@ -678,76 +678,76 @@ pixman_asm_function fname vdup.u16 q13, UX vdup.u8 d28, WT vdup.u8 d29, WB vadd.u16 d25, d25, d26 /* ensure good destination alignment */ cmp WIDTH, #1 blt 0f - tst OUT, #(1 << dst_bpp_shift) + tst OUT, #(1 << \dst_bpp_shift) beq 0f vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 - bilinear_process_last_pixel + \bilinear_process_last_pixel sub WIDTH, WIDTH, #1 0: vadd.u16 q13, q13, q13 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 cmp WIDTH, #2 blt 0f - tst OUT, #(1 << (dst_bpp_shift + 1)) + tst OUT, #(1 << (\dst_bpp_shift + 1)) beq 0f - bilinear_process_two_pixels + \bilinear_process_two_pixels sub WIDTH, WIDTH, #2 0: -.if pixblock_size == 8 +.if \pixblock_size == 8 cmp WIDTH, #4 blt 0f - tst OUT, #(1 << (dst_bpp_shift + 2)) + tst OUT, #(1 << (\dst_bpp_shift + 2)) beq 0f - bilinear_process_four_pixels + \bilinear_process_four_pixels sub WIDTH, WIDTH, #4 0: .endif - subs WIDTH, WIDTH, #pixblock_size + subs WIDTH, WIDTH, #\pixblock_size blt 1f - mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) - bilinear_process_pixblock_head - subs WIDTH, WIDTH, #pixblock_size + mov PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift) + \bilinear_process_pixblock_head + subs WIDTH, WIDTH, #\pixblock_size blt 5f 0: - bilinear_process_pixblock_tail_head - subs WIDTH, WIDTH, #pixblock_size + \bilinear_process_pixblock_tail_head + subs WIDTH, WIDTH, #\pixblock_size bge 0b 5: - bilinear_process_pixblock_tail + \bilinear_process_pixblock_tail 1: -.if pixblock_size == 8 +.if \pixblock_size == 8 tst WIDTH, #4 beq 2f - bilinear_process_four_pixels + \bilinear_process_four_pixels 2: .endif /* handle the remaining trailing pixels */ tst WIDTH, 
#2 beq 2f - bilinear_process_two_pixels + \bilinear_process_two_pixels 2: tst WIDTH, #1 beq 3f - bilinear_process_last_pixel + \bilinear_process_last_pixel 3: -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 vpop {d8-d15} .endif -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0 pop {r4, r5, r6, r7, r8, r9} .else pop {r4, r5, r6, r7, r8, r9, r10, ip} .endif bx lr .unreq OUT .unreq TOP @@ -757,21 +757,21 @@ 3: .unreq UX .unreq WIDTH .unreq TMP1 .unreq TMP2 .unreq PF_OFFS .unreq TMP3 .unreq TMP4 .unreq STRIDE -.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0 .unreq MASK .endif -.endfunc +pixman_end_asm_function .endm /* src_8888_8_8888 */ .macro bilinear_src_8888_8_8888_process_last_pixel bilinear_interpolate_last_pixel 8888, 8, 8888, src .endm diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S --- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S @@ -29,16 +29,22 @@ * (those which are exposing some new or interesting features) are * extensively commented and can be used as examples. * * You may want to have a look at the comments for following functions: * - pixman_composite_over_8888_0565_asm_neon * - pixman_composite_over_n_8_0565_asm_neon */ +#ifdef __clang__ +#define ldrgeb ldrbge +#define subges subsge +#define subpls subspl +#endif + /* Prevent the stack from becoming executable for no reason... */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif .text .fpu neon .arch armv7a @@ -255,43 +261,43 @@ vqadd.u8 d16, d2, d20 vld1.16 {d4, d5}, [DST_R, :128]! vqadd.u8 q9, q0, q11 vshrn.u16 d6, q2, #8 fetch_src_pixblock vshrn.u16 d7, q2, #3 vsli.u16 q2, q2, #5 vshll.u8 q14, d16, #8 - PF add PF_X, PF_X, #8 + PF add, PF_X, PF_X, #8 vshll.u8 q8, d19, #8 - PF tst PF_CTL, #0xF + PF tst, PF_CTL, #0xF vsri.u8 d6, d6, #5 - PF addne PF_X, PF_X, #8 + PF addne, PF_X, PF_X, #8 vmvn.8 d3, d3 - PF subne PF_CTL, PF_CTL, #1 + PF subne, PF_CTL, PF_CTL, #1 vsri.u8 d7, d7, #6 vshrn.u16 d30, q2, #2 vmull.u8 q10, d3, d6 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] vmull.u8 q11, d3, d7 vmull.u8 q12, d3, d30 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] vsri.u16 q14, q8, #5 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W vshll.u8 q9, d18, #8 vrshr.u16 q13, q10, #8 - PF subge PF_X, PF_X, ORIG_W + PF subge, PF_X, PF_X, ORIG_W vrshr.u16 q3, q11, #8 vrshr.u16 q15, q12, #8 - PF subges PF_CTL, PF_CTL, #0x10 + PF subges, PF_CTL, PF_CTL, #0x10 vsri.u16 q14, q9, #11 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! vraddhn.u16 d20, q10, q13 vraddhn.u16 d23, q11, q3 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! vraddhn.u16 d22, q12, q15 vst1.16 {d28, d29}, [DST_W, :128]! .endm #else /* If we did not care much about the performance, we would just use this... 
*/ .macro pixman_composite_over_8888_0565_process_pixblock_tail_head @@ -429,30 +435,30 @@ generate_composite_function \ .macro pixman_composite_src_8888_0565_process_pixblock_tail vsri.u16 q14, q8, #5 vsri.u16 q14, q9, #11 .endm .macro pixman_composite_src_8888_0565_process_pixblock_tail_head vsri.u16 q14, q8, #5 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF fetch_src_pixblock - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 + PF addne, PF_X, PF_X, #8 + PF subne, PF_CTL, PF_CTL, #1 vsri.u16 q14, q9, #11 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] vshll.u8 q8, d1, #8 vst1.16 {d28, d29}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 + PF subge, PF_X, PF_X, ORIG_W + PF subges, PF_CTL, PF_CTL, #0x10 vshll.u8 q14, d2, #8 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! vshll.u8 q9, d0, #8 .endm generate_composite_function \ pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ 10, /* prefetch distance */ \ @@ -504,30 +510,30 @@ generate_composite_function \ vqadd.u8 q15, q1, q3 .endm .macro pixman_composite_add_8_8_process_pixblock_tail .endm .macro pixman_composite_add_8_8_process_pixblock_tail_head fetch_src_pixblock - PF add PF_X, PF_X, #32 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #32 + PF tst, PF_CTL, #0xF vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! - PF addne PF_X, PF_X, #32 - PF subne PF_CTL, PF_CTL, #1 + PF addne, PF_X, PF_X, #32 + PF subne, PF_CTL, PF_CTL, #1 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 + PF subge, PF_X, PF_X, ORIG_W + PF subges, PF_CTL, PF_CTL, #0x10 vqadd.u8 q14, q0, q2 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! vqadd.u8 q15, q1, q3 .endm generate_composite_function \ pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ FLAG_DST_READWRITE, \ 32, /* number of pixels, processed in a single block */ \ 10, /* prefetch distance */ \ @@ -536,30 +542,30 @@ generate_composite_function \ pixman_composite_add_8_8_process_pixblock_head, \ pixman_composite_add_8_8_process_pixblock_tail, \ pixman_composite_add_8_8_process_pixblock_tail_head /******************************************************************************/ .macro pixman_composite_add_8888_8888_process_pixblock_tail_head fetch_src_pixblock - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 + PF addne, PF_X, PF_X, #8 + PF subne, PF_CTL, PF_CTL, #1 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 + PF subge, PF_X, PF_X, ORIG_W + PF subges, PF_CTL, PF_CTL, #0x10 vqadd.u8 q14, q0, q2 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
+ PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! vqadd.u8 q15, q1, q3 .endm generate_composite_function \ pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ FLAG_DST_READWRITE, \ 8, /* number of pixels, processed in a single block */ \ 10, /* prefetch distance */ \ @@ -599,40 +605,40 @@ generate_composite_function_single_scanl vraddhn.u16 d29, q15, q9 vraddhn.u16 d30, q12, q10 vraddhn.u16 d31, q13, q11 .endm .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF vrshr.u16 q15, q9, #8 vrshr.u16 q12, q10, #8 vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 + PF addne, PF_X, PF_X, #8 + PF subne, PF_CTL, PF_CTL, #1 vraddhn.u16 d28, q14, q8 vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W vraddhn.u16 d30, q12, q10 vraddhn.u16 d31, q13, q11 fetch_src_pixblock PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] vmvn.8 d22, d3 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W + PF subge, PF_X, PF_X, ORIG_W vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 + PF subsge, PF_CTL, PF_CTL, #0x10 vmull.u8 q9, d22, d5 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! vmull.u8 q11, d22, d7 .endm generate_composite_function_single_scanline \ pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ default_init, \ @@ -651,42 +657,42 @@ generate_composite_function_single_scanl pixman_composite_out_reverse_8888_8888_process_pixblock_tail vqadd.u8 q14, q0, q14 vqadd.u8 q15, q1, q15 .endm .macro pixman_composite_over_8888_8888_process_pixblock_tail_head vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF vrshr.u16 q15, q9, #8 vrshr.u16 q12, q10, #8 vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 + PF addne, PF_X, PF_X, #8 + PF subne, PF_CTL, PF_CTL, #1 vraddhn.u16 d28, q14, q8 vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W vraddhn.u16 d30, q12, q10 vraddhn.u16 d31, q13, q11 vqadd.u8 q14, q0, q14 vqadd.u8 q15, q1, q15 fetch_src_pixblock PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] vmvn.8 d22, d3 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W + PF subge, PF_X, PF_X, ORIG_W vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 + PF subges, PF_CTL, PF_CTL, #0x10 vmull.u8 q9, d22, d5 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
vmull.u8 q11, d22, d7 .endm generate_composite_function \ pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ 5, /* prefetch distance */ \ @@ -737,30 +743,30 @@ generate_composite_function_single_scanl vrshr.u16 q2, q10, #8 vrshr.u16 q3, q11, #8 vraddhn.u16 d28, q14, q8 vraddhn.u16 d29, q15, q9 vraddhn.u16 d30, q2, q10 vraddhn.u16 d31, q3, q11 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! vqadd.u8 q14, q0, q14 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0x0F - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0x0F + PF addne, PF_X, PF_X, #8 + PF subne, PF_CTL, PF_CTL, #1 vqadd.u8 q15, q1, q15 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W vmull.u8 q8, d24, d4 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] vmull.u8 q9, d24, d5 - PF subge PF_X, PF_X, ORIG_W + PF subge, PF_X, PF_X, ORIG_W vmull.u8 q10, d24, d6 - PF subges PF_CTL, PF_CTL, #0x10 + PF subges, PF_CTL, PF_CTL, #0x10 vmull.u8 q11, d24, d7 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! .endm .macro pixman_composite_over_n_8888_init add DUMMY, sp, #ARGS_STACK_OFFSET vld1.32 {d3[0]}, [DUMMY] vdup.8 d0, d3[0] vdup.8 d1, d3[1] @@ -779,40 +785,40 @@ generate_composite_function \ pixman_composite_over_8888_8888_process_pixblock_head, \ pixman_composite_over_8888_8888_process_pixblock_tail, \ pixman_composite_over_n_8888_process_pixblock_tail_head /******************************************************************************/ .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head vrshr.u16 q14, q8, #8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF vrshr.u16 q15, q9, #8 vrshr.u16 q12, q10, #8 vrshr.u16 q13, q11, #8 - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 + PF addne, PF_X, PF_X, #8 + PF subne, PF_CTL, PF_CTL, #1 vraddhn.u16 d28, q14, q8 vraddhn.u16 d29, q15, q9 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W vraddhn.u16 d30, q12, q10 vraddhn.u16 d31, q13, q11 vqadd.u8 q14, q0, q14 vqadd.u8 q15, q1, q15 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! vmvn.8 d22, d3 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF subge PF_X, PF_X, ORIG_W + PF subge, PF_X, PF_X, ORIG_W vmull.u8 q8, d22, d4 - PF subges PF_CTL, PF_CTL, #0x10 + PF subges, PF_CTL, PF_CTL, #0x10 vmull.u8 q9, d22, d5 vmull.u8 q10, d22, d6 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
vmull.u8 q11, d22, d7 .endm .macro pixman_composite_over_reverse_n_8888_init add DUMMY, sp, #ARGS_STACK_OFFSET vld1.32 {d7[0]}, [DUMMY] vdup.8 d4, d7[0] vdup.8 d5, d7[1] @@ -1240,33 +1246,33 @@ generate_composite_function \ vrshrn.u16 d28, q8, #8 vrshrn.u16 d29, q9, #8 vrshrn.u16 d30, q10, #8 vrshrn.u16 d31, q11, #8 .endm .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head fetch_mask_pixblock - PF add PF_X, PF_X, #8 + PF add, PF_X, PF_X, #8 vrshrn.u16 d28, q8, #8 - PF tst PF_CTL, #0x0F + PF tst, PF_CTL, #0x0F vrshrn.u16 d29, q9, #8 - PF addne PF_X, PF_X, #8 + PF addne, PF_X, PF_X, #8 vrshrn.u16 d30, q10, #8 - PF subne PF_CTL, PF_CTL, #1 + PF subne, PF_CTL, PF_CTL, #1 vrshrn.u16 d31, q11, #8 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W vmull.u8 q8, d24, d0 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] vmull.u8 q9, d24, d1 - PF subge PF_X, PF_X, ORIG_W + PF subge, PF_X, PF_X, ORIG_W vmull.u8 q10, d24, d2 - PF subges PF_CTL, PF_CTL, #0x10 + PF subges, PF_CTL, PF_CTL, #0x10 vmull.u8 q11, d24, d3 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! vrsra.u16 q8, q8, #8 vrsra.u16 q9, q9, #8 vrsra.u16 q10, q10, #8 vrsra.u16 q11, q11, #8 .endm .macro pixman_composite_src_n_8_8888_init @@ -1309,33 +1315,33 @@ generate_composite_function \ vrshrn.u16 d28, q0, #8 vrshrn.u16 d29, q1, #8 vrshrn.u16 d30, q2, #8 vrshrn.u16 d31, q3, #8 .endm .macro pixman_composite_src_n_8_8_process_pixblock_tail_head fetch_mask_pixblock - PF add PF_X, PF_X, #8 + PF add, PF_X, PF_X, #8 vrshrn.u16 d28, q0, #8 - PF tst PF_CTL, #0x0F + PF tst, PF_CTL, #0x0F vrshrn.u16 d29, q1, #8 - PF addne PF_X, PF_X, #8 + PF addne, PF_X, PF_X, #8 vrshrn.u16 d30, q2, #8 - PF subne PF_CTL, PF_CTL, #1 + PF subne, PF_CTL, PF_CTL, #1 vrshrn.u16 d31, q3, #8 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W vmull.u8 q0, d24, d16 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] vmull.u8 q1, d25, d16 - PF subge PF_X, PF_X, ORIG_W + PF subge, PF_X, PF_X, ORIG_W vmull.u8 q2, d26, d16 - PF subges PF_CTL, PF_CTL, #0x10 + PF subges, PF_CTL, PF_CTL, #0x10 vmull.u8 q3, d27, d16 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! vrsra.u16 q0, q0, #8 vrsra.u16 q1, q1, #8 vrsra.u16 q2, q2, #8 vrsra.u16 q3, q3, #8 .endm .macro pixman_composite_src_n_8_8_init @@ -1403,37 +1409,37 @@ generate_composite_function \ .endm .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head vrshr.u16 q14, q8, #8 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! vrshr.u16 q15, q9, #8 fetch_mask_pixblock vrshr.u16 q6, q10, #8 - PF add PF_X, PF_X, #8 + PF add, PF_X, PF_X, #8 vrshr.u16 q7, q11, #8 - PF tst PF_CTL, #0x0F + PF tst, PF_CTL, #0x0F vraddhn.u16 d28, q14, q8 - PF addne PF_X, PF_X, #8 + PF addne, PF_X, PF_X, #8 vraddhn.u16 d29, q15, q9 - PF subne PF_CTL, PF_CTL, #1 + PF subne, PF_CTL, PF_CTL, #1 vraddhn.u16 d30, q6, q10 - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W vraddhn.u16 d31, q7, q11 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] vmull.u8 q6, d24, d8 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] vmull.u8 q7, d24, d9 - PF subge PF_X, PF_X, ORIG_W + PF subge, PF_X, PF_X, ORIG_W vmull.u8 q8, d24, d10 - PF subges PF_CTL, PF_CTL, #0x10 + PF subges, PF_CTL, PF_CTL, #0x10 vmull.u8 q9, d24, d11 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
vqadd.u8 q14, q0, q14 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! vqadd.u8 q15, q1, q15 vrshr.u16 q10, q6, #8 vrshr.u16 q11, q7, #8 vrshr.u16 q12, q8, #8 vrshr.u16 q13, q9, #8 vraddhn.u16 d0, q6, q10 vraddhn.u16 d1, q7, q11 vraddhn.u16 d2, q8, q12 @@ -2420,31 +2426,31 @@ generate_composite_function \ .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head vrshr.u16 q11, q8, #8 vswp d3, d31 vrshr.u16 q12, q9, #8 vrshr.u16 q13, q10, #8 fetch_src_pixblock vraddhn.u16 d30, q11, q8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF + PF addne, PF_X, PF_X, #8 + PF subne, PF_CTL, PF_CTL, #1 vraddhn.u16 d29, q12, q9 vraddhn.u16 d28, q13, q10 vmull.u8 q8, d3, d0 vmull.u8 q9, d3, d1 vmull.u8 q10, d3, d2 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF subge, PF_X, PF_X, ORIG_W + PF subges, PF_CTL, PF_CTL, #0x10 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! .endm generate_composite_function \ pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ 10, /* prefetch distance */ \ default_init, \ @@ -2477,31 +2483,31 @@ generate_composite_function \ .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head vrshr.u16 q11, q8, #8 vswp d3, d31 vrshr.u16 q12, q9, #8 vrshr.u16 q13, q10, #8 fetch_src_pixblock vraddhn.u16 d28, q11, q8 - PF add PF_X, PF_X, #8 - PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #8 - PF subne PF_CTL, PF_CTL, #1 + PF add, PF_X, PF_X, #8 + PF tst, PF_CTL, #0xF + PF addne, PF_X, PF_X, #8 + PF subne, PF_CTL, PF_CTL, #1 vraddhn.u16 d29, q12, q9 vraddhn.u16 d30, q13, q10 vmull.u8 q8, d3, d0 vmull.u8 q9, d3, d1 vmull.u8 q10, d3, d2 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! - PF cmp PF_X, ORIG_W + PF cmp, PF_X, ORIG_W PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF subge, PF_X, PF_X, ORIG_W + PF subges, PF_CTL, PF_CTL, #0x10 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! .endm generate_composite_function \ pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 8, /* number of pixels, processed in a single block */ \ 10, /* prefetch distance */ \ default_init, \ @@ -2836,182 +2842,182 @@ generate_composite_function_nearest_scan * format conversion, and interpolation as separate macros which can be used * as the basic building blocks for constructing bilinear scanline functions. 
*/ .macro bilinear_load_8888 reg1, reg2, tmp mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #2 - vld1.32 {reg1}, [TMP1], STRIDE - vld1.32 {reg2}, [TMP1] + vld1.32 {\reg1}, [TMP1], STRIDE + vld1.32 {\reg2}, [TMP1] .endm .macro bilinear_load_0565 reg1, reg2, tmp mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #1 - vld1.32 {reg2[0]}, [TMP1], STRIDE - vld1.32 {reg2[1]}, [TMP1] - convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp + vld1.32 {\reg2[0]}, [TMP1], STRIDE + vld1.32 {\reg2[1]}, [TMP1] + convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp .endm .macro bilinear_load_and_vertical_interpolate_two_8888 \ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 - bilinear_load_8888 reg1, reg2, tmp1 - vmull.u8 acc1, reg1, d28 - vmlal.u8 acc1, reg2, d29 - bilinear_load_8888 reg3, reg4, tmp2 - vmull.u8 acc2, reg3, d28 - vmlal.u8 acc2, reg4, d29 + bilinear_load_8888 \reg1, \reg2, \tmp1 + vmull.u8 \acc1, \reg1, d28 + vmlal.u8 \acc1, \reg2, d29 + bilinear_load_8888 \reg3, \reg4, \tmp2 + vmull.u8 \acc2, \reg3, d28 + vmlal.u8 \acc2, \reg4, d29 .endm .macro bilinear_load_and_vertical_interpolate_four_8888 \ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi bilinear_load_and_vertical_interpolate_two_8888 \ - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi + \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi bilinear_load_and_vertical_interpolate_two_8888 \ - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi + \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi .endm .macro bilinear_load_and_vertical_interpolate_two_0565 \ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #1 mov TMP2, X, asr #16 add X, X, UX add TMP2, TOP, TMP2, asl #1 - vld1.32 {acc2lo[0]}, [TMP1], STRIDE - vld1.32 {acc2hi[0]}, [TMP2], STRIDE - vld1.32 {acc2lo[1]}, [TMP1] - vld1.32 {acc2hi[1]}, [TMP2] - convert_0565_to_x888 acc2, reg3, reg2, reg1 - vzip.u8 reg1, reg3 - vzip.u8 reg2, reg4 - vzip.u8 reg3, reg4 - vzip.u8 reg1, reg2 - vmull.u8 acc1, reg1, d28 - vmlal.u8 acc1, reg2, d29 - vmull.u8 acc2, reg3, d28 - vmlal.u8 acc2, reg4, d29 + vld1.32 {\acc2lo[0]}, [TMP1], STRIDE + vld1.32 {\acc2hi[0]}, [TMP2], STRIDE + vld1.32 {\acc2lo[1]}, [TMP1] + vld1.32 {\acc2hi[1]}, [TMP2] + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1 + vzip.u8 \reg1, \reg3 + vzip.u8 \reg2, \reg4 + vzip.u8 \reg3, \reg4 + vzip.u8 \reg1, \reg2 + vmull.u8 \acc1, \reg1, d28 + vmlal.u8 \acc1, \reg2, d29 + vmull.u8 \acc2, \reg3, d28 + vmlal.u8 \acc2, \reg4, d29 .endm .macro bilinear_load_and_vertical_interpolate_four_0565 \ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #1 mov TMP2, X, asr #16 add X, X, UX add TMP2, TOP, TMP2, asl #1 - vld1.32 {xacc2lo[0]}, [TMP1], STRIDE - vld1.32 {xacc2hi[0]}, [TMP2], STRIDE - vld1.32 {xacc2lo[1]}, [TMP1] - vld1.32 {xacc2hi[1]}, [TMP2] - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 + vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE + vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE + vld1.32 {\xacc2lo[1]}, [TMP1] + vld1.32 {\xacc2hi[1]}, [TMP2] + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1 mov TMP1, X, asr #16 add X, X, UX add TMP1, TOP, TMP1, asl #1 mov TMP2, X, asr #16 add X, X, UX add TMP2, TOP, TMP2, asl #1 - vld1.32 {yacc2lo[0]}, [TMP1], STRIDE - vzip.u8 xreg1, xreg3 - vld1.32 {yacc2hi[0]}, [TMP2], STRIDE - 
vzip.u8 xreg2, xreg4 - vld1.32 {yacc2lo[1]}, [TMP1] - vzip.u8 xreg3, xreg4 - vld1.32 {yacc2hi[1]}, [TMP2] - vzip.u8 xreg1, xreg2 - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 - vmull.u8 xacc1, xreg1, d28 - vzip.u8 yreg1, yreg3 - vmlal.u8 xacc1, xreg2, d29 - vzip.u8 yreg2, yreg4 - vmull.u8 xacc2, xreg3, d28 - vzip.u8 yreg3, yreg4 - vmlal.u8 xacc2, xreg4, d29 - vzip.u8 yreg1, yreg2 - vmull.u8 yacc1, yreg1, d28 - vmlal.u8 yacc1, yreg2, d29 - vmull.u8 yacc2, yreg3, d28 - vmlal.u8 yacc2, yreg4, d29 + vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE + vzip.u8 \xreg1, \xreg3 + vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE + vzip.u8 \xreg2, \xreg4 + vld1.32 {\yacc2lo[1]}, [TMP1] + vzip.u8 \xreg3, \xreg4 + vld1.32 {\yacc2hi[1]}, [TMP2] + vzip.u8 \xreg1, \xreg2 + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1 + vmull.u8 \xacc1, \xreg1, d28 + vzip.u8 \yreg1, \yreg3 + vmlal.u8 \xacc1, \xreg2, d29 + vzip.u8 \yreg2, \yreg4 + vmull.u8 \xacc2, \xreg3, d28 + vzip.u8 \yreg3, \yreg4 + vmlal.u8 \xacc2, \xreg4, d29 + vzip.u8 \yreg1, \yreg2 + vmull.u8 \yacc1, \yreg1, d28 + vmlal.u8 \yacc1, \yreg2, d29 + vmull.u8 \yacc2, \yreg3, d28 + vmlal.u8 \yacc2, \yreg4, d29 .endm .macro bilinear_store_8888 numpix, tmp1, tmp2 -.if numpix == 4 +.if \numpix == 4 vst1.32 {d0, d1}, [OUT, :128]! -.elseif numpix == 2 +.elseif \numpix == 2 vst1.32 {d0}, [OUT, :64]! -.elseif numpix == 1 +.elseif \numpix == 1 vst1.32 {d0[0]}, [OUT, :32]! .else - .error bilinear_store_8888 numpix is unsupported + .error bilinear_store_8888 \numpix is unsupported .endif .endm .macro bilinear_store_0565 numpix, tmp1, tmp2 vuzp.u8 d0, d1 vuzp.u8 d2, d3 vuzp.u8 d1, d3 vuzp.u8 d0, d2 - convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 -.if numpix == 4 + convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2 +.if \numpix == 4 vst1.16 {d2}, [OUT, :64]! -.elseif numpix == 2 +.elseif \numpix == 2 vst1.32 {d2[0]}, [OUT, :32]! -.elseif numpix == 1 +.elseif \numpix == 1 vst1.16 {d2[0]}, [OUT, :16]! 
.else - .error bilinear_store_0565 numpix is unsupported + .error bilinear_store_0565 \numpix is unsupported .endif .endm .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt - bilinear_load_&src_fmt d0, d1, d2 + bilinear_load_\()\src_fmt d0, d1, d2 vmull.u8 q1, d0, d28 vmlal.u8 q1, d1, d29 /* 5 cycles bubble */ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 /* 5 cycles bubble */ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) /* 3 cycles bubble */ vmovn.u16 d0, q0 /* 1 cycle bubble */ - bilinear_store_&dst_fmt 1, q2, q3 + bilinear_store_\()\dst_fmt 1, q2, q3 .endm .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt - bilinear_load_and_vertical_interpolate_two_&src_fmt \ + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \ q1, q11, d0, d1, d20, d21, d22, d23 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q10, d22, d31 vmlal.u16 q10, d23, d31 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 vmovn.u16 d0, q0 - bilinear_store_&dst_fmt 2, q2, q3 + bilinear_store_\()\dst_fmt 2, q2, q3 .endm .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt - bilinear_load_and_vertical_interpolate_four_&src_fmt \ + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \ q1, q11, d0, d1, d20, d21, d22, d23 \ q3, q9, d4, d5, d16, d17, d18, d19 pld [TMP1, PF_OFFS] sub TMP1, TMP1, STRIDE vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS vmlsl.u16 q0, d2, d30 vmlal.u16 q0, d3, d30 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS @@ -3029,64 +3035,64 @@ generate_composite_function_nearest_scan vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vmovn.u16 d0, q0 vmovn.u16 d1, q2 vadd.u16 q12, q12, q13 - bilinear_store_&dst_fmt 4, q2, q3 + bilinear_store_\()\dst_fmt 4, q2, q3 .endm .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head .else - bilinear_interpolate_four_pixels src_fmt, dst_fmt + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt .endif .endm .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail .endif .endm .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head .else - bilinear_interpolate_four_pixels src_fmt, dst_fmt + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt .endif .endm .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt -.ifdef 
have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head .else - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt .endif .endm .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail .else - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt .endif .endm .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head .else - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt .endif .endm .set BILINEAR_FLAG_UNROLL_4, 0 .set BILINEAR_FLAG_UNROLL_8, 1 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 /* @@ -3101,17 +3107,17 @@ generate_composite_function_nearest_scan * prefetch_distance - prefetch in the source image by that many * pixels ahead */ .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ src_bpp_shift, dst_bpp_shift, \ prefetch_distance, flags -pixman_asm_function fname +pixman_asm_function \fname OUT .req r0 TOP .req r1 BOTTOM .req r2 WT .req r3 WB .req r4 X .req r5 UX .req r6 WIDTH .req ip @@ -3119,21 +3125,21 @@ pixman_asm_function fname TMP2 .req r4 PF_OFFS .req r7 TMP3 .req r8 TMP4 .req r9 STRIDE .req r2 mov ip, sp push {r4, r5, r6, r7, r8, r9} - mov PF_OFFS, #prefetch_distance + mov PF_OFFS, #\prefetch_distance ldmia ip, {WB, X, UX, WIDTH} mul PF_OFFS, PF_OFFS, UX -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 vpush {d8-d15} .endif sub STRIDE, BOTTOM, TOP .unreq BOTTOM cmp WIDTH, #0 ble 3f @@ -3146,83 +3152,83 @@ pixman_asm_function fname /* ensure good destination alignment */ cmp WIDTH, #1 blt 0f tst OUT, #(1 << dst_bpp_shift) beq 0f vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 - bilinear_interpolate_last_pixel src_fmt, dst_fmt + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt sub WIDTH, WIDTH, #1 0: vadd.u16 q13, q13, q13 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) vadd.u16 q12, q12, q13 cmp WIDTH, #2 blt 0f tst OUT, #(1 << (dst_bpp_shift + 1)) beq 0f - bilinear_interpolate_two_pixels src_fmt, dst_fmt + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt sub WIDTH, WIDTH, #2 0: -.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 +.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0 /*********** 8 pixels per iteration *****************/ cmp WIDTH, #4 blt 0f tst OUT, #(1 << (dst_bpp_shift + 2)) beq 0f - bilinear_interpolate_four_pixels src_fmt, dst_fmt + 
bilinear_interpolate_four_pixels \src_fmt, \dst_fmt sub WIDTH, WIDTH, #4 0: subs WIDTH, WIDTH, #8 blt 1f mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) - bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt + bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt subs WIDTH, WIDTH, #8 blt 5f 0: - bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt + bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt subs WIDTH, WIDTH, #8 bge 0b 5: - bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt + bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt 1: tst WIDTH, #4 beq 2f - bilinear_interpolate_four_pixels src_fmt, dst_fmt + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt 2: .else /*********** 4 pixels per iteration *****************/ subs WIDTH, WIDTH, #4 blt 1f mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt subs WIDTH, WIDTH, #4 blt 5f 0: - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt subs WIDTH, WIDTH, #4 bge 0b 5: - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt 1: /****************************************************/ .endif /* handle the remaining trailing pixels */ tst WIDTH, #2 beq 2f - bilinear_interpolate_two_pixels src_fmt, dst_fmt + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt 2: tst WIDTH, #1 beq 3f - bilinear_interpolate_last_pixel src_fmt, dst_fmt + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt 3: -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 vpop {d8-d15} .endif pop {r4, r5, r6, r7, r8, r9} bx lr .unreq OUT .unreq TOP .unreq WT @@ -3231,17 +3237,17 @@ 3: .unreq UX .unreq WIDTH .unreq TMP1 .unreq TMP2 .unreq PF_OFFS .unreq TMP3 .unreq TMP4 .unreq STRIDE -.endfunc +pixman_end_asm_function .endm /*****************************************************************************/ .set have_bilinear_interpolate_four_pixels_8888_8888, 1 .macro bilinear_interpolate_four_pixels_8888_8888_head diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h --- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h @@ -69,303 +69,303 @@ .set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ /* * Definitions of supplementary pixld/pixst macros (for partial load/store of * pixel data). */ .macro pixldst1 op, elem_size, reg1, mem_operand, abits -.if abits > 0 - op&.&elem_size {d®1}, [&mem_operand&, :&abits&]! +.if \abits > 0 + \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]! .else - op&.&elem_size {d®1}, [&mem_operand&]! + \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]! .endif .endm .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits -.if abits > 0 - op&.&elem_size {d®1, d®2}, [&mem_operand&, :&abits&]! +.if \abits > 0 + \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]! .else - op&.&elem_size {d®1, d®2}, [&mem_operand&]! + \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]! .endif .endm .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits -.if abits > 0 - op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&, :&abits&]! 
+.if \abits > 0 + \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]! .else - op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&]! + \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]! .endif .endm .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits - op&.&elem_size {d®1[idx]}, [&mem_operand&]! + \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]! .endm .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand - op&.&elem_size {d®1, d®2, d®3}, [&mem_operand&]! + \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]! .endm .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand - op&.&elem_size {d®1[idx], d®2[idx], d®3[idx]}, [&mem_operand&]! + \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]! .endm .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits -.if numbytes == 32 - pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ - %(basereg+6), %(basereg+7), mem_operand, abits -.elseif numbytes == 16 - pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits -.elseif numbytes == 8 - pixldst1 op, elem_size, %(basereg+1), mem_operand, abits -.elseif numbytes == 4 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) - pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits - .elseif elem_size == 16 - pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits - pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits +.if \numbytes == 32 + pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \ + %(\basereg+6), %(\basereg+7), \mem_operand, \abits +.elseif \numbytes == 16 + pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits +.elseif \numbytes == 8 + pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits +.elseif \numbytes == 4 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32) + pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits + .elseif \elem_size == 16 + pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits + pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits .else - pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits - pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits - pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits - pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits + pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits + pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits + pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits + pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits .endif -.elseif numbytes == 2 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) - pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits +.elseif \numbytes == 2 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16) + pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits .else - pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits - pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits + pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits + pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits .endif -.elseif numbytes == 1 - pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits +.elseif \numbytes == 1 + pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits .else - .error "unsupported size: numbytes" + .error "unsupported size: \numbytes" .endif .endm .macro pixld numpix, bpp, basereg, mem_operand, abits=0 -.if bpp > 0 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) - pixldst4 vld4, 8, %(basereg+4), 
%(basereg+5), \ - %(basereg+6), %(basereg+7), mem_operand, abits -.elseif (bpp == 24) && (numpix == 8) - pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand -.elseif (bpp == 24) && (numpix == 4) - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand -.elseif (bpp == 24) && (numpix == 2) - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand -.elseif (bpp == 24) && (numpix == 1) - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.if \bpp > 0 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \ + %(\basereg+6), %(\basereg+7), \mem_operand, \abits +.elseif (\bpp == 24) && (\numpix == 8) + pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand +.elseif (\bpp == 24) && (\numpix == 4) + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand +.elseif (\bpp == 24) && (\numpix == 2) + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand +.elseif (\bpp == 24) && (\numpix == 1) + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand .else - pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits + pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits .endif .endif .endm .macro pixst numpix, bpp, basereg, mem_operand, abits=0 -.if bpp > 0 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) - pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ - %(basereg+6), %(basereg+7), mem_operand, abits -.elseif (bpp == 24) && (numpix == 8) - pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand -.elseif (bpp == 24) && (numpix == 4) - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand -.elseif (bpp == 24) && (numpix == 2) - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand -.elseif (bpp == 24) && (numpix == 1) - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.if \bpp > 0 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \ + %(\basereg+6), %(\basereg+7), \mem_operand, \abits +.elseif (\bpp == 24) && (\numpix == 8) + pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand +.elseif (\bpp == 24) && (\numpix == 4) + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand + 
pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand +.elseif (\bpp == 24) && (\numpix == 2) + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand +.elseif (\bpp == 24) && (\numpix == 1) + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand .else - pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits + pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits .endif .endif .endm .macro pixld_a numpix, bpp, basereg, mem_operand -.if (bpp * numpix) <= 128 - pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.if (\bpp * \numpix) <= 128 + pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) .else - pixld numpix, bpp, basereg, mem_operand, 128 + pixld \numpix, \bpp, \basereg, \mem_operand, 128 .endif .endm .macro pixst_a numpix, bpp, basereg, mem_operand -.if (bpp * numpix) <= 128 - pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.if (\bpp * \numpix) <= 128 + pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix) .else - pixst numpix, bpp, basereg, mem_operand, 128 + pixst \numpix, \bpp, \basereg, \mem_operand, 128 .endif .endm /* * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register * aliases to be defined) */ .macro pixld1_s elem_size, reg1, mem_operand -.if elem_size == 16 +.if \elem_size == 16 mov TMP1, VX, asr #16 adds VX, VX, UNIT_X 5: subpls VX, VX, SRC_WIDTH_FIXED bpl 5b - add TMP1, mem_operand, TMP1, asl #1 + add TMP1, \mem_operand, TMP1, asl #1 mov TMP2, VX, asr #16 adds VX, VX, UNIT_X 5: subpls VX, VX, SRC_WIDTH_FIXED bpl 5b - add TMP2, mem_operand, TMP2, asl #1 - vld1.16 {d®1&[0]}, [TMP1, :16] + add TMP2, \mem_operand, TMP2, asl #1 + vld1.16 {d\()\reg1\()[0]}, [TMP1, :16] mov TMP1, VX, asr #16 adds VX, VX, UNIT_X 5: subpls VX, VX, SRC_WIDTH_FIXED bpl 5b - add TMP1, mem_operand, TMP1, asl #1 - vld1.16 {d®1&[1]}, [TMP2, :16] + add TMP1, \mem_operand, TMP1, asl #1 + vld1.16 {d\()\reg1\()[1]}, [TMP2, :16] mov TMP2, VX, asr #16 adds VX, VX, UNIT_X 5: subpls VX, VX, SRC_WIDTH_FIXED bpl 5b - add TMP2, mem_operand, TMP2, asl #1 - vld1.16 {d®1&[2]}, [TMP1, :16] - vld1.16 {d®1&[3]}, [TMP2, :16] -.elseif elem_size == 32 + add TMP2, \mem_operand, TMP2, asl #1 + vld1.16 {d\()\reg1\()[2]}, [TMP1, :16] + vld1.16 {d\()\reg1\()[3]}, [TMP2, :16] +.elseif \elem_size == 32 mov TMP1, VX, asr #16 adds VX, VX, UNIT_X 5: subpls VX, VX, SRC_WIDTH_FIXED bpl 5b - add TMP1, mem_operand, TMP1, asl #2 + add TMP1, \mem_operand, TMP1, asl #2 mov TMP2, VX, asr #16 adds VX, VX, UNIT_X 5: subpls VX, VX, SRC_WIDTH_FIXED bpl 5b - add TMP2, mem_operand, TMP2, asl #2 - vld1.32 {d®1&[0]}, [TMP1, :32] - vld1.32 {d®1&[1]}, [TMP2, :32] + add TMP2, \mem_operand, TMP2, asl #2 + vld1.32 {d\()\reg1\()[0]}, [TMP1, :32] + vld1.32 {d\()\reg1\()[1]}, [TMP2, :32] .else .error "unsupported" .endif .endm .macro pixld2_s elem_size, reg1, reg2, mem_operand .if 0 /* elem_size == 32 */ mov TMP1, VX, asr #16 add VX, VX, UNIT_X, asl #1 - add TMP1, mem_operand, TMP1, asl #2 + add TMP1, \mem_operand, TMP1, asl #2 mov TMP2, VX, asr #16 sub VX, VX, UNIT_X - add TMP2, mem_operand, TMP2, asl #2 - vld1.32 {d®1&[0]}, [TMP1, :32] + add TMP2, \mem_operand, TMP2, asl #2 + vld1.32 {d\()\reg1\()[0]}, [TMP1, :32] mov TMP1, VX, asr #16 add VX, VX, UNIT_X, asl #1 - add TMP1, mem_operand, TMP1, asl #2 - vld1.32 {d®2&[0]}, 
[TMP2, :32] + add TMP1, \mem_operand, TMP1, asl #2 + vld1.32 {d\()\reg2\()[0]}, [TMP2, :32] mov TMP2, VX, asr #16 add VX, VX, UNIT_X - add TMP2, mem_operand, TMP2, asl #2 - vld1.32 {d®1&[1]}, [TMP1, :32] - vld1.32 {d®2&[1]}, [TMP2, :32] + add TMP2, \mem_operand, TMP2, asl #2 + vld1.32 {d\()\reg1\()[1]}, [TMP1, :32] + vld1.32 {d\()\reg2\()[1]}, [TMP2, :32] .else - pixld1_s elem_size, reg1, mem_operand - pixld1_s elem_size, reg2, mem_operand + pixld1_s \elem_size, \reg1, \mem_operand + pixld1_s \elem_size, \reg2, \mem_operand .endif .endm .macro pixld0_s elem_size, reg1, idx, mem_operand -.if elem_size == 16 +.if \elem_size == 16 mov TMP1, VX, asr #16 adds VX, VX, UNIT_X 5: subpls VX, VX, SRC_WIDTH_FIXED bpl 5b - add TMP1, mem_operand, TMP1, asl #1 - vld1.16 {d®1&[idx]}, [TMP1, :16] -.elseif elem_size == 32 + add TMP1, \mem_operand, TMP1, asl #1 + vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16] +.elseif \elem_size == 32 mov TMP1, VX, asr #16 adds VX, VX, UNIT_X 5: subpls VX, VX, SRC_WIDTH_FIXED bpl 5b - add TMP1, mem_operand, TMP1, asl #2 - vld1.32 {d®1&[idx]}, [TMP1, :32] + add TMP1, \mem_operand, TMP1, asl #2 + vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32] .endif .endm .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand -.if numbytes == 32 - pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand - pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand - pixdeinterleave elem_size, %(basereg+4) -.elseif numbytes == 16 - pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand -.elseif numbytes == 8 - pixld1_s elem_size, %(basereg+1), mem_operand -.elseif numbytes == 4 - .if elem_size == 32 - pixld0_s elem_size, %(basereg+0), 1, mem_operand - .elseif elem_size == 16 - pixld0_s elem_size, %(basereg+0), 2, mem_operand - pixld0_s elem_size, %(basereg+0), 3, mem_operand +.if \numbytes == 32 + pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand + pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand + pixdeinterleave \elem_size, %(\basereg+4) +.elseif \numbytes == 16 + pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand +.elseif \numbytes == 8 + pixld1_s \elem_size, %(\basereg+1), \mem_operand +.elseif \numbytes == 4 + .if \elem_size == 32 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand + .elseif \elem_size == 16 + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand .else - pixld0_s elem_size, %(basereg+0), 4, mem_operand - pixld0_s elem_size, %(basereg+0), 5, mem_operand - pixld0_s elem_size, %(basereg+0), 6, mem_operand - pixld0_s elem_size, %(basereg+0), 7, mem_operand + pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand + pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand + pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand + pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand .endif -.elseif numbytes == 2 - .if elem_size == 16 - pixld0_s elem_size, %(basereg+0), 1, mem_operand +.elseif \numbytes == 2 + .if \elem_size == 16 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand .else - pixld0_s elem_size, %(basereg+0), 2, mem_operand - pixld0_s elem_size, %(basereg+0), 3, mem_operand + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand .endif -.elseif numbytes == 1 - pixld0_s elem_size, %(basereg+0), 1, mem_operand +.elseif \numbytes == 1 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand .else - .error "unsupported size: numbytes" + .error "unsupported size: \numbytes" .endif .endm .macro pixld_s numpix, bpp, basereg, 
mem_operand -.if bpp > 0 - pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand +.if \bpp > 0 + pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand .endif .endm .macro vuzp8 reg1, reg2 - vuzp.8 d®1, d®2 + vuzp.8 d\()\reg1, d\()\reg2 .endm .macro vzip8 reg1, reg2 - vzip.8 d®1, d®2 + vzip.8 d\()\reg1, d\()\reg2 .endm /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ .macro pixdeinterleave bpp, basereg -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) - vuzp8 %(basereg+0), %(basereg+1) - vuzp8 %(basereg+2), %(basereg+3) - vuzp8 %(basereg+1), %(basereg+3) - vuzp8 %(basereg+0), %(basereg+2) +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vuzp8 %(\basereg+0), %(\basereg+1) + vuzp8 %(\basereg+2), %(\basereg+3) + vuzp8 %(\basereg+1), %(\basereg+3) + vuzp8 %(\basereg+0), %(\basereg+2) .endif .endm /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ .macro pixinterleave bpp, basereg -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) - vzip8 %(basereg+0), %(basereg+2) - vzip8 %(basereg+1), %(basereg+3) - vzip8 %(basereg+2), %(basereg+3) - vzip8 %(basereg+0), %(basereg+1) +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vzip8 %(\basereg+0), %(\basereg+2) + vzip8 %(\basereg+1), %(\basereg+3) + vzip8 %(\basereg+2), %(\basereg+3) + vzip8 %(\basereg+0), %(\basereg+1) .endif .endm /* * This is a macro for implementing cache preload. The main idea is that * cache preload logic is mostly independent from the rest of pixels * processing code. It starts at the top left pixel and moves forward * across pixels and can jump across scanlines. Prefetch distance is @@ -389,51 +389,51 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED * for almost zero cost! * * (*) The overhead of the prefetcher is visible when running some trivial * pixels processing like simple copy. Anyway, having prefetch is a must * when working with the graphics data. */ .macro PF a, x:vararg .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) - a x + \a \x .endif .endm .macro cache_preload std_increment, boost_increment .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) .if regs_shortage - PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ + PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ .endif -.if std_increment != 0 - PF add PF_X, PF_X, #std_increment +.if \std_increment != 0 + PF add, PF_X, PF_X, #\std_increment .endif - PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #boost_increment - PF subne PF_CTL, PF_CTL, #1 - PF cmp PF_X, ORIG_W + PF tst, PF_CTL, #0xF + PF addne, PF_X, PF_X, #\boost_increment + PF subne, PF_CTL, PF_CTL, #1 + PF cmp, PF_X, ORIG_W .if src_bpp_shift >= 0 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] .endif .if dst_r_bpp != 0 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] .endif .if mask_bpp_shift >= 0 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] .endif - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 + PF subge, PF_X, PF_X, ORIG_W + PF subges, PF_CTL, PF_CTL, #0x10 .if src_bpp_shift >= 0 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! .endif .if dst_r_bpp != 0 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! .endif .if mask_bpp_shift >= 0 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 
.endif .endif .endm .macro cache_preload_simple .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) .if src_bpp > 0 pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] @@ -460,51 +460,53 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED .macro ensure_destination_ptr_alignment process_pixblock_head, \ process_pixblock_tail, \ process_pixblock_tail_head .if dst_w_bpp != 24 tst DST_R, #0xF beq 2f .irp lowbit, 1, 2, 4, 8, 16 +#ifndef __clang__ local skip1 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) -.if lowbit < 16 /* we don't need more than 16-byte alignment */ - tst DST_R, #lowbit +#endif +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if \lowbit < 16 /* we don't need more than 16-byte alignment */ + tst DST_R, #\lowbit beq 1f .endif - pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC - pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK + pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC + pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK .if dst_r_bpp > 0 - pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R + pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R .else - add DST_R, DST_R, #lowbit + add DST_R, DST_R, #\lowbit .endif - PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) - sub W, W, #(lowbit * 8 / dst_w_bpp) + PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp) + sub W, W, #(\lowbit * 8 / dst_w_bpp) 1: .endif .endr pixdeinterleave src_bpp, src_basereg pixdeinterleave mask_bpp, mask_basereg pixdeinterleave dst_r_bpp, dst_r_basereg - process_pixblock_head + \process_pixblock_head cache_preload 0, pixblock_size cache_preload_simple - process_pixblock_tail + \process_pixblock_tail pixinterleave dst_w_bpp, dst_w_basereg .irp lowbit, 1, 2, 4, 8, 16 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) -.if lowbit < 16 /* we don't need more than 16-byte alignment */ - tst DST_W, #lowbit +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if \lowbit < 16 /* we don't need more than 16-byte alignment */ + tst DST_W, #\lowbit beq 1f .endif - pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W + pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W 1: .endif .endr .endif 2: .endm /* @@ -525,51 +527,51 @@ 2: .macro process_trailing_pixels cache_preload_flag, \ dst_aligned_flag, \ process_pixblock_head, \ process_pixblock_tail, \ process_pixblock_tail_head tst W, #(pixblock_size - 1) beq 2f .irp chunk_size, 16, 8, 4, 2, 1 -.if pixblock_size > chunk_size - tst W, #chunk_size +.if pixblock_size > \chunk_size + tst W, #\chunk_size beq 1f - pixld_src chunk_size, src_bpp, src_basereg, SRC - pixld chunk_size, mask_bpp, mask_basereg, MASK -.if dst_aligned_flag != 0 - pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R + pixld_src \chunk_size, src_bpp, src_basereg, SRC + pixld \chunk_size, mask_bpp, mask_basereg, MASK +.if \dst_aligned_flag != 0 + pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R .else - pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R + pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R .endif -.if cache_preload_flag != 0 - PF add PF_X, PF_X, #chunk_size +.if \cache_preload_flag != 0 + PF add, PF_X, PF_X, #\chunk_size .endif 1: .endif .endr pixdeinterleave src_bpp, src_basereg pixdeinterleave mask_bpp, mask_basereg pixdeinterleave dst_r_bpp, dst_r_basereg - process_pixblock_head -.if cache_preload_flag != 0 + \process_pixblock_head +.if \cache_preload_flag != 0 
cache_preload 0, pixblock_size cache_preload_simple .endif - process_pixblock_tail + \process_pixblock_tail pixinterleave dst_w_bpp, dst_w_basereg .irp chunk_size, 16, 8, 4, 2, 1 -.if pixblock_size > chunk_size - tst W, #chunk_size +.if pixblock_size > \chunk_size + tst W, #\chunk_size beq 1f -.if dst_aligned_flag != 0 - pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.if \dst_aligned_flag != 0 + pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W .else - pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W + pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W .endif 1: .endif .endr 2: .endm /* @@ -599,17 +601,17 @@ 2: .if (mask_bpp != 24) && (mask_bpp != 0) sub MASK, MASK, W, lsl #mask_bpp_shift .endif subs H, H, #1 mov DST_R, DST_W .if regs_shortage str H, [sp, #4] /* save updated height to stack */ .endif - bge start_of_loop_label + bge \start_of_loop_label .endm /* * Registers are allocated in the following way by default: * d0, d1, d2, d3 - reserved for loading source pixel data * d4, d5, d6, d7 - reserved for loading destination pixel data * d24, d25, d26, d27 - reserved for loading mask pixel data * d28, d29, d30, d31 - final destination pixel data for writeback to memory @@ -626,48 +628,48 @@ 2: process_pixblock_head, \ process_pixblock_tail, \ process_pixblock_tail_head, \ dst_w_basereg_ = 28, \ dst_r_basereg_ = 4, \ src_basereg_ = 0, \ mask_basereg_ = 24 - pixman_asm_function fname + pixman_asm_function \fname push {r4-r12, lr} /* save all registers */ /* * Select prefetch type for this function. If prefetch distance is * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch * has to be used instead of ADVANCED. */ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT -.if prefetch_distance == 0 +.if \prefetch_distance == 0 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ - ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) + ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24)) .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE .endif /* * Make some macro arguments globally visible and accessible * from other macros */ - .set src_bpp, src_bpp_ - .set mask_bpp, mask_bpp_ - .set dst_w_bpp, dst_w_bpp_ - .set pixblock_size, pixblock_size_ - .set dst_w_basereg, dst_w_basereg_ - .set dst_r_basereg, dst_r_basereg_ - .set src_basereg, src_basereg_ - .set mask_basereg, mask_basereg_ + .set src_bpp, \src_bpp_ + .set mask_bpp, \mask_bpp_ + .set dst_w_bpp, \dst_w_bpp_ + .set pixblock_size, \pixblock_size_ + .set dst_w_basereg, \dst_w_basereg_ + .set dst_r_basereg, \dst_r_basereg_ + .set src_basereg, \src_basereg_ + .set mask_basereg, \mask_basereg_ .macro pixld_src x:vararg - pixld x + pixld \x .endm .macro fetch_src_pixblock pixld_src pixblock_size, src_bpp, \ (src_basereg - pixblock_size * src_bpp / 64), SRC .endm /* * Assign symbolic names to registers */ @@ -750,38 +752,38 @@ 2: .elseif dst_w_bpp == 16 .set dst_bpp_shift, 1 .elseif dst_w_bpp == 8 .set dst_bpp_shift, 0 .else .error "requested dst bpp (dst_w_bpp) is not supported" .endif -.if (((flags) & FLAG_DST_READWRITE) != 0) +.if (((\flags) & FLAG_DST_READWRITE) != 0) .set dst_r_bpp, dst_w_bpp .else .set dst_r_bpp, 0 .endif -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) .set DEINTERLEAVE_32BPP_ENABLED, 1 .else .set DEINTERLEAVE_32BPP_ENABLED, 0 .endif -.if prefetch_distance < 0 || prefetch_distance > 15 - .error "invalid prefetch distance (prefetch_distance)" +.if \prefetch_distance < 0 || 
\prefetch_distance > 15 + .error "invalid prefetch distance (\prefetch_distance)" .endif .if src_bpp > 0 ldr SRC, [sp, #40] .endif .if mask_bpp > 0 ldr MASK, [sp, #48] .endif - PF mov PF_X, #0 + PF mov, PF_X, #0 .if src_bpp > 0 ldr SRC_STRIDE, [sp, #44] .endif .if mask_bpp > 0 ldr MASK_STRIDE, [sp, #52] .endif mov DST_R, DST_W @@ -796,24 +798,24 @@ 2: .if dst_w_bpp == 24 sub DST_STRIDE, DST_STRIDE, W sub DST_STRIDE, DST_STRIDE, W, lsl #1 .endif /* * Setup advanced prefetcher initial state */ - PF mov PF_SRC, SRC - PF mov PF_DST, DST_R - PF mov PF_MASK, MASK + PF mov, PF_SRC, SRC + PF mov, PF_DST, DST_R + PF mov, PF_MASK, MASK /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ - PF mov PF_CTL, H, lsl #4 - PF add PF_CTL, #(prefetch_distance - 0x10) + PF mov, PF_CTL, H, lsl #4 + PF add, PF_CTL, #(\prefetch_distance - 0x10) - init + \init .if regs_shortage push {r0, r1} .endif subs H, H, #1 .if regs_shortage str H, [sp, #4] /* save updated height to stack */ .else mov ORIG_W, W @@ -821,84 +823,84 @@ 2: blt 9f cmp W, #(pixblock_size * 2) blt 8f /* * This is the start of the pipelined loop, which if optimized for * long scanlines */ 0: - ensure_destination_ptr_alignment process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + ensure_destination_ptr_alignment \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ pixld_a pixblock_size, dst_r_bpp, \ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R fetch_src_pixblock pixld pixblock_size, mask_bpp, \ (mask_basereg - pixblock_size * mask_bpp / 64), MASK - PF add PF_X, PF_X, #pixblock_size - process_pixblock_head + PF add, PF_X, PF_X, #pixblock_size + \process_pixblock_head cache_preload 0, pixblock_size cache_preload_simple subs W, W, #(pixblock_size * 2) blt 2f 1: - process_pixblock_tail_head + \process_pixblock_tail_head cache_preload_simple subs W, W, #pixblock_size bge 1b 2: - process_pixblock_tail + \process_pixblock_tail pixst_a pixblock_size, dst_w_bpp, \ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W /* Process the remaining trailing pixels in the scanline */ process_trailing_pixels 1, 1, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head advance_to_next_scanline 0b .if regs_shortage pop {r0, r1} .endif - cleanup + \cleanup pop {r4-r12, pc} /* exit */ /* * This is the start of the loop, designed to process images with small width * (less than pixblock_size * 2 pixels). In this case neither pipelining * nor prefetch are used. 
*/ 8: /* Process exactly pixblock_size pixels if needed */ tst W, #pixblock_size beq 1f pixld pixblock_size, dst_r_bpp, \ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R fetch_src_pixblock pixld pixblock_size, mask_bpp, \ (mask_basereg - pixblock_size * mask_bpp / 64), MASK - process_pixblock_head - process_pixblock_tail + \process_pixblock_head + \process_pixblock_tail pixst pixblock_size, dst_w_bpp, \ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 1: /* Process the remaining trailing pixels in the scanline */ process_trailing_pixels 0, 0, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head advance_to_next_scanline 8b 9: .if regs_shortage pop {r0, r1} .endif - cleanup + \cleanup pop {r4-r12, pc} /* exit */ .purgem fetch_src_pixblock .purgem pixld_src .unreq SRC .unreq MASK .unreq DST_R @@ -910,17 +912,17 @@ 9: .unreq DST_STRIDE .unreq MASK_STRIDE .unreq PF_CTL .unreq PF_X .unreq PF_SRC .unreq PF_DST .unreq PF_MASK .unreq DUMMY - .endfunc + pixman_end_asm_function .endm /* * A simplified variant of function generation template for a single * scanline processing (for implementing pixman combine functions) */ .macro generate_composite_function_scanline use_nearest_scaling, \ fname, \ @@ -934,49 +936,49 @@ 9: process_pixblock_head, \ process_pixblock_tail, \ process_pixblock_tail_head, \ dst_w_basereg_ = 28, \ dst_r_basereg_ = 4, \ src_basereg_ = 0, \ mask_basereg_ = 24 - pixman_asm_function fname + pixman_asm_function \fname .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE /* * Make some macro arguments globally visible and accessible * from other macros */ - .set src_bpp, src_bpp_ - .set mask_bpp, mask_bpp_ - .set dst_w_bpp, dst_w_bpp_ - .set pixblock_size, pixblock_size_ - .set dst_w_basereg, dst_w_basereg_ - .set dst_r_basereg, dst_r_basereg_ - .set src_basereg, src_basereg_ - .set mask_basereg, mask_basereg_ + .set src_bpp, \src_bpp_ + .set mask_bpp, \mask_bpp_ + .set dst_w_bpp, \dst_w_bpp_ + .set pixblock_size, \pixblock_size_ + .set dst_w_basereg, \dst_w_basereg_ + .set dst_r_basereg, \dst_r_basereg_ + .set src_basereg, \src_basereg_ + .set mask_basereg, \mask_basereg_ -.if use_nearest_scaling != 0 +.if \use_nearest_scaling != 0 /* * Assign symbolic names to registers for nearest scaling */ W .req r0 DST_W .req r1 SRC .req r2 VX .req r3 UNIT_X .req ip MASK .req lr TMP1 .req r4 TMP2 .req r5 DST_R .req r6 SRC_WIDTH_FIXED .req r7 .macro pixld_src x:vararg - pixld_s x + pixld_s \x .endm ldr UNIT_X, [sp] push {r4-r8, lr} ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] .if mask_bpp != 0 ldr MASK, [sp, #(24 + 8)] .endif @@ -986,89 +988,89 @@ 9: */ W .req r0 /* width (is updated during processing) */ DST_W .req r1 /* destination buffer pointer for writes */ SRC .req r2 /* source buffer pointer */ DST_R .req ip /* destination buffer pointer for reads */ MASK .req r3 /* mask pointer */ .macro pixld_src x:vararg - pixld x + pixld \x .endm .endif -.if (((flags) & FLAG_DST_READWRITE) != 0) +.if (((\flags) & FLAG_DST_READWRITE) != 0) .set dst_r_bpp, dst_w_bpp .else .set dst_r_bpp, 0 .endif -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0) .set DEINTERLEAVE_32BPP_ENABLED, 1 .else .set DEINTERLEAVE_32BPP_ENABLED, 0 .endif .macro fetch_src_pixblock pixld_src pixblock_size, src_bpp, \ (src_basereg - pixblock_size * src_bpp / 64), SRC .endm - init + \init mov DST_R, DST_W cmp W, #pixblock_size blt 8f - ensure_destination_ptr_alignment 
process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + ensure_destination_ptr_alignment \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head subs W, W, #pixblock_size blt 7f /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ pixld_a pixblock_size, dst_r_bpp, \ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R fetch_src_pixblock pixld pixblock_size, mask_bpp, \ (mask_basereg - pixblock_size * mask_bpp / 64), MASK - process_pixblock_head + \process_pixblock_head subs W, W, #pixblock_size blt 2f 1: - process_pixblock_tail_head + \process_pixblock_tail_head subs W, W, #pixblock_size bge 1b 2: - process_pixblock_tail + \process_pixblock_tail pixst_a pixblock_size, dst_w_bpp, \ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 7: /* Process the remaining trailing pixels in the scanline (dst aligned) */ process_trailing_pixels 0, 1, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head - cleanup -.if use_nearest_scaling != 0 + \cleanup +.if \use_nearest_scaling != 0 pop {r4-r8, pc} /* exit */ .else bx lr /* exit */ .endif 8: /* Process the remaining trailing pixels in the scanline (dst unaligned) */ process_trailing_pixels 0, 0, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head + \process_pixblock_head, \ + \process_pixblock_tail, \ + \process_pixblock_tail_head - cleanup + \cleanup -.if use_nearest_scaling != 0 +.if \use_nearest_scaling != 0 pop {r4-r8, pc} /* exit */ .unreq DST_R .unreq SRC .unreq W .unreq VX .unreq UNIT_X .unreq TMP1 @@ -1085,25 +1087,25 @@ 8: .unreq DST_R .unreq DST_W .unreq W .endif .purgem fetch_src_pixblock .purgem pixld_src - .endfunc + pixman_end_asm_function .endm .macro generate_composite_function_single_scanline x:vararg - generate_composite_function_scanline 0, x + generate_composite_function_scanline 0, \x .endm .macro generate_composite_function_nearest_scanline x:vararg - generate_composite_function_scanline 1, x + generate_composite_function_scanline 1, \x .endm /* Default prologue/epilogue, nothing special needs to be done */ .macro default_init .endm .macro default_cleanup @@ -1129,56 +1131,56 @@ 8: * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in) * into a planar a8r8g8b8 format (with a, r, g, b color components * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). * * Warning: the conversion is destructive and the original * value (in) is lost. 
*/ .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b - vshrn.u16 out_r, in, #8 - vshrn.u16 out_g, in, #3 - vsli.u16 in, in, #5 - vmov.u8 out_a, #255 - vsri.u8 out_r, out_r, #5 - vsri.u8 out_g, out_g, #6 - vshrn.u16 out_b, in, #2 + vshrn.u16 \out_r, \in, #8 + vshrn.u16 \out_g, \in, #3 + vsli.u16 \in, \in, #5 + vmov.u8 \out_a, #255 + vsri.u8 \out_r, \out_r, #5 + vsri.u8 \out_g, \out_g, #6 + vshrn.u16 \out_b, \in, #2 .endm .macro convert_0565_to_x888 in, out_r, out_g, out_b - vshrn.u16 out_r, in, #8 - vshrn.u16 out_g, in, #3 - vsli.u16 in, in, #5 - vsri.u8 out_r, out_r, #5 - vsri.u8 out_g, out_g, #6 - vshrn.u16 out_b, in, #2 + vshrn.u16 \out_r, \in, #8 + vshrn.u16 \out_g, \in, #3 + vsli.u16 \in, \in, #5 + vsri.u8 \out_r, \out_r, #5 + vsri.u8 \out_g, \out_g, #6 + vshrn.u16 \out_b, \in, #2 .endm /* * Conversion from planar a8r8g8b8 format (with a, r, g, b color components * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6 * pixels packed in 128-bit register (out). Requires two temporary 128-bit * registers (tmp1, tmp2) */ .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 - vshll.u8 tmp1, in_g, #8 - vshll.u8 out, in_r, #8 - vshll.u8 tmp2, in_b, #8 - vsri.u16 out, tmp1, #5 - vsri.u16 out, tmp2, #11 + vshll.u8 \tmp1, \in_g, #8 + vshll.u8 \out, \in_r, #8 + vshll.u8 \tmp2, \in_b, #8 + vsri.u16 \out, \tmp1, #5 + vsri.u16 \out, \tmp2, #11 .endm /* * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels * returned in (out0, out1) registers pair. Requires one temporary * 64-bit register (tmp). 'out1' and 'in' may overlap, the original * value from 'in' is lost */ .macro convert_four_0565_to_x888_packed in, out0, out1, tmp - vshl.u16 out0, in, #5 /* G top 6 bits */ - vshl.u16 tmp, in, #11 /* B top 5 bits */ - vsri.u16 in, in, #5 /* R is ready in top bits */ - vsri.u16 out0, out0, #6 /* G is ready in top bits */ - vsri.u16 tmp, tmp, #5 /* B is ready in top bits */ - vshr.u16 out1, in, #8 /* R is in place */ - vsri.u16 out0, tmp, #8 /* G & B is in place */ - vzip.u16 out0, out1 /* everything is in place */ + vshl.u16 \out0, \in, #5 /* G top 6 bits */ + vshl.u16 \tmp, \in, #11 /* B top 5 bits */ + vsri.u16 \in, \in, #5 /* R is ready in top bits */ + vsri.u16 \out0, \out0, #6 /* G is ready in top bits */ + vsri.u16 \tmp, \tmp, #5 /* B is ready in top bits */ + vshr.u16 \out1, \in, #8 /* R is in place */ + vsri.u16 \out0, \tmp, #8 /* G & B is in place */ + vzip.u16 \out0, \out1 /* everything is in place */ .endm diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S --- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S @@ -20,16 +20,20 @@ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS * SOFTWARE. 
* * Author: Jeff Muizelaar (jeff@infidigm.net) * */ +#ifdef __clang__ +#define subpls subspl +#endif + /* Prevent the stack from becoming executable */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif .text .arch armv6 .object_arch armv4 @@ -57,100 +61,105 @@ * prefetch_braking_distance - stop prefetching when that many pixels are * remaining before the end of scanline */ .macro generate_nearest_scanline_func fname, bpp_shift, t, \ prefetch_distance, \ prefetch_braking_distance -pixman_asm_function fname +pixman_asm_function \fname W .req r0 DST .req r1 SRC .req r2 VX .req r3 UNIT_X .req ip TMP1 .req r4 TMP2 .req r5 VXMASK .req r6 PF_OFFS .req r7 SRC_WIDTH_FIXED .req r8 ldr UNIT_X, [sp] push {r4, r5, r6, r7, r8, r10} - mvn VXMASK, #((1 << bpp_shift) - 1) + mvn VXMASK, #((1 << \bpp_shift) - 1) ldr SRC_WIDTH_FIXED, [sp, #28] /* define helper macro */ .macro scale_2_pixels - ldr&t TMP1, [SRC, TMP1] - and TMP2, VXMASK, VX, asr #(16 - bpp_shift) + ldr\()\t TMP1, [SRC, TMP1] + and TMP2, VXMASK, VX, asr #(16 - \bpp_shift) adds VX, VX, UNIT_X - str&t TMP1, [DST], #(1 << bpp_shift) + str\()\t TMP1, [DST], #(1 << \bpp_shift) 9: subpls VX, VX, SRC_WIDTH_FIXED bpl 9b - ldr&t TMP2, [SRC, TMP2] - and TMP1, VXMASK, VX, asr #(16 - bpp_shift) + ldr\()\t TMP2, [SRC, TMP2] + and TMP1, VXMASK, VX, asr #(16 - \bpp_shift) adds VX, VX, UNIT_X - str&t TMP2, [DST], #(1 << bpp_shift) + str\()\t TMP2, [DST], #(1 << \bpp_shift) 9: subpls VX, VX, SRC_WIDTH_FIXED bpl 9b .endm /* now do the scaling */ - and TMP1, VXMASK, VX, asr #(16 - bpp_shift) + and TMP1, VXMASK, VX, asr #(16 - \bpp_shift) adds VX, VX, UNIT_X 9: subpls VX, VX, SRC_WIDTH_FIXED bpl 9b - subs W, W, #(8 + prefetch_braking_distance) + subs W, W, #(8 + \prefetch_braking_distance) blt 2f /* calculate prefetch offset */ - mov PF_OFFS, #prefetch_distance + mov PF_OFFS, #\prefetch_distance mla PF_OFFS, UNIT_X, PF_OFFS, VX 1: /* main loop, process 8 pixels per iteration with prefetch */ - pld [SRC, PF_OFFS, asr #(16 - bpp_shift)] + pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)] add PF_OFFS, UNIT_X, lsl #3 scale_2_pixels scale_2_pixels scale_2_pixels scale_2_pixels subs W, W, #8 bge 1b 2: - subs W, W, #(4 - 8 - prefetch_braking_distance) + subs W, W, #(4 - 8 - \prefetch_braking_distance) blt 2f 1: /* process the remaining pixels */ scale_2_pixels scale_2_pixels subs W, W, #4 bge 1b 2: tst W, #2 beq 2f scale_2_pixels 2: tst W, #1 - ldrne&t TMP1, [SRC, TMP1] - strne&t TMP1, [DST] +#ifdef __clang__ + ldr\()\t\()ne TMP1, [SRC, TMP1] + str\()\t\()ne TMP1, [DST] +#else + ldrne\()\t TMP1, [SRC, TMP1] + strne\()\t TMP1, [DST] +#endif /* cleanup helper macro */ .purgem scale_2_pixels .unreq DST .unreq SRC .unreq W .unreq VX .unreq UNIT_X .unreq TMP1 .unreq TMP2 .unreq VXMASK .unreq PF_OFFS .unreq SRC_WIDTH_FIXED /* return */ pop {r4, r5, r6, r7, r8, r10} bx lr -.endfunc +pixman_end_asm_function .endm generate_nearest_scanline_func \ pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 generate_nearest_scanline_func \ pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32 diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S --- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S @@ -20,16 +20,21 @@ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS * SOFTWARE. 
* * Author: Ben Avison (bavison@riscosopen.org) * */ +#ifdef __clang__ +#define adceqs adcseq +#define ldmnedb ldmdbne +#endif + /* Prevent the stack from becoming executable */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif .text .arch armv6 .object_arch armv4 @@ -52,26 +57,26 @@ * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output */ .macro blit_init line_saved_regs STRIDE_D, STRIDE_S .endm .macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - pixld cond, numbytes, firstreg, SRC, unaligned_src + pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src .endm .macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment WK4 .req STRIDE_D WK5 .req STRIDE_S WK6 .req MASK WK7 .req STRIDE_M -110: pixld , 16, 0, SRC, unaligned_src - pixld , 16, 4, SRC, unaligned_src +110: pixld , 16, 0, SRC, \unaligned_src + pixld , 16, 4, SRC, \unaligned_src pld [SRC, SCRATCH] pixst , 16, 0, DST pixst , 16, 4, DST subs X, X, #32*8/src_bpp bhs 110b .unreq WK4 .unreq WK5 .unreq WK6 @@ -137,17 +142,17 @@ generate_composite_function \ mov STRIDE_M, SRC .endm .macro fill_process_tail cond, numbytes, firstreg WK4 .req SRC WK5 .req STRIDE_S WK6 .req MASK WK7 .req STRIDE_M - pixst cond, numbytes, 4, DST + pixst \cond, \numbytes, 4, DST .unreq WK4 .unreq WK5 .unreq WK6 .unreq WK7 .endm generate_composite_function \ pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \ @@ -177,30 +182,30 @@ generate_composite_function \ nop_macro, /* newline */ \ nop_macro /* cleanup */ \ nop_macro /* process head */ \ fill_process_tail /******************************************************************************/ .macro src_x888_8888_pixel, cond, reg - orr&cond WK®, WK®, #0xFF000000 + orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000 .endm .macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - pixld cond, numbytes, firstreg, SRC, unaligned_src + pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src .endm .macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg - src_x888_8888_pixel cond, %(firstreg+0) - .if numbytes >= 8 - src_x888_8888_pixel cond, %(firstreg+1) - .if numbytes == 16 - src_x888_8888_pixel cond, %(firstreg+2) - src_x888_8888_pixel cond, %(firstreg+3) + src_x888_8888_pixel \cond, %(\firstreg+0) + .if \numbytes >= 8 + src_x888_8888_pixel \cond, %(\firstreg+1) + .if \numbytes == 16 + src_x888_8888_pixel \cond, %(\firstreg+2) + src_x888_8888_pixel \cond, %(\firstreg+3) .endif .endif .endm generate_composite_function \ pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ 3, /* prefetch distance */ \ @@ -217,83 +222,83 @@ generate_composite_function \ ldr MASK, =0x07E007E0 mov STRIDE_M, #0xFF000000 /* Set GE[3:0] to 1010 so SEL instructions do what we want */ ldr SCRATCH, =0x80008000 uadd8 SCRATCH, SCRATCH, SCRATCH .endm .macro src_0565_8888_2pixels, reg1, reg2 - and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000 - bic WK®2, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb - orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg - mov WK®1, WK®2, lsl #16 @ rrrrr000000bbbbb0000000000000000 - mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG - bic WK®2, WK®2, WK®1, lsr #16 @ RRRRR000000BBBBB0000000000000000 - orr WK®1, WK®1, WK®1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000 - orr WK®2, 
WK®2, WK®2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000 - pkhtb WK®1, WK®1, WK®1, asr #5 @ rrrrrrrr--------bbbbbbbb-------- - sel WK®1, WK®1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb-------- - mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg - pkhtb WK®2, WK®2, WK®2, asr #5 @ RRRRRRRR--------BBBBBBBB-------- - sel WK®2, WK®2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB-------- - orr WK®1, STRIDE_M, WK®1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb - orr WK®2, STRIDE_M, WK®2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB + and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000 + bic WK\()\reg2, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg + mov WK\()\reg1, WK\()\reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000 + mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG + bic WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000 + orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000 + orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000 + pkhtb WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5 @ rrrrrrrr--------bbbbbbbb-------- + sel WK\()\reg1, WK\()\reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb-------- + mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg + pkhtb WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5 @ RRRRRRRR--------BBBBBBBB-------- + sel WK\()\reg2, WK\()\reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB-------- + orr WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb + orr WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB .endm /* This version doesn't need STRIDE_M, but is one instruction longer. It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case? 
- and SCRATCH, WK®1, MASK @ 00000GGGGGG0000000000gggggg00000 - bic WK®1, WK®1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb - orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg - mov WK®2, WK®1, lsr #16 @ 0000000000000000RRRRR000000BBBBB - mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000 - bic WK®1, WK®1, WK®2, lsl #16 @ 0000000000000000rrrrr000000bbbbb - mov WK®2, WK®2, lsl #3 @ 0000000000000RRRRR000000BBBBB000 - mov WK®1, WK®1, lsl #3 @ 0000000000000rrrrr000000bbbbb000 - orr WK®2, WK®2, WK®2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB - orr WK®1, WK®1, WK®1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb - pkhbt WK®2, WK®2, WK®2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB - pkhbt WK®1, WK®1, WK®1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb - sel WK®2, SCRATCH, WK®2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB - sel WK®1, SCRATCH, WK®1 @ --------rrrrrrrrggggggggbbbbbbbb - orr WK®2, WK®2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB - orr WK®1, WK®1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb + and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000 + bic WK\()\reg1, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg + mov WK\()\reg2, WK\()\reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB + mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000 + bic WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb + mov WK\()\reg2, WK\()\reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000 + mov WK\()\reg1, WK\()\reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000 + orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB + orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb + pkhbt WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB + pkhbt WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb + sel WK\()\reg2, SCRATCH, WK\()\reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB + sel WK\()\reg1, SCRATCH, WK\()\reg1 @ --------rrrrrrrrggggggggbbbbbbbb + orr WK\()\reg2, WK\()\reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB + orr WK\()\reg1, WK\()\reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb */ .macro src_0565_8888_1pixel, reg - bic SCRATCH, WK®, MASK @ 0000000000000000rrrrr000000bbbbb - and WK®, WK®, MASK @ 000000000000000000000gggggg00000 - mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000 - mov WK®, WK®, lsl #5 @ 0000000000000000gggggg0000000000 - orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb - orr WK®, WK®, WK®, lsr #6 @ 000000000000000gggggggggggg00000 - pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb - sel WK®, WK®, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb - orr WK®, WK®, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb + bic SCRATCH, WK\()\reg, MASK @ 0000000000000000rrrrr000000bbbbb + and WK\()\reg, WK\()\reg, MASK @ 000000000000000000000gggggg00000 + mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000 + mov WK\()\reg, WK\()\reg, lsl #5 @ 0000000000000000gggggg0000000000 + orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb + orr WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000 + pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb + sel WK\()\reg, WK\()\reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb + orr WK\()\reg, WK\()\reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb .endm .macro src_0565_8888_process_head 
cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - .if numbytes == 16 - pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src - .elseif numbytes == 8 - pixld , 4, firstreg, SRC, unaligned_src - .elseif numbytes == 4 - pixld , 2, firstreg, SRC, unaligned_src + .if \numbytes == 16 + pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src + .elseif \numbytes == 8 + pixld , 4, \firstreg, SRC, \unaligned_src + .elseif \numbytes == 4 + pixld , 2, \firstreg, SRC, \unaligned_src .endif .endm .macro src_0565_8888_process_tail cond, numbytes, firstreg - .if numbytes == 16 - src_0565_8888_2pixels firstreg, %(firstreg+1) - src_0565_8888_2pixels %(firstreg+2), %(firstreg+3) - .elseif numbytes == 8 - src_0565_8888_2pixels firstreg, %(firstreg+1) + .if \numbytes == 16 + src_0565_8888_2pixels \firstreg, %(\firstreg+1) + src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3) + .elseif \numbytes == 8 + src_0565_8888_2pixels \firstreg, %(\firstreg+1) .else - src_0565_8888_1pixel firstreg + src_0565_8888_1pixel \firstreg .endif .endm generate_composite_function \ pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \ FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ 3, /* prefetch distance */ \ src_0565_8888_init, \ @@ -306,67 +311,67 @@ generate_composite_function \ .macro src_x888_0565_init /* Hold loop invariant in MASK */ ldr MASK, =0x001F001F line_saved_regs STRIDE_S, ORIG_W .endm .macro src_x888_0565_1pixel s, d - and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb - and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000 - orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb - orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb + and WK\()\d, MASK, WK\()\s, lsr #3 @ 00000000000rrrrr00000000000bbbbb + and STRIDE_S, WK\()\s, #0xFC00 @ 0000000000000000gggggg0000000000 + orr WK\()\d, WK\()\d, WK\()\d, lsr #5 @ 00000000000-----rrrrr000000bbbbb + orr WK\()\d, WK\()\d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb /* Top 16 bits are discarded during the following STRH */ .endm .macro src_x888_0565_2pixels slo, shi, d, tmp - and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000 - and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB - and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb - orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB - orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB - and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000 - orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb - orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb - pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb + and SCRATCH, WK\()\shi, #0xFC00 @ 0000000000000000GGGGGG0000000000 + and WK\()\tmp, MASK, WK\()\shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB + and WK\()\shi, MASK, WK\()\slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb + orr WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB + orr WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB + and SCRATCH, WK\()\slo, #0xFC00 @ 0000000000000000gggggg0000000000 + orr WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb + orr WK\()\shi, WK\()\shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb + pkhbt WK\()\d, WK\()\shi, WK\()\tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb .endm .macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload WK4 .req 
STRIDE_S WK5 .req STRIDE_M WK6 .req WK3 WK7 .req ORIG_W - .if numbytes == 16 + .if \numbytes == 16 pixld , 16, 4, SRC, 0 src_x888_0565_2pixels 4, 5, 0, 0 pixld , 8, 4, SRC, 0 src_x888_0565_2pixels 6, 7, 1, 1 pixld , 8, 6, SRC, 0 .else - pixld , numbytes*2, 4, SRC, 0 + pixld , \numbytes*2, 4, SRC, 0 .endif .endm .macro src_x888_0565_process_tail cond, numbytes, firstreg - .if numbytes == 16 + .if \numbytes == 16 src_x888_0565_2pixels 4, 5, 2, 2 src_x888_0565_2pixels 6, 7, 3, 4 - .elseif numbytes == 8 + .elseif \numbytes == 8 src_x888_0565_2pixels 4, 5, 1, 1 src_x888_0565_2pixels 6, 7, 2, 2 - .elseif numbytes == 4 + .elseif \numbytes == 4 src_x888_0565_2pixels 4, 5, 1, 1 .else src_x888_0565_1pixel 4, 1 .endif - .if numbytes == 16 - pixst , numbytes, 0, DST + .if \numbytes == 16 + pixst , \numbytes, 0, DST .else - pixst , numbytes, 1, DST + pixst , \numbytes, 1, DST .endif .unreq WK4 .unreq WK5 .unreq WK6 .unreq WK7 .endm generate_composite_function \ @@ -377,47 +382,47 @@ generate_composite_function \ nop_macro, /* newline */ \ nop_macro, /* cleanup */ \ src_x888_0565_process_head, \ src_x888_0565_process_tail /******************************************************************************/ .macro add_8_8_8pixels cond, dst1, dst2 - uqadd8&cond WK&dst1, WK&dst1, MASK - uqadd8&cond WK&dst2, WK&dst2, STRIDE_M + uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK + uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M .endm .macro add_8_8_4pixels cond, dst - uqadd8&cond WK&dst, WK&dst, MASK + uqadd8\()\cond WK\()\dst, WK\()\dst, MASK .endm .macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload WK4 .req MASK WK5 .req STRIDE_M - .if numbytes == 16 - pixld cond, 8, 4, SRC, unaligned_src - pixld cond, 16, firstreg, DST, 0 - add_8_8_8pixels cond, firstreg, %(firstreg+1) - pixld cond, 8, 4, SRC, unaligned_src + .if \numbytes == 16 + pixld \cond, 8, 4, SRC, \unaligned_src + pixld \cond, 16, \firstreg, DST, 0 + add_8_8_8pixels \cond, \firstreg, %(\firstreg+1) + pixld \cond, 8, 4, SRC, \unaligned_src .else - pixld cond, numbytes, 4, SRC, unaligned_src - pixld cond, numbytes, firstreg, DST, 0 + pixld \cond, \numbytes, 4, SRC, \unaligned_src + pixld \cond, \numbytes, \firstreg, DST, 0 .endif .unreq WK4 .unreq WK5 .endm .macro add_8_8_process_tail cond, numbytes, firstreg - .if numbytes == 16 - add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3) - .elseif numbytes == 8 - add_8_8_8pixels cond, firstreg, %(firstreg+1) + .if \numbytes == 16 + add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3) + .elseif \numbytes == 8 + add_8_8_8pixels \cond, \firstreg, %(\firstreg+1) .else - add_8_8_4pixels cond, firstreg + add_8_8_4pixels \cond, \firstreg .endif .endm generate_composite_function \ pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \ 2, /* prefetch distance */ \ nop_macro, /* init */ \ @@ -436,82 +441,82 @@ generate_composite_function \ line_saved_regs STRIDE_D, STRIDE_S, ORIG_W .endm .macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload WK4 .req STRIDE_D WK5 .req STRIDE_S WK6 .req STRIDE_M WK7 .req ORIG_W - pixld , numbytes, %(4+firstreg), SRC, unaligned_src - pixld , numbytes, firstreg, DST, 0 + pixld , \numbytes, %(4+\firstreg), SRC, \unaligned_src + pixld , \numbytes, \firstreg, DST, 0 .unreq WK4 .unreq WK5 .unreq WK6 .unreq WK7 .endm .macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3 /* Since these colours a premultiplied by alpha, only 0 
indicates transparent (any other colour with 0 in the alpha byte is luminous) */ - teq WK&reg0, #0 - .if numbytes > 4 - teqeq WK&reg1, #0 - .if numbytes > 8 - teqeq WK&reg2, #0 - teqeq WK&reg3, #0 + teq WK\()\reg0, #0 + .if \numbytes > 4 + teqeq WK\()\reg1, #0 + .if \numbytes > 8 + teqeq WK\()\reg2, #0 + teqeq WK\()\reg3, #0 .endif .endif .endm .macro over_8888_8888_prepare next - mov WK&next, WK&next, lsr #24 + mov WK\()\next, WK\()\next, lsr #24 .endm .macro over_8888_8888_1pixel src, dst, offset, next /* src = destination component multiplier */ - rsb WK&src, WK&src, #255 + rsb WK\()\src, WK\()\src, #255 /* Split even/odd bytes of dst into SCRATCH/dst */ - uxtb16 SCRATCH, WK&dst - uxtb16 WK&dst, WK&dst, ror #8 + uxtb16 SCRATCH, WK\()\dst + uxtb16 WK\()\dst, WK\()\dst, ror #8 /* Multiply through, adding 0.5 to the upper byte of result for rounding */ - mla SCRATCH, SCRATCH, WK&src, MASK - mla WK&dst, WK&dst, WK&src, MASK + mla SCRATCH, SCRATCH, WK\()\src, MASK + mla WK\()\dst, WK\()\dst, WK\()\src, MASK /* Where we would have had a stall between the result of the first MLA and the shifter input, * reload the complete source pixel */ - ldr WK&src, [SRC, #offset] + ldr WK\()\src, [SRC, #\offset] /* Multiply by 257/256 to approximate 256/255 */ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 /* In this stall, start processing the next pixel */ - .if offset < -4 - mov WK&next, WK&next, lsr #24 + .if \offset < -4 + mov WK\()\next, WK\()\next, lsr #24 .endif - uxtab16 WK&dst, WK&dst, WK&dst, ror #8 + uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8 /* Recombine even/odd bytes of multiplied destination */ mov SCRATCH, SCRATCH, ror #8 - sel WK&dst, SCRATCH, WK&dst + sel WK\()\dst, SCRATCH, WK\()\dst /* Saturated add of source to multiplied destination */ - uqadd8 WK&dst, WK&dst, WK&src + uqadd8 WK\()\dst, WK\()\dst, WK\()\src .endm .macro over_8888_8888_process_tail cond, numbytes, firstreg WK4 .req STRIDE_D WK5 .req STRIDE_S WK6 .req STRIDE_M WK7 .req ORIG_W - over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg) + over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg) beq 10f - over_8888_8888_prepare %(4+firstreg) - .set PROCESS_REG, firstreg - .set PROCESS_OFF, -numbytes - .rept numbytes / 4 + over_8888_8888_prepare %(4+\firstreg) + .set PROCESS_REG, \firstreg + .set PROCESS_OFF, -\numbytes + .rept \numbytes / 4 over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG) .set PROCESS_REG, PROCESS_REG+1 .set PROCESS_OFF, PROCESS_OFF+4 .endr - pixst , numbytes, firstreg, DST + pixst , \numbytes, \firstreg, DST 10: .unreq WK4 .unreq WK5 .unreq WK6 .unreq WK7 .endm generate_composite_function \ @@ -531,26 +536,26 @@ generate_composite_function \ * word Register containing 4 bytes * byte Register containing byte multiplier (bits 8-31 must be 0) * tmp Scratch register * half Register containing the constant 0x00800080 * GE[3:0] bits must contain 0101 */ .macro mul_8888_8 word, byte, tmp, half /* Split even/odd bytes of word apart */ - uxtb16 tmp, word - uxtb16 word, word, ror #8 + uxtb16 \tmp, \word + uxtb16 \word, \word, ror #8 /* Multiply bytes together with rounding, then by 257/256 */ - mla tmp, tmp, byte, half - mla word, word, byte, half /* 1 stall follows */ - uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */ - uxtab16 word, word, word, ror #8 + mla \tmp, \tmp, \byte, \half + mla \word, \word, \byte, \half /* 1 stall follows */ + uxtab16 \tmp, \tmp, \tmp, ror #8 /* 1 stall follows
*/ + uxtab16 \word, \word, \word, ror #8 /* Recombine bytes */ - mov tmp, tmp, ror #8 - sel word, tmp, word + mov \tmp, \tmp, ror #8 + sel \word, \tmp, \word .endm /******************************************************************************/ .macro over_8888_n_8888_init /* Mask is constant */ ldr MASK, [sp, #ARGS_STACK_OFFSET+8] /* Hold loop invariant in STRIDE_M */ @@ -562,51 +567,51 @@ generate_composite_function \ line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W .endm .macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload WK4 .req Y WK5 .req STRIDE_D WK6 .req STRIDE_S WK7 .req ORIG_W - pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src - pixld , numbytes, firstreg, DST, 0 + pixld , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src + pixld , \numbytes, \firstreg, DST, 0 .unreq WK4 .unreq WK5 .unreq WK6 .unreq WK7 .endm .macro over_8888_n_8888_1pixel src, dst - mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M - sub WK7, WK6, WK&src, lsr #24 - mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M - uqadd8 WK&dst, WK&dst, WK&src + mul_8888_8 WK\()\src, MASK, SCRATCH, STRIDE_M + sub WK7, WK6, WK\()\src, lsr #24 + mul_8888_8 WK\()\dst, WK7, SCRATCH, STRIDE_M + uqadd8 WK\()\dst, WK\()\dst, WK\()\src .endm .macro over_8888_n_8888_process_tail cond, numbytes, firstreg WK4 .req Y WK5 .req STRIDE_D WK6 .req STRIDE_S WK7 .req ORIG_W - over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg) + over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg) beq 10f mov WK6, #255 - .set PROCESS_REG, firstreg - .rept numbytes / 4 - .if numbytes == 16 && PROCESS_REG == 2 + .set PROCESS_REG, \firstreg + .rept \numbytes / 4 + .if \numbytes == 16 && PROCESS_REG == 2 /* We're using WK6 and WK7 as temporaries, so half way through * 4 pixels, reload the second two source pixels but this time * into WK4 and WK5 */ ldmdb SRC, {WK4, WK5} .endif over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG) .set PROCESS_REG, PROCESS_REG+1 .endr - pixst , numbytes, firstreg, DST + pixst , \numbytes, \firstreg, DST 10: .unreq WK4 .unreq WK5 .unreq WK6 .unreq WK7 .endm generate_composite_function \ @@ -637,47 +642,47 @@ generate_composite_function \ ldr STRIDE_D, =0x00800080 b 1f .ltorg 1: .endm .macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload WK4 .req STRIDE_M - pixld , numbytes/4, 4, MASK, unaligned_mask - pixld , numbytes, firstreg, DST, 0 + pixld , \numbytes/4, 4, MASK, \unaligned_mask + pixld , \numbytes, \firstreg, DST, 0 .unreq WK4 .endm .macro over_n_8_8888_1pixel src, dst - uxtb Y, WK4, ror #src*8 + uxtb Y, WK4, ror #\src*8 /* Trailing part of multiplication of source */ mla SCRATCH, STRIDE_S, Y, STRIDE_D mla Y, SRC, Y, STRIDE_D mov ORIG_W, #255 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 uxtab16 Y, Y, Y, ror #8 mov SCRATCH, SCRATCH, ror #8 sub ORIG_W, ORIG_W, Y, lsr #24 sel Y, SCRATCH, Y /* Then multiply the destination */ - mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D - uqadd8 WK&dst, WK&dst, Y + mul_8888_8 WK\()\dst, ORIG_W, SCRATCH, STRIDE_D + uqadd8 WK\()\dst, WK\()\dst, Y .endm .macro over_n_8_8888_process_tail cond, numbytes, firstreg WK4 .req STRIDE_M teq WK4, #0 beq 10f - .set PROCESS_REG, firstreg - .rept numbytes / 4 - over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG) + .set PROCESS_REG, \firstreg + .rept \numbytes / 4 + over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG) .set PROCESS_REG, 
PROCESS_REG+1 .endr - pixst , numbytes, firstreg, DST + pixst , \numbytes, \firstreg, DST 10: .unreq WK4 .endm generate_composite_function \ pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \ 2, /* prefetch distance */ \ @@ -700,64 +705,64 @@ generate_composite_function \ line_saved_regs STRIDE_D, ORIG_W .endm .macro over_reverse_n_8888_newline mov STRIDE_D, #0xFF .endm .macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - pixld , numbytes, firstreg, DST, 0 + pixld , \numbytes, \firstreg, DST, 0 .endm .macro over_reverse_n_8888_1pixel d, is_only - teq WK&d, #0 + teq WK\()\d, #0 beq 8f /* replace with source */ - bics ORIG_W, STRIDE_D, WK&d, lsr #24 - .if is_only == 1 + bics ORIG_W, STRIDE_D, WK\()\d, lsr #24 + .if \is_only == 1 beq 49f /* skip store */ .else beq 9f /* write same value back */ .endif mla SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */ mla ORIG_W, STRIDE_M, ORIG_W, MASK /* alpha/green */ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8 mov SCRATCH, SCRATCH, ror #8 sel ORIG_W, SCRATCH, ORIG_W - uqadd8 WK&d, WK&d, ORIG_W + uqadd8 WK\()\d, WK\()\d, ORIG_W b 9f -8: mov WK&d, SRC +8: mov WK\()\d, SRC 9: .endm .macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4 - .if numbytes == 4 - over_reverse_n_8888_1pixel reg1, 1 + .if \numbytes == 4 + over_reverse_n_8888_1pixel \reg1, 1 .else - and SCRATCH, WK®1, WK®2 - .if numbytes == 16 - and SCRATCH, SCRATCH, WK®3 - and SCRATCH, SCRATCH, WK®4 + and SCRATCH, WK\()\reg1, WK\()\reg2 + .if \numbytes == 16 + and SCRATCH, SCRATCH, WK\()\reg3 + and SCRATCH, SCRATCH, WK\()\reg4 .endif mvns SCRATCH, SCRATCH, asr #24 beq 49f /* skip store if all opaque */ - over_reverse_n_8888_1pixel reg1, 0 - over_reverse_n_8888_1pixel reg2, 0 - .if numbytes == 16 - over_reverse_n_8888_1pixel reg3, 0 - over_reverse_n_8888_1pixel reg4, 0 + over_reverse_n_8888_1pixel \reg1, 0 + over_reverse_n_8888_1pixel \reg2, 0 + .if \numbytes == 16 + over_reverse_n_8888_1pixel \reg3, 0 + over_reverse_n_8888_1pixel \reg4, 0 .endif .endif - pixst , numbytes, reg1, DST + pixst , \numbytes, \reg1, DST 49: .endm .macro over_reverse_n_8888_process_tail cond, numbytes, firstreg - over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3) + over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3) .endm generate_composite_function \ pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \ 3, /* prefetch distance */ \ over_reverse_n_8888_init, \ over_reverse_n_8888_newline, \ @@ -789,30 +794,30 @@ generate_composite_function \ .unreq TMP1 .unreq TMP2 .unreq TMP3 .unreq WK4 .endm .macro over_white_8888_8888_ca_combine m, d uxtb16 TMP1, TMP0 /* rb_notmask */ - uxtb16 TMP2, d /* rb_dest; 1 stall follows */ + uxtb16 TMP2, \d /* rb_dest; 1 stall follows */ smlatt TMP3, TMP2, TMP1, HALF /* red */ smlabb TMP2, TMP2, TMP1, HALF /* blue */ uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */ - uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */ - smlatt d, TMP1, TMP0, HALF /* alpha */ + uxtb16 TMP1, \d, ror #8 /* ag_dest; 1 stall follows */ + smlatt \d, TMP1, TMP0, HALF /* alpha */ smlabb TMP1, TMP1, TMP0, HALF /* green */ pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */ 
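For reference, the byte arithmetic that mul_8888_8, over_8888_8888_1pixel and the over_reverse/over_white variants above implement (split even and odd bytes with UXTB16, MLA against the 0x00800080 rounding constant, apply the UXTAB16 "multiply by 257/256 to approximate 256/255" correction, recombine with SEL, saturate with UQADD8) corresponds to the scalar C model below. This is not part of the patch: the helper names are invented, and it processes one byte at a time where the assembly handles two bytes per register half.

#include <stdint.h>
#include <stdio.h>

/* One byte of the SIMD-within-register sequence: t = c*m + 0x80 (the
 * 0x00800080 "half" constant adds 0.5 to the upper byte of each product),
 * then t += t >> 8 (the UXTAB16 ..., ror #8 step, i.e. multiply by 257/256
 * to approximate 256/255), and the high byte of t is the result. */
static uint8_t mul_div_255(uint8_t c, uint8_t m)
{
    uint32_t t = (uint32_t)c * m + 0x80;
    return (uint8_t)((t + (t >> 8)) >> 8);
}

/* Premultiplied-alpha OVER for one a8r8g8b8 pixel, as in
 * over_8888_8888_1pixel: dst = src + dst * (255 - alpha(src)) / 255,
 * finished with a per-byte saturating add (the UQADD8). */
static uint32_t over_8888_8888(uint32_t src, uint32_t dst)
{
    uint32_t not_a = 255u - (src >> 24);          /* the "rsb ..., #255" step */
    uint32_t out = 0;

    for (int shift = 0; shift < 32; shift += 8) {
        uint32_t s   = (src >> shift) & 0xff;
        uint32_t d   = mul_div_255((dst >> shift) & 0xff, (uint8_t)not_a);
        uint32_t sum = s + d;
        out |= (sum > 255 ? 255 : sum) << shift;  /* saturate each byte */
    }
    return out;
}

int main(void)
{
    /* 50% translucent red (premultiplied) over opaque green */
    printf("%08x\n", (unsigned)over_8888_8888(0x80800000u, 0xff00ff00u));
    return 0;
}

Because the colours are premultiplied, a completely transparent source pixel is the single value 0x00000000, which is why over_8888_8888_check_transparent can TEQ whole source words against zero and branch past the per-pixel arithmetic, and why the reverse variants can skip work when every alpha byte in a block is 0x00 or 0xFF.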
- pkhbt TMP1, TMP1, d, lsl #16 /* ag */ + pkhbt TMP1, TMP1, \d, lsl #16 /* ag */ uxtab16 TMP0, TMP0, TMP0, ror #8 uxtab16 TMP1, TMP1, TMP1, ror #8 mov TMP0, TMP0, ror #8 - sel d, TMP0, TMP1 - uqadd8 d, d, m /* d is a late result */ + sel \d, TMP0, TMP1 + uqadd8 \d, \d, \m /* d is a late result */ .endm .macro over_white_8888_8888_ca_1pixel_head pixld , 4, 1, MASK, 0 pixld , 4, 3, DST, 0 .endm .macro over_white_8888_8888_ca_1pixel_tail @@ -848,29 +853,29 @@ 02: mvn TMP0, WK2 movcs WK4, WK2 b 04f 03: over_white_8888_8888_ca_combine WK2, WK4 04: pixst , 8, 3, DST 05: .endm .macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - .if numbytes == 4 + .if \numbytes == 4 over_white_8888_8888_ca_1pixel_head .else - .if numbytes == 16 + .if \numbytes == 16 over_white_8888_8888_ca_2pixels_head over_white_8888_8888_ca_2pixels_tail .endif over_white_8888_8888_ca_2pixels_head .endif .endm .macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg - .if numbytes == 4 + .if \numbytes == 4 over_white_8888_8888_ca_1pixel_tail .else over_white_8888_8888_ca_2pixels_tail .endif .endm generate_composite_function \ pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \ @@ -999,33 +1004,33 @@ 20: /* No simplifications possible - uqadd8 WK0, WK1, WK2 /* followed by 1 stall */ 30: /* The destination buffer is already in the L1 cache, so * there's little point in amalgamating writes */ pixst , 4, 0, DST 40: .endm .macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - .rept (numbytes / 4) - 1 + .rept (\numbytes / 4) - 1 over_n_8888_8888_ca_1pixel_head over_n_8888_8888_ca_1pixel_tail .endr over_n_8888_8888_ca_1pixel_head .endm .macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg over_n_8888_8888_ca_1pixel_tail .endm pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6 ldr ip, [sp] cmp ip, #-1 beq pixman_composite_over_white_8888_8888_ca_asm_armv6 /* else drop through... 
*/ - .endfunc + pixman_end_asm_function generate_composite_function \ pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \ 2, /* prefetch distance */ \ over_n_8888_8888_ca_init, \ nop_macro, /* newline */ \ over_n_8888_8888_ca_cleanup, \ over_n_8888_8888_ca_process_head, \ @@ -1040,94 +1045,94 @@ generate_composite_function \ uadd8 SCRATCH, MASK, MASK /* Offset the source pointer: we only need the alpha bytes */ add SRC, SRC, #3 line_saved_regs ORIG_W .endm .macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3 ldrb ORIG_W, [SRC], #4 - .if numbytes >= 8 - ldrb WK®1, [SRC], #4 - .if numbytes == 16 - ldrb WK®2, [SRC], #4 - ldrb WK®3, [SRC], #4 + .if \numbytes >= 8 + ldrb WK\()\reg1, [SRC], #4 + .if \numbytes == 16 + ldrb WK\()\reg2, [SRC], #4 + ldrb WK\()\reg3, [SRC], #4 .endif .endif - add DST, DST, #numbytes + add DST, DST, #\numbytes .endm .macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2) + in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2) .endm .macro in_reverse_8888_8888_1pixel s, d, offset, is_only - .if is_only != 1 - movs s, ORIG_W - .if offset != 0 - ldrb ORIG_W, [SRC, #offset] + .if \is_only != 1 + movs \s, ORIG_W + .if \offset != 0 + ldrb ORIG_W, [SRC, #\offset] .endif beq 01f teq STRIDE_M, #0xFF beq 02f .endif - uxtb16 SCRATCH, d /* rb_dest */ - uxtb16 d, d, ror #8 /* ag_dest */ - mla SCRATCH, SCRATCH, s, MASK - mla d, d, s, MASK + uxtb16 SCRATCH, \d /* rb_dest */ + uxtb16 \d, \d, ror #8 /* ag_dest */ + mla SCRATCH, SCRATCH, \s, MASK + mla \d, \d, \s, MASK uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 - uxtab16 d, d, d, ror #8 + uxtab16 \d, \d, \d, ror #8 mov SCRATCH, SCRATCH, ror #8 - sel d, SCRATCH, d + sel \d, SCRATCH, \d b 02f - .if offset == 0 + .if \offset == 0 48: /* Last mov d,#0 of the set - used as part of shortcut for * source values all 0 */ .endif -01: mov d, #0 +01: mov \d, #0 02: .endm .macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4 - .if numbytes == 4 + .if \numbytes == 4 teq ORIG_W, ORIG_W, asr #32 - ldrne WK®1, [DST, #-4] - .elseif numbytes == 8 - teq ORIG_W, WK®1 + ldrne WK\()\reg1, [DST, #-4] + .elseif \numbytes == 8 + teq ORIG_W, WK\()\reg1 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */ - ldmnedb DST, {WK®1-WK®2} + ldmnedb DST, {WK\()\reg1-WK\()\reg2} .else - teq ORIG_W, WK®1 - teqeq ORIG_W, WK®2 - teqeq ORIG_W, WK®3 + teq ORIG_W, WK\()\reg1 + teqeq ORIG_W, WK\()\reg2 + teqeq ORIG_W, WK\()\reg3 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? 
*/ - ldmnedb DST, {WK®1-WK®4} + ldmnedb DST, {WK\()\reg1-WK\()\reg4} .endif cmnne DST, #0 /* clear C if NE */ bcs 49f /* no writes to dest if source all -1 */ beq 48f /* set dest to all 0 if source all 0 */ - .if numbytes == 4 - in_reverse_8888_8888_1pixel ORIG_W, WK®1, 0, 1 - str WK®1, [DST, #-4] - .elseif numbytes == 8 - in_reverse_8888_8888_1pixel STRIDE_M, WK®1, -4, 0 - in_reverse_8888_8888_1pixel STRIDE_M, WK®2, 0, 0 - stmdb DST, {WK®1-WK®2} + .if \numbytes == 4 + in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1 + str WK\()\reg1, [DST, #-4] + .elseif \numbytes == 8 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0 + stmdb DST, {WK\()\reg1-WK\()\reg2} .else - in_reverse_8888_8888_1pixel STRIDE_M, WK®1, -12, 0 - in_reverse_8888_8888_1pixel STRIDE_M, WK®2, -8, 0 - in_reverse_8888_8888_1pixel STRIDE_M, WK®3, -4, 0 - in_reverse_8888_8888_1pixel STRIDE_M, WK®4, 0, 0 - stmdb DST, {WK®1-WK®4} + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0 + stmdb DST, {WK\()\reg1-WK\()\reg4} .endif 49: .endm .macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg - in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3) + in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3) .endm generate_composite_function \ pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \ 2, /* prefetch distance */ \ in_reverse_8888_8888_init, \ nop_macro, /* newline */ \ @@ -1144,31 +1149,31 @@ generate_composite_function \ /* Hold multiplier for destination in STRIDE_M */ mov STRIDE_M, #255 sub STRIDE_M, STRIDE_M, SRC, lsr #24 /* Set GE[3:0] to 0101 so SEL instructions do what we want */ uadd8 SCRATCH, MASK, MASK .endm .macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - pixld , numbytes, firstreg, DST, 0 + pixld , \numbytes, \firstreg, DST, 0 .endm .macro over_n_8888_1pixel dst - mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK - uqadd8 WK&dst, WK&dst, SRC + mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK + uqadd8 WK\()\dst, WK\()\dst, SRC .endm .macro over_n_8888_process_tail cond, numbytes, firstreg - .set PROCESS_REG, firstreg - .rept numbytes / 4 + .set PROCESS_REG, \firstreg + .rept \numbytes / 4 over_n_8888_1pixel %(PROCESS_REG) .set PROCESS_REG, PROCESS_REG+1 .endr - pixst , numbytes, firstreg, DST + pixst , \numbytes, \firstreg, DST .endm generate_composite_function \ pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \ 2, /* prefetch distance */ \ over_n_8888_init, \ nop_macro, /* newline */ \ diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h --- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h @@ -107,88 +107,120 @@ .set PREFETCH_TYPE_NONE, 0 .set PREFETCH_TYPE_STANDARD, 1 /* * Definitions of macros for load/store of pixel data. 
*/ .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0 - .if numbytes == 16 - .if unaligned == 1 - op&r&cond WK®0, [base], #4 - op&r&cond WK®1, [base], #4 - op&r&cond WK®2, [base], #4 - op&r&cond WK®3, [base], #4 + .if \numbytes == 16 + .if \unaligned == 1 + \op\()r\()\cond WK\()\reg0, [\base], #4 + \op\()r\()\cond WK\()\reg1, [\base], #4 + \op\()r\()\cond WK\()\reg2, [\base], #4 + \op\()r\()\cond WK\()\reg3, [\base], #4 .else - op&m&cond&ia base!, {WK®0,WK®1,WK®2,WK®3} +#ifdef __clang__ + \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3} +#else + \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3} +#endif .endif - .elseif numbytes == 8 - .if unaligned == 1 - op&r&cond WK®0, [base], #4 - op&r&cond WK®1, [base], #4 + .elseif \numbytes == 8 + .if \unaligned == 1 + \op\()r\()\cond WK\()\reg0, [\base], #4 + \op\()r\()\cond WK\()\reg1, [\base], #4 .else - op&m&cond&ia base!, {WK®0,WK®1} +#ifdef __clang__ + \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1} +#else + \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1} +#endif .endif - .elseif numbytes == 4 - op&r&cond WK®0, [base], #4 - .elseif numbytes == 2 - op&r&cond&h WK®0, [base], #2 - .elseif numbytes == 1 - op&r&cond&b WK®0, [base], #1 + .elseif \numbytes == 4 + \op\()r\()\cond WK\()\reg0, [\base], #4 + .elseif \numbytes == 2 +#ifdef __clang__ + \op\()rh\()\cond WK\()\reg0, [\base], #2 +#else + \op\()r\()\cond\()h WK\()\reg0, [\base], #2 +#endif + .elseif \numbytes == 1 +#ifdef __clang__ + \op\()rb\()\cond WK\()\reg0, [\base], #1 +#else + \op\()r\()\cond\()b WK\()\reg0, [\base], #1 +#endif .else - .error "unsupported size: numbytes" + .error "unsupported size: \numbytes" .endif .endm .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base - .if numbytes == 16 - stm&cond&db base, {WK®0,WK®1,WK®2,WK®3} - .elseif numbytes == 8 - stm&cond&db base, {WK®0,WK®1} - .elseif numbytes == 4 - str&cond WK®0, [base, #-4] - .elseif numbytes == 2 - str&cond&h WK®0, [base, #-2] - .elseif numbytes == 1 - str&cond&b WK®0, [base, #-1] + .if \numbytes == 16 +#ifdef __clang__ + stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3} +#else + stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3} +#endif + .elseif \numbytes == 8 +#ifdef __clang__ + stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1} +#else + stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1} +#endif + .elseif \numbytes == 4 + str\()\cond WK\()\reg0, [\base, #-4] + .elseif \numbytes == 2 +#ifdef __clang__ + strh\()\cond WK\()\reg0, [\base, #-2] +#else + str\()\cond\()h WK\()\reg0, [\base, #-2] +#endif + .elseif \numbytes == 1 +#ifdef __clang__ + strb\()\cond WK\()\reg0, [\base, #-1] +#else + str\()\cond\()b WK\()\reg0, [\base, #-1] +#endif .else - .error "unsupported size: numbytes" + .error "unsupported size: \numbytes" .endif .endm .macro pixld cond, numbytes, firstreg, base, unaligned - pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned + pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned .endm .macro pixst cond, numbytes, firstreg, base .if (flags) & FLAG_DST_READWRITE - pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base + pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base .else - pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base + 
pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base .endif .endm .macro PF a, x:vararg .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD) - a x + \a \x .endif .endm .macro preload_leading_step1 bpp, ptr, base /* If the destination is already 16-byte aligned, then we need to preload * between 0 and prefetch_distance (inclusive) cache lines ahead so there * are no gaps when the inner loop starts. */ - .if bpp > 0 - PF bic, ptr, base, #31 + .if \bpp > 0 + PF bic, \ptr, \base, #31 .set OFFSET, 0 .rept prefetch_distance+1 - PF pld, [ptr, #OFFSET] + PF pld, [\ptr, #OFFSET] .set OFFSET, OFFSET+32 .endr .endif .endm .macro preload_leading_step2 bpp, bpp_shift, ptr, base /* However, if the destination is not 16-byte aligned, we may need to * preload more cache lines than that. The question we need to ask is: @@ -196,81 +228,81 @@ * by which the source pointer will be rounded down for preloading, and if * so, by how many cache lines? Effectively, we want to calculate * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp * inner_loop_offset = (src+leading_bytes)&31 * extra_needed = leading_bytes - inner_loop_offset * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only * possible when there are 4 src bytes for every 1 dst byte). */ - .if bpp > 0 - .ifc base,DST + .if \bpp > 0 + .ifc \base,DST /* The test can be simplified further when preloading the destination */ - PF tst, base, #16 + PF tst, \base, #16 PF beq, 61f .else - .if bpp/dst_w_bpp == 4 - PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift + .if \bpp/dst_w_bpp == 4 + PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift PF and, SCRATCH, SCRATCH, #31 - PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift + PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */ PF bcs, 61f PF bpl, 60f PF pld, [ptr, #32*(prefetch_distance+2)] .else - PF mov, SCRATCH, base, lsl #32-5 - PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift - PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift + PF mov, SCRATCH, \base, lsl #32-5 + PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift + PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift PF bls, 61f .endif .endif -60: PF pld, [ptr, #32*(prefetch_distance+1)] +60: PF pld, [\ptr, #32*(prefetch_distance+1)] 61: .endif .endm #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) .macro preload_middle bpp, base, scratch_holds_offset - .if bpp > 0 + .if \bpp > 0 /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ - .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) - .if scratch_holds_offset - PF pld, [base, SCRATCH] + .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp) + .if \scratch_holds_offset + PF pld, [\base, SCRATCH] .else - PF bic, SCRATCH, base, #31 + PF bic, SCRATCH, \base, #31 PF pld, [SCRATCH, #32*prefetch_distance] .endif .endif .endif .endm .macro preload_trailing bpp, bpp_shift, base - .if bpp > 0 - .if bpp*pix_per_block > 256 + .if \bpp > 0 + .if \bpp*pix_per_block > 256 /* Calculations are more complex if more than one fetch per block */ - PF and, WK1, base, #31 - PF add, WK1, WK1, WK0, lsl #bpp_shift - PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) - PF bic, SCRATCH, base, #31 + PF and, WK1, \base, #31 + PF add, WK1, WK1, WK0, lsl #\bpp_shift + PF add, 
WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1) + PF bic, SCRATCH, \base, #31 80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] PF add, SCRATCH, SCRATCH, #32 PF subs, WK1, WK1, #32 PF bhi, 80b .else /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ - PF mov, SCRATCH, base, lsl #32-5 - PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift + PF mov, SCRATCH, \base, lsl #32-5 + PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift PF adceqs, SCRATCH, SCRATCH, #0 /* The instruction above has two effects: ensures Z is only * set if C was clear (so Z indicates that both shifted quantities * were 0), and clears C if Z was set (so C indicates that the sum * of the shifted quantities was greater and not equal to 32) */ PF beq, 82f - PF bic, SCRATCH, base, #31 + PF bic, SCRATCH, \base, #31 PF bcc, 81f PF pld, [SCRATCH, #32*(prefetch_distance+2)] 81: PF pld, [SCRATCH, #32*(prefetch_distance+1)] 82: .endif .endif .endm @@ -283,97 +315,97 @@ 82: * pixels) they cannot possibly straddle more than 2 32-byte cachelines, * meaning there's no need for a loop. * "bpp" - number of bits per pixel in the channel (source, mask or * destination) that's being preloaded, or 0 if this channel is not used * for reading * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) * "base" - base address register of channel to preload (SRC, MASK or DST) */ - .if bpp > 0 - .if narrow_case && (bpp <= dst_w_bpp) + .if \bpp > 0 + .if \narrow_case && (\bpp <= dst_w_bpp) /* In these cases, each line for each channel is in either 1 or 2 cache lines */ - PF bic, WK0, base, #31 + PF bic, WK0, \base, #31 PF pld, [WK0] - PF add, WK1, base, X, LSL #bpp_shift + PF add, WK1, \base, X, LSL #\bpp_shift PF sub, WK1, WK1, #1 PF bic, WK1, WK1, #31 PF cmp, WK1, WK0 PF beq, 90f PF pld, [WK1] 90: .else - PF bic, WK0, base, #31 + PF bic, WK0, \base, #31 PF pld, [WK0] - PF add, WK1, base, X, lsl #bpp_shift + PF add, WK1, \base, X, lsl #\bpp_shift PF sub, WK1, WK1, #1 PF bic, WK1, WK1, #31 PF cmp, WK1, WK0 PF beq, 92f 91: PF add, WK0, WK0, #32 PF cmp, WK0, WK1 PF pld, [WK0] PF bne, 91b 92: .endif .endif .endm .macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx - process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 - .if decrementx - sub&cond X, X, #8*numbytes/dst_w_bpp + \process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0 + .if \decrementx + sub\()\cond X, X, #8*\numbytes/dst_w_bpp .endif - process_tail cond, numbytes, firstreg + \process_tail \cond, \numbytes, \firstreg .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst cond, numbytes, firstreg, DST + pixst \cond, \numbytes, \firstreg, DST .endif .endm .macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx .if (flags) & FLAG_BRANCH_OVER - .ifc cond,mi + .ifc \cond,mi bpl 100f .endif - .ifc cond,cs + .ifc \cond,cs bcc 100f .endif - .ifc cond,ne + .ifc \cond,ne beq 100f .endif - conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx 100: .else - conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, 
\unaligned_mask, \decrementx .endif .endm .macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) /* Can't interleave reads and writes */ - test - conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx + \test + conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx .if (flags) & FLAG_PROCESS_CORRUPTS_PSR - test + \test .endif - conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx + conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx .else /* Can interleave reads and writes for better scheduling */ - test - process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 - process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 - .if decrementx - sub&cond1 X, X, #8*numbytes1/dst_w_bpp - sub&cond2 X, X, #8*numbytes2/dst_w_bpp + \test + \process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0 + \process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0 + .if \decrementx + sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp + sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp .endif - process_tail cond1, numbytes1, firstreg1 - process_tail cond2, numbytes2, firstreg2 - pixst cond1, numbytes1, firstreg1, DST - pixst cond2, numbytes2, firstreg2, DST + \process_tail \cond1, \numbytes1, \firstreg1 + \process_tail \cond2, \numbytes2, \firstreg2 + pixst \cond1, \numbytes1, \firstreg1, DST + pixst \cond2, \numbytes2, \firstreg2, DST .endif .endm .macro test_bits_1_0_ptr .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */ .else @@ -395,22 +427,22 @@ 100: .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 .set DECREMENT_X, 0 sub X, X, WK0, lsr #dst_bpp_shift str X, [sp, #LINE_SAVED_REG_COUNT*4] mov X, WK0 .endif /* Use unaligned loads in all cases for simplicity */ .if dst_w_bpp == 8 - conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X + conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X .elseif dst_w_bpp == 16 test_bits_1_0_ptr - conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X + conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X .endif - conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X + conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 ldr X, [sp, #LINE_SAVED_REG_COUNT*4] .endif .endm .macro test_bits_3_2_pix movs SCRATCH, X, lsl #dst_bpp_shift+32-3 .endm @@ -419,169 +451,169 @@ 100: .if dst_w_bpp == 8 movs SCRATCH, X, lsl #dst_bpp_shift+32-1 .else movs SCRATCH, X, lsr #1 .endif .endm .macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask - conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 + conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 
0 .if dst_w_bpp == 16 test_bits_1_0_pix - conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 + conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0 .elseif dst_w_bpp == 8 - conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 + conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0 .endif .endm .macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment 110: .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ .rept pix_per_block*dst_w_bpp/128 - process_head , 16, 0, unaligned_src, unaligned_mask, 1 + \process_head , 16, 0, \unaligned_src, \unaligned_mask, 1 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) preload_middle src_bpp, SRC, 1 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) preload_middle mask_bpp, MASK, 1 .else preload_middle src_bpp, SRC, 0 preload_middle mask_bpp, MASK, 0 .endif .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that * destination prefetches are 32-byte aligned. It's also the easiest channel to offset * preloads for, to achieve staggered prefetches for multiple channels, because there are * always two STMs per prefetch, so there is always an opposite STM on which to put the * preload. Note, no need to BIC the base register here */ - PF pld, [DST, #32*prefetch_distance - dst_alignment] + PF pld, [DST, #32*prefetch_distance - \dst_alignment] .endif - process_tail , 16, 0 + \process_tail , 16, 0 .if !((flags) & FLAG_PROCESS_DOES_STORE) pixst , 16, 0, DST .endif .set SUBBLOCK, SUBBLOCK+1 .endr subs X, X, #pix_per_block bhs 110b .endm .macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ .if dst_r_bpp > 0 tst DST, #16 bne 111f - process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS + \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS b 112f 111: .endif - process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS + \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS 112: /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) PF and, WK0, X, #pix_per_block-1 .endif preload_trailing src_bpp, src_bpp_shift, SRC preload_trailing mask_bpp, mask_bpp_shift, MASK .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_trailing dst_r_bpp, dst_bpp_shift, DST .endif add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp /* The remainder of the line is handled identically to the medium case */ - medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask + medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask .endm .macro medium_case_inner_loop_and_trailing_pixels process_head, 
process_tail, unused, exit_label, unaligned_src, unaligned_mask 120: - process_head , 16, 0, unaligned_src, unaligned_mask, 0 - process_tail , 16, 0 + \process_head , 16, 0, \unaligned_src, \unaligned_mask, 0 + \process_tail , 16, 0 .if !((flags) & FLAG_PROCESS_DOES_STORE) pixst , 16, 0, DST .endif subs X, X, #128/dst_w_bpp bhs 120b /* Trailing pixels */ tst X, #128/dst_w_bpp - 1 - beq exit_label - trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask + beq \exit_label + trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask .endm .macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask tst X, #16*8/dst_w_bpp - conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 + conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0 /* Trailing pixels */ /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ - trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask + trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask .endm .macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ .if mask_bpp == 8 || mask_bpp == 16 tst MASK, #3 bne 141f .endif .if src_bpp == 8 || src_bpp == 16 tst SRC, #3 bne 140f .endif - action process_head, process_tail, process_inner_loop, exit_label, 0, 0 + \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0 .if src_bpp == 8 || src_bpp == 16 - b exit_label + b \exit_label 140: - action process_head, process_tail, process_inner_loop, exit_label, 1, 0 + \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0 .endif .if mask_bpp == 8 || mask_bpp == 16 - b exit_label + b \exit_label 141: .if src_bpp == 8 || src_bpp == 16 tst SRC, #3 bne 142f .endif - action process_head, process_tail, process_inner_loop, exit_label, 0, 1 + \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1 .if src_bpp == 8 || src_bpp == 16 - b exit_label + b \exit_label 142: - action process_head, process_tail, process_inner_loop, exit_label, 1, 1 + \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1 .endif .endif .endm .macro end_of_line restore_x, vars_spilled, loop_label, last_one - .if vars_spilled + .if \vars_spilled /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? 
*/ /* This is ldmia sp,{} */ .word 0xE89D0000 | LINE_SAVED_REGS .endif subs Y, Y, #1 - .if vars_spilled + .if \vars_spilled .if (LINE_SAVED_REGS) & (1<<1) str Y, [sp] .endif .endif add DST, DST, STRIDE_D .if src_bpp > 0 add SRC, SRC, STRIDE_S .endif .if mask_bpp > 0 add MASK, MASK, STRIDE_M .endif - .if restore_x + .if \restore_x mov X, ORIG_W .endif - bhs loop_label - .ifc "last_one","" - .if vars_spilled + bhs \loop_label + .ifc "\last_one","" + .if \vars_spilled b 197f .else b 198f .endif .else - .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) + .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) b 198f .endif .endif .endm .macro generate_composite_function fname, \ src_bpp_, \ @@ -591,27 +623,27 @@ 142: prefetch_distance_, \ init, \ newline, \ cleanup, \ process_head, \ process_tail, \ process_inner_loop - pixman_asm_function fname + pixman_asm_function \fname /* * Make some macro arguments globally visible and accessible * from other macros */ - .set src_bpp, src_bpp_ - .set mask_bpp, mask_bpp_ - .set dst_w_bpp, dst_w_bpp_ - .set flags, flags_ - .set prefetch_distance, prefetch_distance_ + .set src_bpp, \src_bpp_ + .set mask_bpp, \mask_bpp_ + .set dst_w_bpp, \dst_w_bpp_ + .set flags, \flags_ + .set prefetch_distance, \prefetch_distance_ /* * Select prefetch type for this function. */ .if prefetch_distance == 0 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE .else .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD @@ -727,17 +759,17 @@ 142: .endif #ifdef DEBUG_PARAMS add Y, Y, #1 stmia sp, {r0-r7,pc} sub Y, Y, #1 #endif - init + \init .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 /* Reserve a word in which to store X during leading pixels */ sub sp, sp, #4 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4 .endif @@ -768,47 +800,47 @@ 142: mov ORIG_W, X .if (flags) & FLAG_SPILL_LINE_VARS_WIDE /* This is stmdb sp!,{} */ .word 0xE92D0000 | LINE_SAVED_REGS .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 .endif 151: /* New line */ - newline + \newline preload_leading_step1 src_bpp, WK1, SRC preload_leading_step1 mask_bpp, WK2, MASK .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_leading_step1 dst_r_bpp, WK3, DST .endif ands WK0, DST, #15 beq 154f rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST .endif - leading_15bytes process_head, process_tail + leading_15bytes \process_head, \process_tail 154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) and SCRATCH, SRC, #31 rsb SCRATCH, SCRATCH, #32*prefetch_distance .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) and SCRATCH, MASK, #31 rsb SCRATCH, SCRATCH, #32*prefetch_distance .endif - .ifc "process_inner_loop","" - switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f + .ifc "\process_inner_loop","" + switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f .else - switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, 
process_inner_loop, 157f + switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f .endif 157: /* Check for another line */ end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b .if (flags) & FLAG_SPILL_LINE_VARS_WIDE .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 .endif @@ -820,80 +852,80 @@ 160: /* Medium case */ mov ORIG_W, X .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE /* This is stmdb sp!,{} */ .word 0xE92D0000 | LINE_SAVED_REGS .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 .endif 161: /* New line */ - newline + \newline preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ preload_line 0, mask_bpp, mask_bpp_shift, MASK .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_line 0, dst_r_bpp, dst_bpp_shift, DST .endif sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ ands WK0, DST, #15 beq 164f rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ - leading_15bytes process_head, process_tail + leading_15bytes \process_head, \process_tail 164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ - switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f + switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f 167: /* Check for another line */ end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b .ltorg 170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ .if dst_w_bpp < 32 mov ORIG_W, X .endif .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE /* This is stmdb sp!,{} */ .word 0xE92D0000 | LINE_SAVED_REGS .endif 171: /* New line */ - newline + \newline preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ preload_line 1, mask_bpp, mask_bpp_shift, MASK .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 preload_line 1, dst_r_bpp, dst_bpp_shift, DST .endif .if dst_w_bpp == 8 tst DST, #3 beq 174f 172: subs X, X, #1 blo 177f - process_head , 1, 0, 1, 1, 0 - process_tail , 1, 0 + \process_head , 1, 0, 1, 1, 0 + \process_tail , 1, 0 .if !((flags) & FLAG_PROCESS_DOES_STORE) pixst , 1, 0, DST .endif tst DST, #3 bne 172b .elseif dst_w_bpp == 16 tst DST, #2 beq 174f subs X, X, #1 blo 177f - process_head , 2, 0, 1, 1, 0 - process_tail , 2, 0 + \process_head , 2, 0, 1, 1, 0 + \process_tail , 2, 0 .if !((flags) & FLAG_PROCESS_DOES_STORE) pixst , 2, 0, DST .endif .endif 174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ - switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f + switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f 177: /* Check for another line */ end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 .endif @@ -903,17 +935,17 @@ 197: .endif 198: .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4 .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4 add sp, sp, #4 .endif - cleanup + \cleanup #ifdef DEBUG_PARAMS add sp, sp, #9*4 /* junk the debug copy of arguments */ #endif 
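The leading-edge preload logic patched earlier in this header is easier to follow in scalar form. The sketch below is not part of the patch; it only evaluates the quantities named in the comment above preload_leading_step2 (leading_bytes, inner_loop_offset, extra_needed) and the resulting number of extra 32-byte cache-line preloads, where the 0/1/2 outcome is how I read the bls/bpl/bcs branch structure.

#include <stdint.h>
#include <stdio.h>

/* How many extra cache lines must preload_leading_step2 fetch beyond what
 * preload_leading_step1 already covered, when DST is not 16-byte aligned?
 * src_bpp and dst_bpp are in bits, addresses in bytes. */
static int extra_leading_cachelines(uintptr_t dst, uintptr_t src,
                                    int src_bpp, int dst_bpp)
{
    uintptr_t leading_bytes     = ((0 - dst) & 15) * src_bpp / dst_bpp;
    uintptr_t inner_loop_offset = (src + leading_bytes) & 31;
    intptr_t  extra_needed      = (intptr_t)leading_bytes
                                  - (intptr_t)inner_loop_offset;

    if (extra_needed <= 0)
        return 0;   /* step1's pld sequence already reaches far enough */
    if (extra_needed <= 32)
        return 1;   /* one extra pld, at 32*(prefetch_distance+1) */
    return 2;       /* only possible with 4 source bytes per destination
                       byte; an additional pld at 32*(prefetch_distance+2) */
}

int main(void)
{
    /* e.g. a 32bpp source feeding an 8bpp destination, both mid cache line */
    printf("%d extra cache line(s)\n",
           extra_leading_cachelines(0x1004, 0x2010, 32, 8));
    return 0;
}

When the channel being preloaded is the destination itself, the macro notes that "the test can be simplified further" and reduces all of this to a single tst \base, #16.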
 199:   pop     {r4-r11, pc}  /* exit */

        .ltorg

@@ -927,23 +959,23 @@ 199:
 .unreq  MASK
 .unreq  STRIDE_M
 .unreq  WK0
 .unreq  WK1
 .unreq  WK2
 .unreq  WK3
 .unreq  SCRATCH
 .unreq  ORIG_W
-        .endfunc
+        pixman_end_asm_function
 .endm

 .macro line_saved_regs x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
-        .irp SAVED_REG,x
+        .irp SAVED_REG,\x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
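One more note on the spill and reload of the per-line variables performed by generate_composite_function and end_of_line: because GAS has no DCI-style directive for emitting an instruction whose register list is only known as a computed value (the comment in end_of_line laments this), the code emits raw .word values, 0xE92D0000 (stmdb sp!, {...}) and 0xE89D0000 (ldmia sp, {...}), OR'd with LINE_SAVED_REGS, the register-list bitmask accumulated by line_saved_regs. The C sketch below is not part of the patch and only illustrates that encoding; mapping Y to r1 and STRIDE_D to r3 is an assumption inferred from the 1<<1 and 1<<3 bits, since the .req bindings sit outside this hunk.

#include <stdint.h>
#include <stdio.h>

/* Base opcodes used by the macros (condition AL, Rn = sp):
 *   stmdb sp!, {reglist}  ->  0xE92D0000 | reglist
 *   ldmia sp,  {reglist}  ->  0xE89D0000 | reglist
 * where bit n of reglist selects core register rn. */
#define STMDB_SP_WB 0xE92D0000u
#define LDMIA_SP    0xE89D0000u

int main(void)
{
    /* Assumed aliases: Y -> r1, STRIDE_D -> r3, matching the 1<<1 and 1<<3
     * bits that line_saved_regs sets for them. */
    uint32_t line_saved_regs = (1u << 1) | (1u << 3);

    printf("spill:  .word 0x%08X\n", (unsigned)(STMDB_SP_WB | line_saved_regs));
    printf("reload: .word 0x%08X\n", (unsigned)(LDMIA_SP | line_saved_regs));
    return 0;
}

The reload deliberately uses ldmia sp without writeback, so the same saved copy can be re-read at the start of end_of_line on every line; only the spill uses the writeback form.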