summaryrefslogtreecommitdiffstats
path: root/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
commit36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree105e8c98ddea1c1e4784a60a5a6410fa416be2de /gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
parentInitial commit. (diff)
downloadfirefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S')
-rw-r--r--gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S1275
1 files changed, 1275 insertions, 0 deletions
diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
new file mode 100644
index 0000000000..31d103d1d9
--- /dev/null
+++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
@@ -0,0 +1,1275 @@
+/*
+ * Copyright © 2011 SCore Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ * Author: Taekyun Kim (tkq.kim@samsung.com)
+ */
+
+/*
+ * This file contains scaled bilinear scanline functions implemented
+ * using older siarhei's bilinear macro template.
+ *
+ * << General scanline function procedures >>
+ * 1. bilinear interpolate source pixels
+ * 2. load mask pixels
+ * 3. load destination pixels
+ * 4. duplicate mask to fill whole register
+ * 5. interleave source & destination pixels
+ * 6. apply mask to source pixels
+ * 7. combine source & destination pixels
+ * 8. deinterleave final result
+ * 9. store destination pixels
+ *
+ * All registers with single number (i.e. src0, tmp0) are 64-bits registers.
+ * Registers with double numbers(src01, dst01) are 128-bits registers.
+ * All temp registers can be used freely outside the code block.
+ * Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks.
+ *
+ * Remarks
+ * There can be lots of pipeline stalls inside code block and between code blocks.
+ * Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined (__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+.arch armv8-a
+.altmacro
+.p2align 2
+
+#include "pixman-private.h"
+#include "pixman-arm-asm.h"
+#include "pixman-arma64-neon-asm.h"
+
+/*
+ * Bilinear macros from pixman-arm-neon-asm.S
+ */
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+/*
+ * Load the 2x2 block of 8888 (4 bytes per pixel) source pixels addressed by
+ * the current X.
+ *
+ * reg1 - receives 64 bits (two adjacent pixels) read at TOP + (X >> 16) * 4
+ * reg2 - receives the 64 bits located STRIDE bytes after that address
+ * tmp  - not referenced; keeps the argument list parallel to
+ *        bilinear_load_0565
+ *
+ * STRIDE is computed as BOTTOM - TOP by generate_bilinear_scanline_func, so
+ * the second load comes from the bottom scanline.
+ * Side effects: X is advanced by UX, TMP1/WTMP1 are overwritten.
+ */
+.macro bilinear_load_8888 reg1, reg2, tmp
+ asr WTMP1, X, #16 /* WTMP1 = X >> 16, the source pixel index */
+ add X, X, UX /* step X for the next output pixel */
+ add TMP1, TOP, TMP1, lsl #2 /* index * 4 bytes */
+ ld1 {&reg1&.2s}, [TMP1], STRIDE /* post-increment: TMP1 += STRIDE */
+ ld1 {&reg2&.2s}, [TMP1]
+.endm
+
+/*
+ * Load the 2x2 block of 0565 (2 bytes per pixel) source pixels addressed by
+ * the current X and pass it to convert_four_0565_to_x888_packed (defined
+ * outside the visible part of this file).
+ *
+ * reg1 - handed to the conversion macro as its second argument
+ * reg2 - load target: lane s[0] gets 32 bits read at TOP + (X >> 16) * 2,
+ *        lane s[1] gets the 32 bits STRIDE bytes further on; also handed to
+ *        the conversion macro as its first and third argument
+ * tmp  - handed to the conversion macro as its last argument
+ *
+ * Side effects: X is advanced by UX, TMP1/WTMP1 are overwritten.
+ */
+.macro bilinear_load_0565 reg1, reg2, tmp
+ asr WTMP1, X, #16 /* WTMP1 = X >> 16, the source pixel index */
+ add X, X, UX /* step X for the next output pixel */
+ add TMP1, TOP, TMP1, lsl #1 /* index * 2 bytes */
+ ld1 {&reg2&.s}[0], [TMP1], STRIDE /* post-increment: TMP1 += STRIDE */
+ ld1 {&reg2&.s}[1], [TMP1]
+ convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+
+/*
+ * Load two 2x2 blocks of 8888 pixels and blend each one vertically:
+ *
+ *   acc1 = reg1 * v28 + reg2 * v29   (widened to 16 bits per byte lane)
+ *   acc2 = reg3 * v28 + reg4 * v29
+ *
+ * v28 and v29 are filled from WWT and WWB by generate_bilinear_scanline_func.
+ * tmp1/tmp2 are only forwarded to bilinear_load_8888, which does not
+ * reference them.
+ * Side effects: those of two bilinear_load_8888 expansions (X advanced
+ * twice, TMP1/WTMP1 overwritten).
+ */
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+ bilinear_load_8888 reg1, reg2, tmp1
+ umull &acc1&.8h, &reg1&.8b, v28.8b
+ umlal &acc1&.8h, &reg2&.8b, v29.8b
+ bilinear_load_8888 reg3, reg4, tmp2
+ umull &acc2&.8h, &reg3&.8b, v28.8b
+ umlal &acc2&.8h, &reg4&.8b, v29.8b
+.endm
+
+/*
+ * Four-pixel variant: two back-to-back expansions of
+ * bilinear_load_and_vertical_interpolate_two_8888, the first using the x*
+ * argument set and the second using the y* argument set.
+ *
+ * Note that the two halves of the parameter list are separated by a line
+ * continuation only (there is no comma after xacc2hi).
+ */
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+/*
+ * In-place zip of two operands (presumably mirroring the two-register VZIP
+ * of 32-bit NEON, which this file was ported from - TODO confirm):
+ *
+ *   reg1 <- zip1(reg1, reg2)
+ *   reg2 <- zip2(reg1, reg2)
+ *
+ * reg1/reg2 are complete operands including the arrangement (callers pass
+ * e.g. v0.8b). Clobbers v24, so neither argument may be v24.
+ */
+.macro vzip reg1, reg2
+ zip1 v24.8b, reg1, reg2
+ zip2 reg2, reg1, reg2 /* reg1 still holds its original value here */
+ mov reg1, v24.8b
+.endm
+
+/*
+ * In-place unzip of two operands (presumably mirroring the two-register VUZP
+ * of 32-bit NEON, which this file was ported from - TODO confirm):
+ *
+ *   reg1 <- uzp1(reg1, reg2)
+ *   reg2 <- uzp2(reg1, reg2)
+ *
+ * reg1/reg2 are complete operands including the arrangement (callers pass
+ * e.g. v0.8b). Clobbers v24, so neither argument may be v24.
+ */
+.macro vuzp reg1, reg2
+ uzp1 v24.8b, reg1, reg2
+ uzp2 reg2, reg1, reg2 /* reg1 still holds its original value here */
+ mov reg1, v24.8b
+.endm
+
+/*
+ * Load two 2x2 blocks of 0565 pixels, widen them with convert_0565_to_x888
+ * (defined outside the visible part of this file) and blend vertically:
+ *
+ *   acc1 = reg1 * v28 + reg2 * v29
+ *   acc2 = reg3 * v28 + reg4 * v29
+ *
+ * acc2 doubles as the raw load target before it is overwritten with the
+ * blend result. Its 32-bit lanes are filled as:
+ *   s[0] = first block, first row     s[1] = first block, row + STRIDE
+ *   s[2] = second block, first row    s[3] = second block, row + STRIDE
+ *
+ * acc2lo/acc2hi are not referenced in this body.
+ * Side effects: X advanced twice by UX; TMP1/WTMP1, TMP2/WTMP2 and v24
+ * (through vzip) are overwritten.
+ */
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {&acc2&.s}[0], [TMP1], STRIDE
+ ld1 {&acc2&.s}[2], [TMP2], STRIDE
+ ld1 {&acc2&.s}[1], [TMP1]
+ ld1 {&acc2&.s}[3], [TMP2]
+ convert_0565_to_x888 acc2, reg3, reg2, reg1
+ /* rearrange the converted channel data for the per-pixel blend below */
+ vzip &reg1&.8b, &reg3&.8b
+ vzip &reg2&.8b, &reg4&.8b
+ vzip &reg3&.8b, &reg4&.8b
+ vzip &reg1&.8b, &reg2&.8b
+ umull &acc1&.8h, &reg1&.8b, v28.8b
+ umlal &acc1&.8h, &reg2&.8b, v29.8b
+ umull &acc2&.8h, &reg3&.8b, v28.8b
+ umlal &acc2&.8h, &reg4&.8b, v29.8b
+.endm
+
+/*
+ * Four-pixel 0565 variant. Performs the same steps as two expansions of
+ * bilinear_load_and_vertical_interpolate_two_0565 (x* set first, y* set
+ * second), but with the loads of the second pair and the zips/multiplies of
+ * the first pair interleaved in the instruction stream.
+ *
+ * Results:
+ *   xacc1 = xreg1 * v28 + xreg2 * v29    xacc2 = xreg3 * v28 + xreg4 * v29
+ *   yacc1 = yreg1 * v28 + yreg2 * v29    yacc2 = yreg3 * v28 + yreg4 * v29
+ *
+ * xacc2/yacc2 double as the raw load targets. The *acc2lo/*acc2hi arguments
+ * are not referenced in this body.
+ * Side effects: X advanced four times by UX; TMP1/WTMP1, TMP2/WTMP2 and v24
+ * (through vzip) are overwritten.
+ */
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ /* first pair: addresses and raw loads into xacc2 */
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {&xacc2&.s}[0], [TMP1], STRIDE
+ ld1 {&xacc2&.s}[2], [TMP2], STRIDE
+ ld1 {&xacc2&.s}[1], [TMP1]
+ ld1 {&xacc2&.s}[3], [TMP2]
+ convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+ /* second pair: addresses, then loads interleaved with the first pair's zips */
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #1
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #1
+ ld1 {&yacc2&.s}[0], [TMP1], STRIDE
+ vzip &xreg1&.8b, &xreg3&.8b
+ ld1 {&yacc2&.s}[2], [TMP2], STRIDE
+ vzip &xreg2&.8b, &xreg4&.8b
+ ld1 {&yacc2&.s}[1], [TMP1]
+ vzip &xreg3&.8b, &xreg4&.8b
+ ld1 {&yacc2&.s}[3], [TMP2]
+ vzip &xreg1&.8b, &xreg2&.8b
+ convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+ /* vertical blend of the first pair interleaved with the second pair's zips */
+ umull &xacc1&.8h, &xreg1&.8b, v28.8b
+ vzip &yreg1&.8b, &yreg3&.8b
+ umlal &xacc1&.8h, &xreg2&.8b, v29.8b
+ vzip &yreg2&.8b, &yreg4&.8b
+ umull &xacc2&.8h, &xreg3&.8b, v28.8b
+ vzip &yreg3&.8b, &yreg4&.8b
+ umlal &xacc2&.8h, &xreg4&.8b, v29.8b
+ vzip &yreg1&.8b, &yreg2&.8b
+ /* vertical blend of the second pair */
+ umull &yacc1&.8h, &yreg1&.8b, v28.8b
+ umlal &yacc1&.8h, &yreg2&.8b, v29.8b
+ umull &yacc2&.8h, &yreg3&.8b, v28.8b
+ umlal &yacc2&.8h, &yreg4&.8b, v29.8b
+.endm
+
+/*
+ * Write numpix (1, 2 or 4) finished 8888 pixels to OUT and advance OUT by
+ * numpix * 4 bytes. Pixels are taken from v0 (and v1 for the four-pixel
+ * case). tmp1/tmp2 are not referenced; they keep the argument list parallel
+ * to bilinear_store_0565.
+ */
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 1
+ st1 {v0.s}[0], [OUT], #4
+.elseif numpix == 2
+ st1 {v0.2s}, [OUT], #8
+.elseif numpix == 4
+ st1 {v0.2s, v1.2s}, [OUT], #16
+.else
+ .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+/*
+ * Convert the finished pixels to 0565 and write numpix (1, 2 or 4) of them
+ * to OUT, advancing OUT by numpix * 2 bytes.
+ *
+ * v0-v3 are unzipped in place (clobbering v24 through vuzp) and then handed
+ * to convert_8888_to_0565 (defined outside the visible part of this file),
+ * which leaves the packed result in v1. tmp1/tmp2 are scratch registers for
+ * that conversion.
+ */
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+ vuzp v0.8b, v1.8b
+ vuzp v2.8b, v3.8b
+ vuzp v1.8b, v3.8b
+ vuzp v0.8b, v2.8b
+ convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
+.if numpix == 4
+ st1 {v1.4h}, [OUT], #8
+.elseif numpix == 2
+ st1 {v1.s}[0], [OUT], #4
+.elseif numpix == 1
+ st1 {v1.h}[0], [OUT], #2
+.else
+ .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+
+/*
+ * Macros for loading mask pixels into register 'mask'.
+ * dup must be done in somewhere else.
+ */
+/* Mask format 'x' (no mask): expands to nothing. */
+.macro bilinear_load_mask_x numpix, mask
+.endm
+
+/*
+ * Mask format '8': read numpix (1, 2 or 4) mask bytes from MASK into the
+ * lowest lane of 'mask', advance MASK by the same number of bytes, then
+ * issue a prefetch prefetch_offset bytes past the updated MASK pointer.
+ */
+.macro bilinear_load_mask_8 numpix, mask
+.if numpix == 1
+ ld1 {&mask&.b}[0], [MASK], #1
+.elseif numpix == 2
+ ld1 {&mask&.h}[0], [MASK], #2
+.elseif numpix == 4
+ ld1 {&mask&.s}[0], [MASK], #4
+.else
+ .error bilinear_load_mask_8 numpix is unsupported
+.endif
+ prfm PREFETCH_MODE, [MASK, #prefetch_offset]
+.endm
+
+/*
+ * Dispatcher: expands to bilinear_load_mask_<mask_fmt> (mask_fmt is 'x' or
+ * '8' in this file).
+ */
+.macro bilinear_load_mask mask_fmt, numpix, mask
+ bilinear_load_mask_&mask_fmt numpix, mask
+.endm
+
+
+/*
+ * Macros for loading destination pixels into register 'dst0' and 'dst1'.
+ * Interleave should be done somewhere else.
+ */
+/* Operator 'src', 0565 destination: expands to nothing (no load is emitted). */
+.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
+.endm
+
+/* Operator 'src', 8888 destination: expands to nothing (no load is emitted). */
+.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
+.endm
+
+/*
+ * Read numpix (1, 2 or 4) 8888 destination pixels from OUT without moving
+ * OUT:
+ *   1 pixel  -> lane s[0] of dst0
+ *   2 pixels -> dst0
+ *   4 pixels -> dst0 and dst1
+ * Afterwards the low halves of dst0 and dst1 are packed into dst01 (for the
+ * 1 and 2 pixel cases dst1 was not loaded here, so the upper half of dst01
+ * is whatever dst1 held), and a prefetch is issued prefetch_offset * 4 bytes
+ * ahead of OUT.
+ */
+.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.if numpix == 1
+ ld1 {&dst0&.s}[0], [OUT]
+.elseif numpix == 2
+ ld1 {&dst0&.2s}, [OUT]
+.elseif numpix == 4
+ ld1 {&dst0&.2s, &dst1&.2s}, [OUT]
+.else
+ .error bilinear_load_dst_8888 numpix is unsupported
+.endif
+ mov &dst01&.d[0], &dst0&.d[0]
+ mov &dst01&.d[1], &dst1&.d[0]
+ prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
+.endm
+
+/* Operator 'over', 8888 destination: plain bilinear_load_dst_8888. */
+.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.endm
+
+/* Operator 'add', 8888 destination: plain bilinear_load_dst_8888. */
+.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
+ bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.endm
+
+/*
+ * Dispatcher: expands to bilinear_load_dst_<dst_fmt>_<op>. Only the
+ * combinations 0565_src, 8888_src, 8888_over and 8888_add are defined in
+ * the visible part of this file.
+ */
+.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
+ bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
+.endm
+
+/*
+ * Macros for duplicating partially loaded mask to fill entire register.
+ * We will apply mask to interleaved source pixels, that is
+ * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ * So, we need to duplicate loaded mask into whole register.
+ *
+ * For two pixel case
+ * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ * We can do some optimizations for this including last pixel cases.
+ */
+/* Mask format 'x' (no mask): expands to nothing. */
+.macro bilinear_duplicate_mask_x numpix, mask
+.endm
+
+/*
+ * Mask format '8': replicate the mask bytes that bilinear_load_mask_8 put
+ * into the lowest lane across the low 64 bits of 'mask'. The replicated
+ * unit is as wide as the load was: one byte, one halfword (2 pixels) or one
+ * word (4 pixels).
+ */
+.macro bilinear_duplicate_mask_8 numpix, mask
+.if numpix == 1
+ dup &mask&.8b, &mask&.b[0]
+.elseif numpix == 2
+ dup &mask&.4h, &mask&.h[0]
+.elseif numpix == 4
+ dup &mask&.2s, &mask&.s[0]
+.else
+ .error bilinear_duplicate_mask_8 is unsupported
+.endif
+.endm
+
+/*
+ * Dispatcher: expands to bilinear_duplicate_mask_<mask_fmt> (mask_fmt is 'x'
+ * or '8' in this file).
+ */
+.macro bilinear_duplicate_mask mask_fmt, numpix, mask
+ bilinear_duplicate_mask_&mask_fmt numpix, mask
+.endm
+
+/*
+ * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
+ * Interleave should be done when mask is enabled or operator is 'over'.
+ */
+/*
+ * Unzip the src0/src1 pair and the dst0/dst1 pair twice each, then pack the
+ * low 64 bits of each pair into src01 and dst01 respectively
+ * (d[0] = first register, d[1] = second register).
+ *
+ * Each vuzp expansion clobbers v24. Because the high half of src01/dst01 is
+ * written before the low half, src01 may be the same register as src0 (and
+ * dst01 the same as dst0), which is how the callers in this file use it.
+ */
+.macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01
+ vuzp &src0&.8b, &src1&.8b
+ vuzp &dst0&.8b, &dst1&.8b
+ vuzp &src0&.8b, &src1&.8b
+ vuzp &dst0&.8b, &dst1&.8b
+ mov &src01&.d[1], &src1&.d[0]
+ mov &src01&.d[0], &src0&.d[0]
+ mov &dst01&.d[1], &dst1&.d[0]
+ mov &dst01&.d[0], &dst0&.d[0]
+.endm
+
+/*
+ * Per-(mask format, operator) interleave hooks selected by
+ * bilinear_interleave_src_dst. Only the unmasked 'src' case skips the
+ * interleave; every other combination expands to bilinear_interleave.
+ * numpix is accepted for signature uniformity and is not referenced.
+ */
+
+/* no mask, src: nothing to do */
+.macro bilinear_interleave_src_dst_x_src numpix, src0, src1, src01, dst0, dst1, dst01
+.endm
+
+/* no mask, over */
+.macro bilinear_interleave_src_dst_x_over numpix, src0, src1, src01, dst0, dst1, dst01
+ bilinear_interleave src0, src1, src01, dst0, dst1, dst01
+.endm
+
+/* no mask, add */
+.macro bilinear_interleave_src_dst_x_add numpix, src0, src1, src01, dst0, dst1, dst01
+ bilinear_interleave src0, src1, src01, dst0, dst1, dst01
+.endm
+
+/* 8-bit mask, src */
+.macro bilinear_interleave_src_dst_8_src numpix, src0, src1, src01, dst0, dst1, dst01
+ bilinear_interleave src0, src1, src01, dst0, dst1, dst01
+.endm
+
+/* 8-bit mask, over */
+.macro bilinear_interleave_src_dst_8_over numpix, src0, src1, src01, dst0, dst1, dst01
+ bilinear_interleave src0, src1, src01, dst0, dst1, dst01
+.endm
+
+/* 8-bit mask, add */
+.macro bilinear_interleave_src_dst_8_add numpix, src0, src1, src01, dst0, dst1, dst01
+ bilinear_interleave src0, src1, src01, dst0, dst1, dst01
+.endm
+
+/*
+ * Dispatcher: expands to bilinear_interleave_src_dst_<mask_fmt>_<op> with
+ * the remaining arguments passed through unchanged.
+ */
+.macro bilinear_interleave_src_dst \
+ mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
+
+ bilinear_interleave_src_dst_&mask_fmt&_&op \
+ numpix, src0, src1, src01, dst0, dst1, dst01
+.endm
+
+
+/*
+ * Macros for applying masks to src pixels. (see combine_mask_u() function)
+ * src, dst should be in interleaved form.
+ * mask register should be in form (m0, m1, m2, m3).
+ */
+/* Mask format 'x' (no mask): expands to nothing, the source is left as is. */
+.macro bilinear_apply_mask_to_src_x \
+ numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+.endm
+
+/*
+ * Mask format '8': multiply every byte of src0 and src1 by the matching
+ * byte of 'mask' and scale back to 8 bits. With t = src * mask (16 bits)
+ * the instruction sequence computes
+ *
+ *   src = (t + ((t + 128) >> 8) + 128) >> 8
+ *
+ * which is the usual rounded t / 255. The low halves of src0/src1 are then
+ * packed into src01.
+ *
+ * tmp01, tmp23, tmp45, tmp67 are overwritten. numpix is not referenced.
+ */
+.macro bilinear_apply_mask_to_src_8 \
+ numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+
+ umull &tmp01&.8h, &src0&.8b, &mask&.8b
+ umull &tmp23&.8h, &src1&.8b, &mask&.8b
+ /* bubbles */
+ urshr &tmp45&.8h, &tmp01&.8h, #8
+ urshr &tmp67&.8h, &tmp23&.8h, #8
+ /* bubbles */
+ raddhn &src0&.8b, &tmp45&.8h, &tmp01&.8h
+ raddhn &src1&.8b, &tmp67&.8h, &tmp23&.8h
+ mov &src01&.d[0], &src0&.d[0]
+ mov &src01&.d[1], &src1&.d[0]
+.endm
+
+/*
+ * Dispatcher: expands to bilinear_apply_mask_to_src_<mask_fmt> with the
+ * remaining arguments passed through unchanged.
+ */
+.macro bilinear_apply_mask_to_src \
+ mask_fmt, numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+
+ bilinear_apply_mask_to_src_&mask_fmt \
+ numpix, src0, src1, src01, mask, \
+ tmp01, tmp23, tmp45, tmp67
+.endm
+
+
+/*
+ * Macros for combining src and destination pixels.
+ * Interleave or not is depending on operator 'op'.
+ */
+/* Operator 'src': expands to nothing, the source registers are the result. */
+.macro bilinear_combine_src \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+.endm
+
+/*
+ * Operator 'over' on interleaved data:
+ *
+ *   dst = dst * (255 - alpha) / 255     (rounded, same scheme as the mask)
+ *   src = saturating_add(dst, src)
+ *
+ * alpha is taken from lane s[1] of src1 and replicated, which matches the
+ * (b..., a...) layout of src1 described for the interleaved form above.
+ *
+ * The final result is left in src0/src1 and packed into src01.
+ * dst0, dst1 and dst01 are overwritten with the scaled destination BEFORE
+ * src1 is read by the last uqadd, so dst01 must not be the same register as
+ * src1 (or src0). tmp01, tmp23, tmp45, tmp67 and tmp8 are overwritten.
+ * numpix is not referenced.
+ */
+.macro bilinear_combine_over \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+ dup &tmp8&.2s, &src1&.s[1]
+ /* bubbles */
+ mvn &tmp8&.8b, &tmp8&.8b /* 255 - alpha */
+ /* bubbles */
+ umull &tmp01&.8h, &dst0&.8b, &tmp8&.8b
+ /* bubbles */
+ umull &tmp23&.8h, &dst1&.8b, &tmp8&.8b
+ /* bubbles */
+ urshr &tmp45&.8h, &tmp01&.8h, #8
+ urshr &tmp67&.8h, &tmp23&.8h, #8
+ /* bubbles */
+ raddhn &dst0&.8b, &tmp45&.8h, &tmp01&.8h
+ raddhn &dst1&.8b, &tmp67&.8h, &tmp23&.8h
+ mov &dst01&.d[0], &dst0&.d[0]
+ mov &dst01&.d[1], &dst1&.d[0]
+ /* bubbles */
+ uqadd &src0&.8b, &dst0&.8b, &src0&.8b
+ uqadd &src1&.8b, &dst1&.8b, &src1&.8b
+ mov &src01&.d[0], &src0&.d[0]
+ mov &src01&.d[1], &src1&.d[0]
+.endm
+
+/*
+ * Operator 'add': src = saturating_add(dst, src) per byte. The result is
+ * left in src0/src1 and packed into src01. dst01, the tmp* arguments and
+ * numpix are not referenced.
+ */
+.macro bilinear_combine_add \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+ uqadd &src0&.8b, &dst0&.8b, &src0&.8b
+ uqadd &src1&.8b, &dst1&.8b, &src1&.8b
+ mov &src01&.d[0], &src0&.d[0]
+ mov &src01&.d[1], &src1&.d[0]
+.endm
+
+/*
+ * Dispatcher: expands to bilinear_combine_<op> (src, over or add) with the
+ * remaining arguments passed through unchanged.
+ */
+.macro bilinear_combine \
+ op, numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+
+ bilinear_combine_&op \
+ numpix, src0, src1, src01, dst0, dst1, dst01, \
+ tmp01, tmp23, tmp45, tmp67, tmp8
+.endm
+
+/*
+ * Macros for final deinterleaving of destination pixels if needed.
+ */
+/*
+ * Unzip the dst0/dst1 pair twice (clobbering v24 through vuzp) and pack the
+ * low 64 bits of both registers into dst01. numpix is not referenced.
+ */
+.macro bilinear_deinterleave numpix, dst0, dst1, dst01
+ vuzp &dst0&.8b, &dst1&.8b
+ /* bubbles */
+ vuzp &dst0&.8b, &dst1&.8b
+ mov &dst01&.d[0], &dst0&.d[0]
+ mov &dst01&.d[1], &dst1&.d[0]
+.endm
+
+/*
+ * Per-(mask format, operator) deinterleave hooks selected by
+ * bilinear_deinterleave_dst. They mirror the interleave hooks: only the
+ * unmasked 'src' case is empty, every other combination expands to
+ * bilinear_deinterleave.
+ */
+
+/* no mask, src: nothing was interleaved, so nothing to undo */
+.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
+.endm
+
+/* no mask, over */
+.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
+ bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+/* no mask, add */
+.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
+ bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+/* 8-bit mask, src */
+.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
+ bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+/* 8-bit mask, over */
+.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
+ bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+/* 8-bit mask, add */
+.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
+ bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+/*
+ * Dispatcher: expands to bilinear_deinterleave_dst_<mask_fmt>_<op> with the
+ * remaining arguments passed through unchanged.
+ */
+.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
+ bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
+.endm
+
+
+/*
+ * Produce exactly one output pixel: load, interpolate, mask, combine, store.
+ *
+ * src_fmt  - 8888 or 0565 (selects bilinear_load_<src_fmt>)
+ * mask_fmt - x or 8
+ * dst_fmt  - 8888 or 0565 (selects bilinear_store_<dst_fmt>)
+ * op       - src, over or add
+ *
+ * Expects v28/v29 to hold the vertical weights and v15.h[0] the horizontal
+ * weight. Unlike the two/four pixel macros it does not recompute v15 or
+ * advance v12 itself; the caller does that where needed.
+ *
+ * Horizontal step, with L/R the low/high halves of the vertical blend v2:
+ *   v0 = (L << BILINEAR_INTERPOLATION_BITS) - L * wx + R * wx
+ * followed by a right shift of 2 * BILINEAR_INTERPOLATION_BITS.
+ *
+ * Vector registers written directly here: v0, v2. The sub-macros are handed
+ * v1, v3, v4, v5, v8-v11, v17-v19 and additionally use v24.
+ */
+.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
+ bilinear_load_&src_fmt v0, v1, v2
+ bilinear_load_mask mask_fmt, 1, v4
+ bilinear_load_dst dst_fmt, op, 1, v18, v19, v9
+ /* vertical blend */
+ umull v2.8h, v0.8b, v28.8b
+ umlal v2.8h, v1.8b, v29.8b
+ /* 5 cycles bubble */
+ /* horizontal blend */
+ ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v2.4h, v15.h[0]
+ umlal2 v0.4s, v2.8h, v15.h[0]
+ /* 5 cycles bubble */
+ bilinear_duplicate_mask mask_fmt, 1, v4
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ /* 3 cycles bubble */
+ xtn v0.8b, v0.8h
+ /* 1 cycle bubble */
+ bilinear_interleave_src_dst \
+ mask_fmt, op, 1, v0, v1, v0, v18, v19, v9
+ bilinear_apply_mask_to_src \
+ mask_fmt, 1, v0, v1, v0, v4, \
+ v3, v8, v10, v11
+ bilinear_combine \
+ op, 1, v0, v1, v0, v18, v19, v9, \
+ v3, v8, v10, v11, v5
+ bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0
+ bilinear_store_&dst_fmt 1, v17, v18
+.endm
+
+/*
+ * Produce two output pixels: load, interpolate, mask, combine, store.
+ *
+ * Arguments are the same as for bilinear_interpolate_last_pixel.
+ *
+ * Expects v28/v29 to hold the vertical weights and v15.h[0] / v15.h[4] the
+ * horizontal weights of the first / second pixel. Before returning it
+ * recomputes v15 from v12 and advances v12 by v13, i.e. it leaves the
+ * horizontal weights ready for the next pixel pair.
+ *
+ * Vector registers written directly here: v0, v10, v12, v15. The sub-macros
+ * are handed v1, v3, v4, v5, v8-v11, v16-v23 and additionally use v24.
+ */
+.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
+ bilinear_load_and_vertical_interpolate_two_&src_fmt \
+ v1, v11, v18, v19, v20, v21, v22, v23
+ bilinear_load_mask mask_fmt, 2, v4
+ bilinear_load_dst dst_fmt, op, 2, v18, v19, v9
+ /* horizontal blend of pixel 0 (from v1) and pixel 1 (from v11) */
+ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v1.4h, v15.h[0]
+ umlal2 v0.4s, v1.8h, v15.h[0]
+ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v10.4s, v11.4h, v15.h[4]
+ umlal2 v10.4s, v11.8h, v15.h[4]
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ bilinear_duplicate_mask mask_fmt, 2, v4
+ /* horizontal weights for the next pair */
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ xtn v0.8b, v0.8h
+ bilinear_interleave_src_dst \
+ mask_fmt, op, 2, v0, v1, v0, v18, v19, v9
+ bilinear_apply_mask_to_src \
+ mask_fmt, 2, v0, v1, v0, v4, \
+ v3, v8, v10, v11
+ bilinear_combine \
+ op, 2, v0, v1, v0, v18, v19, v9, \
+ v3, v8, v10, v11, v5
+ bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0
+ bilinear_store_&dst_fmt 2, v16, v17
+.endm
+
+/*
+ * Produce four output pixels: load, interpolate, mask, combine, store.
+ *
+ * src_fmt  - 8888 or 0565
+ * mask_fmt - x or 8
+ * dst_fmt  - 8888 or 0565
+ * op       - src, over or add
+ *
+ * Expects v28/v29 to hold the vertical weights and v15.h[0] / v15.h[4] the
+ * horizontal weights of the first pixel pair. v15 is recomputed from v12
+ * (and v12 advanced by v13) once between the two pairs and once more at the
+ * end, so on exit the weights are ready for the next pair.
+ *
+ * Register use:
+ *   v0, v1    source pixels after narrowing (src0, src1; src01 = v0)
+ *   v2, v3    destination pixels (dst0, dst1)
+ *   v21       packed-destination scratch (dst01)
+ *   v4        mask
+ *   v6, v8, v9, v10, v23   scratch for mask application and combine
+ *
+ * dst01 is v21 for every sub-macro. bilinear_combine_over writes dst01
+ * before its last uqadd reads src1, so dst01 must never alias src1 (v1);
+ * passing v1 there would corrupt the source pixels for the 'over' operator.
+ */
+.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
+ bilinear_load_and_vertical_interpolate_four_&src_fmt \
+ v1, v11, v4, v5, v6, v7, v22, v23 \
+ v3, v9, v16, v17, v20, v21, v18, v19
+ /* TMP1 points at the second source row here; prefetch both rows */
+ prfm PREFETCH_MODE, [TMP1, PF_OFFS]
+ sub TMP1, TMP1, STRIDE
+ prfm PREFETCH_MODE, [TMP1, PF_OFFS]
+ /* horizontal blend of pixels 0 and 1 */
+ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v1.4h, v15.h[0]
+ umlal2 v0.4s, v1.8h, v15.h[0]
+ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v10.4s, v11.4h, v15.h[4]
+ umlal2 v10.4s, v11.8h, v15.h[4]
+ /* horizontal weights for pixels 2 and 3 */
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ /* horizontal blend of pixels 2 and 3 */
+ ushll v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v3.4h, v15.h[0]
+ umlal2 v2.4s, v3.8h, v15.h[0]
+ ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v8.4s, v9.4h, v15.h[4]
+ umlal2 v8.4s, v9.8h, v15.h[4]
+ add v12.8h, v12.8h, v13.8h
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ bilinear_load_mask mask_fmt, 4, v4
+ bilinear_duplicate_mask mask_fmt, 4, v4
+ /* horizontal weights for the next pair */
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ xtn v0.8b, v0.8h
+ xtn v1.8b, v2.8h
+ add v12.8h, v12.8h, v13.8h
+ bilinear_load_dst dst_fmt, op, 4, v2, v3, v21
+ bilinear_interleave_src_dst \
+ mask_fmt, op, 4, v0, v1, v0, v2, v3, v21
+ bilinear_apply_mask_to_src \
+ mask_fmt, 4, v0, v1, v0, v4, \
+ v6, v8, v9, v10
+ bilinear_combine \
+ op, 4, v0, v1, v0, v2, v3, v21, \
+ v6, v8, v9, v10, v23
+ bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0
+ bilinear_store_&dst_fmt 4, v6, v7
+.endm
+
+/*
+ * Bit flags for the 'flags' argument of generate_bilinear_scanline_func.
+ * BILINEAR_FLAG_USE_MASK selects the register layout that includes a MASK
+ * pointer. BILINEAR_FLAG_USE_ALL_NEON_REGS is not tested anywhere in the
+ * visible part of this file.
+ */
+.set BILINEAR_FLAG_USE_MASK, 1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline functions.
+ *
+ * Bilinear scanline generator macro takes the following arguments:
+ * fname - name of the function to generate
+ * src_fmt - source color format (8888 or 0565)
+ * dst_fmt - destination color format (8888 or 0565)
+ * src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes
+ * process_last_pixel - code block that interpolate one pixel and does not
+ * update horizontal weight
+ * process_two_pixels - code block that interpolate two pixels and update
+ * horizontal weight
+ * process_four_pixels - code block that interpolate four pixels and update
+ * horizontal weight
+ * process_pixblock_head - head part of middle loop
+ * process_pixblock_tail - tail part of middle loop
+ * process_pixblock_tail_head - tail_head of middle loop
+ * pixblock_size - number of pixels processed in a single middle loop
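+ * prefetch_distance - prefetch in the source image by that many pixels ahead
+ * flags - bitwise OR of BILINEAR_FLAG_* values; BILINEAR_FLAG_USE_MASK
+ * selects the variant that takes a mask pointer argument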
+ */
+
+/*
+ * generate_bilinear_scanline_func - emit one complete scanline function.
+ *
+ * Incoming registers without BILINEAR_FLAG_USE_MASK:
+ *   x0 OUT, x1 TOP, x2 BOTTOM, w3 WT, w4 WB, w5 X, w6 UX, x7 WIDTH
+ * Incoming registers with BILINEAR_FLAG_USE_MASK:
+ *   x0 OUT, x1 MASK, x2 TOP, x3 BOTTOM, w4 WT, w5 WB, w6 X, w7 UX,
+ *   WIDTH is read as a 32-bit value from the first stack argument into x8
+ *
+ * Scratch: x10-x15 (TMP1-TMP4, PF_OFFS, STRIDE). x8, x10-x15 and the low
+ * halves of v8-v15 are saved in the frame and restored before returning.
+ *
+ * Frame layout relative to x29 (which points at the saved x29/x30 pair):
+ *   [x29 -  64, x29)        v8-v15, low 64 bits each
+ *   [x29 - 112, x29 - 64)   x10-x15
+ *   [x29 - 120, x29 - 112)  x8 (mask variant only)
+ * The frame is allocated before anything is stored into it, so the save
+ * area never lies below sp, and its size keeps sp 16-byte aligned.
+ *
+ * NEON state kept across the pixel blocks:
+ *   v12 running X values, v13 X increment, v15 horizontal weights,
+ *   v28/v29 vertical weights (WT/WB replicated per byte)
+ *
+ * Local labels: 100 skips an alignment step or the main loop, 0 is the main
+ * loop, 500 the pipeline drain, 200 skips a trailing step, 300 is the exit.
+ *
+ * NOTE(review): WIDTH is compared and decremented as a 64-bit register. In
+ * the non-mask variant that assumes the caller passes x7 with meaningful
+ * upper 32 bits - TODO confirm against the C prototype.
+ */
+.macro generate_bilinear_scanline_func \
+ fname, \
+ src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
+ bilinear_process_last_pixel, \
+ bilinear_process_two_pixels, \
+ bilinear_process_four_pixels, \
+ bilinear_process_pixblock_head, \
+ bilinear_process_pixblock_tail, \
+ bilinear_process_pixblock_tail_head, \
+ pixblock_size, \
+ prefetch_distance, \
+ flags
+
+pixman_asm_function fname
+.if pixblock_size == 8
+.elseif pixblock_size == 4
+.else
+ .error unsupported pixblock size
+.endif
+
+ /*
+ * prefetch_offset is used by the mask and destination load macros. The
+ * destination loads are also expanded by unmasked 'over'/'add' functions,
+ * so define it for every generated function, not only the masked ones.
+ */
+ .set prefetch_offset, prefetch_distance
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+ OUT .req x0
+ TOP .req x1
+ BOTTOM .req x2
+ WT .req x3
+ WWT .req w3
+ WB .req x4
+ WWB .req w4
+ X .req w5
+ UX .req w6
+ WIDTH .req x7
+ TMP1 .req x10
+ WTMP1 .req w10
+ TMP2 .req x11
+ WTMP2 .req w11
+ PF_OFFS .req x12
+ TMP3 .req x13
+ WTMP3 .req w13
+ TMP4 .req x14
+ WTMP4 .req w14
+ STRIDE .req x15
+ DUMMY .req x30
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ sub sp, sp, 112
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ /* the two post-increments brought x29 back to the frame record */
+ stp x10, x11, [x29, -80]
+ stp x12, x13, [x29, -96]
+ stp x14, x15, [x29, -112]
+.else
+ OUT .req x0
+ MASK .req x1
+ TOP .req x2
+ BOTTOM .req x3
+ WT .req x4
+ WWT .req w4
+ WB .req x5
+ WWB .req w5
+ X .req w6
+ UX .req w7
+ WIDTH .req x8
+ TMP1 .req x10
+ WTMP1 .req w10
+ TMP2 .req x11
+ WTMP2 .req w11
+ PF_OFFS .req x12
+ TMP3 .req x13
+ WTMP3 .req w13
+ TMP4 .req x14
+ WTMP4 .req w14
+ STRIDE .req x15
+ DUMMY .req x30
+
+ stp x29, x30, [sp, -16]!
+ mov x29, sp
+ /* allocate first: 120 bytes are used, rounded up to keep sp aligned */
+ sub sp, sp, 128
+ sub x29, x29, 64
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ /* the two post-increments brought x29 back to the frame record */
+ stp x10, x11, [x29, -80]
+ stp x12, x13, [x29, -96]
+ stp x14, x15, [x29, -112]
+ str x8, [x29, -120]
+ /* first stack argument of the caller sits just above the frame record */
+ ldr w8, [x29, 16]
+.endif
+
+ mov WTMP1, #prefetch_distance
+ umull PF_OFFS, WTMP1, UX
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
+ cmp WIDTH, #0
+ ble 300f
+
+ dup v12.8h, X
+ dup v13.8h, UX
+ dup v28.8b, WWT
+ dup v29.8b, WWB
+ /* upper four lanes of v12 get X + UX: the second pixel of each pair */
+ mov v25.d[0], v12.d[1]
+ mov v26.d[0], v13.d[0]
+ add v25.4h, v25.4h, v26.4h
+ mov v12.d[1], v25.d[0]
+
+ /* ensure good destination alignment */
+ cmp WIDTH, #1
+ blt 100f
+ tst OUT, #(1 << dst_bpp_shift)
+ beq 100f
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ bilinear_process_last_pixel
+ sub WIDTH, WIDTH, #1
+100:
+ /* from here on v12 advances two pixels at a time */
+ add v13.8h, v13.8h, v13.8h
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+
+ cmp WIDTH, #2
+ blt 100f
+ tst OUT, #(1 << (dst_bpp_shift + 1))
+ beq 100f
+ bilinear_process_two_pixels
+ sub WIDTH, WIDTH, #2
+100:
+.if pixblock_size == 8
+ cmp WIDTH, #4
+ blt 100f
+ tst OUT, #(1 << (dst_bpp_shift + 2))
+ beq 100f
+ bilinear_process_four_pixels
+ sub WIDTH, WIDTH, #4
+100:
+.endif
+ /* main loop: head, (tail_head)*, tail */
+ subs WIDTH, WIDTH, #pixblock_size
+ blt 100f
+ asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
+ bilinear_process_pixblock_head
+ subs WIDTH, WIDTH, #pixblock_size
+ blt 500f
+0:
+ bilinear_process_pixblock_tail_head
+ subs WIDTH, WIDTH, #pixblock_size
+ bge 0b
+500:
+ bilinear_process_pixblock_tail
+100:
+ /*
+ * WIDTH is negative here, but its low bits still encode the number of
+ * leftover pixels because pixblock_size is a power of two.
+ */
+.if pixblock_size == 8
+ tst WIDTH, #4
+ beq 200f
+ bilinear_process_four_pixels
+200:
+.endif
+ /* handle the remaining trailing pixels */
+ tst WIDTH, #2
+ beq 200f
+ bilinear_process_two_pixels
+200:
+ tst WIDTH, #1
+ beq 300f
+ bilinear_process_last_pixel
+300:
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x10, x11, [x29, -80]
+ ldp x12, x13, [x29, -96]
+ ldp x14, x15, [x29, -112]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+.else
+ sub x29, x29, 64
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+ ldp x10, x11, [x29, -80]
+ ldp x12, x13, [x29, -96]
+ ldp x14, x15, [x29, -112]
+ ldr x8, [x29, -120]
+ mov sp, x29
+ ldp x29, x30, [sp], 16
+.endif
+ ret
+
+ /* drop every alias created above (BOTTOM was dropped earlier) */
+ .unreq OUT
+ .unreq TOP
+ .unreq WT
+ .unreq WWT
+ .unreq WB
+ .unreq WWB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq WTMP1
+ .unreq TMP2
+ .unreq WTMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq WTMP3
+ .unreq TMP4
+ .unreq WTMP4
+ .unreq STRIDE
+ .unreq DUMMY
+.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+ .unreq MASK
+.endif
+
+.endfunc
+
+.endm
+
+/*
+ * src_8888_8_8888: 8888 source, 8-bit mask, 8888 destination, operator src.
+ *
+ * The pixel block is four pixels handled by the generic four-pixel macro.
+ * The tail is empty and tail_head is simply tail followed by head, so there
+ * is no overlap between consecutive blocks.
+ */
+.macro bilinear_src_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_head
+ bilinear_src_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
+ bilinear_src_8888_8_8888_process_pixblock_tail
+ bilinear_src_8888_8_8888_process_pixblock_head
+.endm
+
+/*
+ * src_8888_8_0565: 8888 source, 8-bit mask, 0565 destination, operator src.
+ *
+ * The pixel block is four pixels handled by the generic four-pixel macro.
+ * The tail is empty and tail_head is simply tail followed by head, so there
+ * is no overlap between consecutive blocks.
+ */
+.macro bilinear_src_8888_8_0565_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_head
+ bilinear_src_8888_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
+ bilinear_src_8888_8_0565_process_pixblock_tail
+ bilinear_src_8888_8_0565_process_pixblock_head
+.endm
+
+/*
+ * src_0565_8_x888: 0565 source, 8-bit mask, x888 destination (stored with
+ * the 8888 store macro), operator src.
+ *
+ * The pixel block is four pixels handled by the generic four-pixel macro.
+ * The tail is empty and tail_head is simply tail followed by head, so there
+ * is no overlap between consecutive blocks.
+ */
+.macro bilinear_src_0565_8_x888_process_last_pixel
+ bilinear_interpolate_last_pixel 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_two_pixels
+ bilinear_interpolate_two_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_four_pixels
+ bilinear_interpolate_four_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_head
+ bilinear_src_0565_8_x888_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
+ bilinear_src_0565_8_x888_process_pixblock_tail
+ bilinear_src_0565_8_x888_process_pixblock_head
+.endm
+
+/*
+ * src_0565_8_0565: 0565 source, 8-bit mask, 0565 destination, operator src.
+ *
+ * The pixel block is four pixels handled by the generic four-pixel macro.
+ * The tail is empty and tail_head is simply tail followed by head, so there
+ * is no overlap between consecutive blocks.
+ */
+.macro bilinear_src_0565_8_0565_process_last_pixel
+ bilinear_interpolate_last_pixel 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_two_pixels
+ bilinear_interpolate_two_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_four_pixels
+ bilinear_interpolate_four_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_head
+ bilinear_src_0565_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
+ bilinear_src_0565_8_0565_process_pixblock_tail
+ bilinear_src_0565_8_0565_process_pixblock_head
+.endm
+
+/*
+ * over_8888_8888: 8888 source, no mask, 8888 destination, operator over.
+ *
+ * The one/two/four pixel helpers below use the generic macros; the pixel
+ * block head/tail/tail_head that follow are hand-written.
+ */
+.macro bilinear_over_8888_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, x, 8888, over
+.endm
+
+/*
+ * Head of the four-pixel over_8888_8888 block.
+ *
+ * Loads four 2x2 source blocks (addresses in TMP1-TMP4) and blends each one
+ * vertically into v8, v9, v10, v11. It also starts the horizontal blend of
+ * the first two pixels, leaving unshifted 32-bit results in v0 (from v8)
+ * and v1 (from v9). Finally v15 is recomputed from v12 and v12 advanced, so
+ * the tail finds the weights for pixels 2 and 3 in v15.
+ *
+ * State handed to the tail / tail_head: v0, v1, v10, v11, v15, v12.
+ * Also writes v16, v17, v22, v23, X and TMP1-TMP4.
+ */
+.macro bilinear_over_8888_8888_process_pixblock_head
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #2
+
+ /* pixel 0: vertical blend into v8 */
+ ld1 {v22.2s}, [TMP1], STRIDE
+ ld1 {v23.2s}, [TMP1]
+ asr WTMP3, X, #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, lsl #2
+ umull v8.8h, v22.8b, v28.8b
+ umlal v8.8h, v23.8b, v29.8b
+
+ /* pixel 1: vertical blend into v9 */
+ ld1 {v22.2s}, [TMP2], STRIDE
+ ld1 {v23.2s}, [TMP2]
+ asr WTMP4, X, #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, lsl #2
+ umull v9.8h, v22.8b, v28.8b
+ umlal v9.8h, v23.8b, v29.8b
+
+ /* pixel 2: vertical blend into v10 */
+ ld1 {v22.2s}, [TMP3], STRIDE
+ ld1 {v23.2s}, [TMP3]
+ umull v10.8h, v22.8b, v28.8b
+ umlal v10.8h, v23.8b, v29.8b
+
+ /* pixel 0: horizontal blend into v0 */
+ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v8.4h, v15.h[0]
+ umlal2 v0.4s, v8.8h, v15.h[0]
+
+ /* pixel 3: vertical blend into v11, with source prefetch for both rows */
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ ld1 {v16.2s}, [TMP4], STRIDE
+ ld1 {v17.2s}, [TMP4]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ umull v11.8h, v16.8b, v28.8b
+ umlal v11.8h, v17.8b, v29.8b
+
+ /* pixel 1: horizontal blend into v1, then weights for pixels 2 and 3 */
+ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v1.4s, v9.4h, v15.h[4]
+ umlal2 v1.4s, v9.8h, v15.h[4]
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+.endm
+
+/*
+ * Tail of the four-pixel over_8888_8888 block.
+ *
+ * Consumes the state left by the head (v0, v1, v10, v11, v15):
+ *   - finishes the horizontal blend of pixels 2 and 3 into v2, v3
+ *   - narrows all four pixels to 8 bits per channel in v6, v7
+ *   - loads four destination pixels into v2, v3
+ *   - unzips source and destination, scales the destination by
+ *     (255 - alpha) with the rounded /255 sequence and adds the source
+ *     with saturation
+ *   - unzips back and stores four pixels, advancing OUT by 16 bytes
+ *
+ * v15 is recomputed and v12 advanced once, leaving the weights ready for
+ * whatever runs next. Also writes v1, v4, v10, v11 and v24 (through vuzp).
+ */
+.macro bilinear_over_8888_8888_process_pixblock_tail
+ /* horizontal blend of pixels 2 and 3 */
+ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v2.4s, v10.4h, v15.h[0]
+ umlal2 v2.4s, v10.8h, v15.h[0]
+ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v3.4s, v11.4h, v15.h[4]
+ umlal2 v3.4s, v11.8h, v15.h[4]
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ xtn v6.8b, v0.8h
+ xtn v7.8b, v2.8h
+ /* destination pixels */
+ ld1 {v2.2s, v3.2s}, [OUT]
+ prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ /* v4 = 255 - source alpha */
+ dup v4.2s, v7.s[1]
+ mvn v4.8b, v4.8b
+ umull v11.8h, v2.8b, v4.8b
+ umull v2.8h, v3.8b, v4.8b
+ urshr v1.8h, v11.8h, #8
+ urshr v10.8h, v2.8h, #8
+ raddhn v3.8b, v10.8h, v2.8h
+ raddhn v2.8b, v1.8h, v11.8h
+ uqadd v6.8b, v2.8b, v6.8b
+ uqadd v7.8b, v3.8b, v7.8b
+ vuzp v6.8b, v7.8b
+ vuzp v6.8b, v7.8b
+ add v12.8h, v12.8h, v13.8h
+ st1 {v6.2s, v7.2s}, [OUT], #16
+.endm
+
+/*
+ * Steady-state loop body of the four-pixel over_8888_8888 block: the tail
+ * of the current block and the head of the next one, interleaved.
+ *
+ * On entry it expects the state the head leaves behind (v0, v1, v10, v11,
+ * v15, v12); on exit it leaves the same kind of state for the next
+ * iteration or for the final tail, and has stored four pixels to OUT
+ * (OUT += 16).
+ *
+ * The instruction order is deliberate: v8, v9, v10, v11 are recycled for
+ * the next block as soon as the current block no longer reads them, and v8
+ * is additionally borrowed as a temporary for the destination scaling after
+ * the next block's v0 has been computed from it. Do not reorder without
+ * re-checking every such overlap.
+ *
+ * NOTE(review): the destination prefetch here uses PF_OFFS, whereas the
+ * stand-alone tail uses #(prefetch_offset * 4) - confirm this difference is
+ * intended.
+ */
+.macro bilinear_over_8888_8888_process_pixblock_tail_head
+ /* tail: horizontal blend of pixels 2/3 / head: addresses of next 0/1 */
+ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
+ asr WTMP1, X, #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, lsl #2
+ umlsl v2.4s, v10.4h, v15.h[0]
+ asr WTMP2, X, #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, lsl #2
+ umlal2 v2.4s, v10.8h, v15.h[0]
+ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
+ ld1 {v20.2s}, [TMP1], STRIDE
+ umlsl v3.4s, v11.4h, v15.h[4]
+ umlal2 v3.4s, v11.8h, v15.h[4]
+ ld1 {v21.2s}, [TMP1]
+ /* head: vertical blend of next pixel 0 into v8 */
+ umull v8.8h, v20.8b, v28.8b
+ umlal v8.8h, v21.8b, v29.8b
+ /* tail: narrow the four finished pixels */
+ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ ld1 {v22.2s}, [TMP2], STRIDE
+ shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+ xtn v6.8b, v0.8h
+ ld1 {v23.2s}, [TMP2]
+ /* head: vertical blend of next pixel 1 into v9, addresses of next 2/3 */
+ umull v9.8h, v22.8b, v28.8b
+ asr WTMP3, X, #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, lsl #2
+ asr WTMP4, X, #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, lsl #2
+ umlal v9.8h, v23.8b, v29.8b
+ xtn v7.8b, v2.8h
+ /* tail: destination pixels */
+ ld1 {v2.2s, v3.2s}, [OUT]
+ prfm PREFETCH_MODE, [OUT, PF_OFFS]
+ ld1 {v22.2s}, [TMP3], STRIDE
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ vuzp v6.8b, v7.8b
+ vuzp v2.8b, v3.8b
+ /* tail: v4 = 255 - source alpha */
+ dup v4.2s, v7.s[1]
+ ld1 {v23.2s}, [TMP3]
+ mvn v4.8b, v4.8b
+ /* head: vertical blend of next pixel 2 into v10 */
+ umull v10.8h, v22.8b, v28.8b
+ umlal v10.8h, v23.8b, v29.8b
+ /* tail: destination * (255 - alpha), rounded /255 */
+ umull v11.8h, v2.8b, v4.8b
+ umull v2.8h, v3.8b, v4.8b
+ /* head: horizontal blend of next pixel 0 into v0 (last read of v8) */
+ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
+ umlsl v0.4s, v8.4h, v15.h[0]
+ urshr v1.8h, v11.8h, #8
+ umlal2 v0.4s, v8.8h, v15.h[0]
+ /* v8 is free now and reused as a temporary */
+ urshr v8.8h, v2.8h, #8
+ raddhn v3.8b, v8.8h, v2.8h
+ raddhn v2.8b, v1.8h, v11.8h
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ ld1 {v16.2s}, [TMP4], STRIDE
+ /* tail: saturating add of source and scaled destination */
+ uqadd v6.8b, v2.8b, v6.8b
+ uqadd v7.8b, v3.8b, v7.8b
+ ld1 {v17.2s}, [TMP4]
+ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
+ /* head: vertical blend of next pixel 3 into v11 */
+ umull v11.8h, v16.8b, v28.8b
+ umlal v11.8h, v17.8b, v29.8b
+ vuzp v6.8b, v7.8b
+ /* head: horizontal blend of next pixel 1 into v1 */
+ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
+ vuzp v6.8b, v7.8b
+ umlsl v1.4s, v9.4h, v15.h[4]
+ add v12.8h, v12.8h, v13.8h
+ umlal2 v1.4s, v9.8h, v15.h[4]
+ /* weights for next pixels 2 and 3 */
+ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+ add v12.8h, v12.8h, v13.8h
+ st1 {v6.2s, v7.2s}, [OUT], #16
+.endm
+
+ /*
+  * over_8888_8_8888
+  *
+  * Building blocks for the scanline function instantiated with the
+  * bilinear_over_8888_8_8888_* names. Every macro forwards to a generic
+  * bilinear_interpolate_* helper with the arguments "8888, 8, 8888, over"
+  * (presumably source format, mask format, destination format, operator -
+  * the helpers are defined outside this section, confirm there).
+  */
+ /* One pixel: used for the final odd pixel of a scanline (per its name). */
+ .macro bilinear_over_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, over
+ .endm
+
+ /* Two pixels via the generic two-pixel helper. */
+ .macro bilinear_over_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
+ .endm
+
+ /* Four pixels, expressed as two back-to-back two-pixel expansions. */
+ .macro bilinear_over_8888_8_8888_process_four_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
+ bilinear_interpolate_two_pixels 8888, 8, 8888, over
+ .endm
+
+ /* Pixel-block head: expands to the whole four-pixel sequence. */
+ .macro bilinear_over_8888_8_8888_process_pixblock_head
+ bilinear_over_8888_8_8888_process_four_pixels
+ .endm
+
+ /* Empty: the head presumably leaves nothing pending to finish. */
+ .macro bilinear_over_8888_8_8888_process_pixblock_tail
+ .endm
+
+ /* Tail followed by head; with an empty tail this equals the head alone. */
+ .macro bilinear_over_8888_8_8888_process_pixblock_tail_head
+ bilinear_over_8888_8_8888_process_pixblock_tail
+ bilinear_over_8888_8_8888_process_pixblock_head
+ .endm
+
+ /*
+  * add_8888_8888
+  *
+  * Building blocks for the scanline function instantiated with the
+  * bilinear_add_8888_8888_* names. Every macro forwards to a generic
+  * bilinear_interpolate_* helper with the arguments "8888, x, 8888, add"
+  * (presumably source format, mask format, destination format, operator,
+  * with "x" standing for "no mask" - the helpers are defined outside this
+  * section, confirm there).
+  */
+ /* One pixel: used for the final odd pixel of a scanline (per its name). */
+ .macro bilinear_add_8888_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, x, 8888, add
+ .endm
+
+ /* Two pixels via the generic two-pixel helper. */
+ .macro bilinear_add_8888_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, add
+ .endm
+
+ /* Four pixels, expressed as two back-to-back two-pixel expansions. */
+ .macro bilinear_add_8888_8888_process_four_pixels
+ bilinear_interpolate_two_pixels 8888, x, 8888, add
+ bilinear_interpolate_two_pixels 8888, x, 8888, add
+ .endm
+
+ /* Pixel-block head: expands to the whole four-pixel sequence. */
+ .macro bilinear_add_8888_8888_process_pixblock_head
+ bilinear_add_8888_8888_process_four_pixels
+ .endm
+
+ /* Empty: the head presumably leaves nothing pending to finish. */
+ .macro bilinear_add_8888_8888_process_pixblock_tail
+ .endm
+
+ /* Tail followed by head; with an empty tail this equals the head alone. */
+ .macro bilinear_add_8888_8888_process_pixblock_tail_head
+ bilinear_add_8888_8888_process_pixblock_tail
+ bilinear_add_8888_8888_process_pixblock_head
+ .endm
+
+ /*
+  * add_8888_8_8888
+  *
+  * Building blocks for the scanline function instantiated with the
+  * bilinear_add_8888_8_8888_* names. Every macro forwards to a generic
+  * bilinear_interpolate_* helper with the arguments "8888, 8, 8888, add"
+  * (presumably source format, mask format, destination format, operator -
+  * the helpers are defined outside this section, confirm there).
+  */
+ /* One pixel: used for the final odd pixel of a scanline (per its name). */
+ .macro bilinear_add_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, add
+ .endm
+
+ /* Two pixels via the generic two-pixel helper. */
+ .macro bilinear_add_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, add
+ .endm
+
+ /*
+  * Four pixels through the generic four-pixel helper in a single
+  * expansion (not two two-pixel expansions).
+  */
+ .macro bilinear_add_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, add
+ .endm
+
+ /* Pixel-block head: expands to the whole four-pixel sequence. */
+ .macro bilinear_add_8888_8_8888_process_pixblock_head
+ bilinear_add_8888_8_8888_process_four_pixels
+ .endm
+
+ /* Empty: the head presumably leaves nothing pending to finish. */
+ .macro bilinear_add_8888_8_8888_process_pixblock_tail
+ .endm
+
+ /* Tail followed by head; with an empty tail this equals the head alone. */
+ .macro bilinear_add_8888_8_8888_process_pixblock_tail_head
+ bilinear_add_8888_8_8888_process_pixblock_tail
+ bilinear_add_8888_8_8888_process_pixblock_head
+ .endm
+
+
+ /*
+  * Bilinear scanline functions
+  *
+  * Each invocation below emits one scanline function. The arguments are,
+  * in order: the function name, two format tags, two small integers, six
+  * per-operation macro names (last pixel, two pixels, four pixels,
+  * pixblock head / tail / tail_head), two more integers and a flags word.
+  * NOTE(review): the generator macro is defined outside this section; the
+  * integers are presumably log2(bytes per pixel) for source and
+  * destination, then pixel-block size and prefetch distance - confirm
+  * against its parameter list.
+  */
+ /* SRC, 8888 source, 8 mask, 8888 destination (per the function name). */
+ generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_src_8888_8_8888_process_last_pixel, \
+ bilinear_src_8888_8_8888_process_two_pixels, \
+ bilinear_src_8888_8_8888_process_four_pixels, \
+ bilinear_src_8888_8_8888_process_pixblock_head, \
+ bilinear_src_8888_8_8888_process_pixblock_tail, \
+ bilinear_src_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+ /*
+  * SRC, 8888 source, 8 mask, 0565 destination (per the function name).
+  * The "2, 1" pair differs from the 8888 -> 8888 variants; presumably
+  * log2(bytes per pixel) for source and destination - confirm against the
+  * generator macro.
+  */
+ generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
+ 8888, 0565, 2, 1, \
+ bilinear_src_8888_8_0565_process_last_pixel, \
+ bilinear_src_8888_8_0565_process_two_pixels, \
+ bilinear_src_8888_8_0565_process_four_pixels, \
+ bilinear_src_8888_8_0565_process_pixblock_head, \
+ bilinear_src_8888_8_0565_process_pixblock_tail, \
+ bilinear_src_8888_8_0565_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+ /*
+  * SRC, 0565 source, 8 mask, x888 destination (per the function name).
+  * Note the destination format tag passed to the generator is 8888 while
+  * the function and macro names say x888.
+  */
+ generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
+ 0565, 8888, 1, 2, \
+ bilinear_src_0565_8_x888_process_last_pixel, \
+ bilinear_src_0565_8_x888_process_two_pixels, \
+ bilinear_src_0565_8_x888_process_four_pixels, \
+ bilinear_src_0565_8_x888_process_pixblock_head, \
+ bilinear_src_0565_8_x888_process_pixblock_tail, \
+ bilinear_src_0565_8_x888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+ /* SRC, 0565 source, 8 mask, 0565 destination (per the function name). */
+ generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
+ 0565, 0565, 1, 1, \
+ bilinear_src_0565_8_0565_process_last_pixel, \
+ bilinear_src_0565_8_0565_process_two_pixels, \
+ bilinear_src_0565_8_0565_process_four_pixels, \
+ bilinear_src_0565_8_0565_process_pixblock_head, \
+ bilinear_src_0565_8_0565_process_pixblock_tail, \
+ bilinear_src_0565_8_0565_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+ /*
+  * OVER, 8888 source, 8888 destination, no mask: the flags argument is 0
+  * rather than BILINEAR_FLAG_USE_MASK. Uses the bilinear_over_8888_8888_*
+  * macros.
+  */
+ generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_over_8888_8888_process_last_pixel, \
+ bilinear_over_8888_8888_process_two_pixels, \
+ bilinear_over_8888_8888_process_four_pixels, \
+ bilinear_over_8888_8888_process_pixblock_head, \
+ bilinear_over_8888_8888_process_pixblock_tail, \
+ bilinear_over_8888_8888_process_pixblock_tail_head, \
+ 4, 28, 0
+
+ /*
+  * OVER, 8888 source, 8 mask, 8888 destination (per the function name),
+  * built from the bilinear_over_8888_8_8888_* macros with the mask flag
+  * set.
+  */
+ generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_over_8888_8_8888_process_last_pixel, \
+ bilinear_over_8888_8_8888_process_two_pixels, \
+ bilinear_over_8888_8_8888_process_four_pixels, \
+ bilinear_over_8888_8_8888_process_pixblock_head, \
+ bilinear_over_8888_8_8888_process_pixblock_tail, \
+ bilinear_over_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK
+
+ /*
+  * ADD, 8888 source, 8888 destination, no mask: the flags argument is 0
+  * rather than BILINEAR_FLAG_USE_MASK. Uses the bilinear_add_8888_8888_*
+  * macros.
+  */
+ generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_add_8888_8888_process_last_pixel, \
+ bilinear_add_8888_8888_process_two_pixels, \
+ bilinear_add_8888_8888_process_four_pixels, \
+ bilinear_add_8888_8888_process_pixblock_head, \
+ bilinear_add_8888_8888_process_pixblock_tail, \
+ bilinear_add_8888_8888_process_pixblock_tail_head, \
+ 4, 28, 0
+
+ /*
+  * ADD, 8888 source, 8 mask, 8888 destination (per the function name),
+  * built from the bilinear_add_8888_8_8888_* macros with the mask flag
+  * set.
+  */
+ generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
+ 8888, 8888, 2, 2, \
+ bilinear_add_8888_8_8888_process_last_pixel, \
+ bilinear_add_8888_8_8888_process_two_pixels, \
+ bilinear_add_8888_8_8888_process_four_pixels, \
+ bilinear_add_8888_8_8888_process_pixblock_head, \
+ bilinear_add_8888_8_8888_process_pixblock_tail, \
+ bilinear_add_8888_8_8888_process_pixblock_tail_head, \
+ 4, 28, BILINEAR_FLAG_USE_MASK