diff options
Diffstat (limited to '')
-rw-r--r-- | gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h | 1310 |
1 files changed, 1310 insertions, 0 deletions
diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h new file mode 100644 index 0000000000..5d93172176 --- /dev/null +++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h @@ -0,0 +1,1310 @@ +/* + * Copyright © 2009 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) + */ + +/* + * This file contains a macro ('generate_composite_function') which can + * construct 2D image processing functions, based on a common template. + * Any combinations of source, destination and mask images with 8bpp, + * 16bpp, 24bpp, 32bpp color formats are supported. 
+ * + * This macro takes care of: + * - handling of leading and trailing unaligned pixels + * - doing most of the work related to L2 cache preload + * - encourages the use of software pipelining for better instructions + * scheduling + * + * The user of this macro has to provide some configuration parameters + * (bit depths for the images, prefetch distance, etc.) and a set of + * macros, which should implement basic code chunks responsible for + * pixels processing. See 'pixman-armv8-neon-asm.S' file for the usage + * examples. + * + * TODO: + * - try overlapped pixel method (from Ian Rickards) when processing + * exactly two blocks of pixels + * - maybe add an option to do reverse scanline processing + */ + +/* + * Bit flags for 'generate_composite_function' macro which are used + * to tune generated functions behavior. + */ +.set FLAG_DST_WRITEONLY, 0 +.set FLAG_DST_READWRITE, 1 +.set FLAG_DEINTERLEAVE_32BPP, 2 + +/* + * Constants for selecting preferable prefetch type. + */ +.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */ +.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */ +.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ + +/* + * prefetch mode + * available modes are: + * pldl1keep + * pldl1strm + * pldl2keep + * pldl2strm + * pldl3keep + * pldl3strm + */ +#define PREFETCH_MODE pldl1keep + +/* + * Definitions of supplementary pixld/pixst macros (for partial load/store of + * pixel data). 
+ */ + +.macro pixldst1 op, elem_size, reg1, mem_operand, abits + op {v®1&.&elem_size}, [&mem_operand&], #8 +.endm + +.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits + op {v®1&.&elem_size, v®2&.&elem_size}, [&mem_operand&], #16 +.endm + +.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits + op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size, v®4&.&elem_size}, [&mem_operand&], #32 +.endm + +.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes + op {v®1&.&elem_size}[idx], [&mem_operand&], #&bytes& +.endm + +.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand + op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}, [&mem_operand&], #24 +.endm + +.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand + op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}[idx], [&mem_operand&], #3 +.endm + +.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits +.if numbytes == 32 + .if elem_size==32 + pixldst4 op, 2s, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits + .elseif elem_size==16 + pixldst4 op, 4h, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits + .else + pixldst4 op, 8b, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits + .endif +.elseif numbytes == 16 + .if elem_size==32 + pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits + .elseif elem_size==16 + pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits + .else + pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits + .endif +.elseif numbytes == 8 + .if elem_size==32 + pixldst1 op, 2s, %(basereg+1), mem_operand, abits + .elseif elem_size==16 + pixldst1 op, 4h, %(basereg+1), mem_operand, abits + .else + pixldst1 op, 8b, %(basereg+1), mem_operand, abits + .endif +.elseif numbytes == 4 + .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) + pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4 + .elseif elem_size == 16 
+ pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2 + pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2 + .else + pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1 + pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1 + pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1 + pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1 + .endif +.elseif numbytes == 2 + .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) + pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2 + .else + pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1 + pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1 + .endif +.elseif numbytes == 1 + pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1 +.else + .error "unsupported size: numbytes" +.endif +.endm + +.macro pixld numpix, bpp, basereg, mem_operand, abits=0 +.if bpp > 0 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) + pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix == 1) + pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.else + pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits +.endif +.endif +.endm + +.macro pixst numpix, bpp, basereg, mem_operand, abits=0 +.if bpp > 0 +.if (bpp == 32) && (numpix == 8) && 
(DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) + pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix == 1) + pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.elseif numpix * bpp == 32 && abits == 32 + pixldst 4, st1, 32, basereg, mem_operand, abits +.elseif numpix * bpp == 16 && abits == 16 + pixldst 2, st1, 16, basereg, mem_operand, abits +.else + pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits +.endif +.endif +.endm + +.macro pixld_a numpix, bpp, basereg, mem_operand +.if (bpp * numpix) <= 128 + pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.else + pixld numpix, bpp, basereg, mem_operand, 128 +.endif +.endm + +.macro pixst_a numpix, bpp, basereg, mem_operand +.if (bpp * numpix) <= 128 + pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.else + pixst numpix, bpp, basereg, mem_operand, 128 +.endif +.endm + +/* + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register + * aliases to be defined) + */ +.macro pixld1_s elem_size, reg1, mem_operand +.if elem_size == 16 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP1, mem_operand, TMP1, lsl #1 + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, 
SRC_WIDTH_FIXED + bpl 5b +55: + add TMP2, mem_operand, TMP2, lsl #1 + ld1 {v®1&.h}[0], [TMP1] + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP1, mem_operand, TMP1, lsl #1 + ld1 {v®1&.h}[1], [TMP2] + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP2, mem_operand, TMP2, lsl #1 + ld1 {v®1&.h}[2], [TMP1] + ld1 {v®1&.h}[3], [TMP2] +.elseif elem_size == 32 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP1, mem_operand, TMP1, lsl #2 + asr TMP2, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP2, mem_operand, TMP2, lsl #2 + ld1 {v®1&.s}[0], [TMP1] + ld1 {v®1&.s}[1], [TMP2] +.else + .error "unsupported" +.endif +.endm + +.macro pixld2_s elem_size, reg1, reg2, mem_operand +.if 0 /* elem_size == 32 */ + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 + add TMP1, mem_operand, TMP1, asl #2 + mov TMP2, VX, asr #16 + sub VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #2 + ld1 {v®1&.s}[0], [TMP1] + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 + add TMP1, mem_operand, TMP1, asl #2 + ld1 {v®2&.s}[0], [TMP2, :32] + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #2 + ld1 {v®1&.s}[1], [TMP1] + ld1 {v®2&.s}[1], [TMP2] +.else + pixld1_s elem_size, reg1, mem_operand + pixld1_s elem_size, reg2, mem_operand +.endif +.endm + +.macro pixld0_s elem_size, reg1, idx, mem_operand +.if elem_size == 16 + asr TMP1, VX, #16 + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP1, mem_operand, TMP1, lsl #1 + ld1 {v®1&.h}[idx], [TMP1] +.elseif elem_size == 32 + asr DUMMY, VX, #16 + mov TMP1, DUMMY + adds VX, VX, UNIT_X + bmi 55f +5: subs VX, VX, SRC_WIDTH_FIXED + bpl 5b +55: + add TMP1, mem_operand, TMP1, lsl #2 + ld1 {v®1&.s}[idx], [TMP1] +.endif +.endm + +.macro pixld_s_internal numbytes, elem_size, 
basereg, mem_operand +.if numbytes == 32 + pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand + pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand + pixdeinterleave elem_size, %(basereg+4) +.elseif numbytes == 16 + pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand +.elseif numbytes == 8 + pixld1_s elem_size, %(basereg+1), mem_operand +.elseif numbytes == 4 + .if elem_size == 32 + pixld0_s elem_size, %(basereg+0), 1, mem_operand + .elseif elem_size == 16 + pixld0_s elem_size, %(basereg+0), 2, mem_operand + pixld0_s elem_size, %(basereg+0), 3, mem_operand + .else + pixld0_s elem_size, %(basereg+0), 4, mem_operand + pixld0_s elem_size, %(basereg+0), 5, mem_operand + pixld0_s elem_size, %(basereg+0), 6, mem_operand + pixld0_s elem_size, %(basereg+0), 7, mem_operand + .endif +.elseif numbytes == 2 + .if elem_size == 16 + pixld0_s elem_size, %(basereg+0), 1, mem_operand + .else + pixld0_s elem_size, %(basereg+0), 2, mem_operand + pixld0_s elem_size, %(basereg+0), 3, mem_operand + .endif +.elseif numbytes == 1 + pixld0_s elem_size, %(basereg+0), 1, mem_operand +.else + .error "unsupported size: numbytes" +.endif +.endm + +.macro pixld_s numpix, bpp, basereg, mem_operand +.if bpp > 0 + pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand +.endif +.endm + +.macro vuzp8 reg1, reg2 + umov DUMMY, v16.d[0] + uzp1 v16.8b, v®1&.8b, v®2&.8b + uzp2 v®2&.8b, v®1&.8b, v®2&.8b + mov v®1&.8b, v16.8b + mov v16.d[0], DUMMY +.endm + +.macro vzip8 reg1, reg2 + umov DUMMY, v16.d[0] + zip1 v16.8b, v®1&.8b, v®2&.8b + zip2 v®2&.8b, v®1&.8b, v®2&.8b + mov v®1&.8b, v16.8b + mov v16.d[0], DUMMY +.endm + +/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ +.macro pixdeinterleave bpp, basereg +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vuzp8 %(basereg+0), %(basereg+1) + vuzp8 %(basereg+2), %(basereg+3) + vuzp8 %(basereg+1), %(basereg+3) + vuzp8 %(basereg+0), %(basereg+2) +.endif +.endm + +/* interleave B, G, R, A 
channels for eight 32bpp pixels in 4 registers */ +.macro pixinterleave bpp, basereg +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vzip8 %(basereg+0), %(basereg+2) + vzip8 %(basereg+1), %(basereg+3) + vzip8 %(basereg+2), %(basereg+3) + vzip8 %(basereg+0), %(basereg+1) +.endif +.endm + +/* + * This is a macro for implementing cache preload. The main idea is that + * cache preload logic is mostly independent from the rest of pixels + * processing code. It starts at the top left pixel and moves forward + * across pixels and can jump across scanlines. Prefetch distance is + * handled in an 'incremental' way: it starts from 0 and advances to the + * optimal distance over time. After reaching optimal prefetch distance, + * it is kept constant. There are some checks which prevent prefetching + * unneeded pixel lines below the image (but it still can prefetch a bit + * more data on the right side of the image - not a big issue and may + * be actually helpful when rendering text glyphs). Additional trick is + * the use of LDR instruction for prefetch instead of PLD when moving to + * the next line, the point is that we have a high chance of getting TLB + * miss in this case, and PLD would be useless. + * + * This sounds like it may introduce a noticeable overhead (when working with + * fully cached data). But in reality, due to having a separate pipeline and + * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can + * execute simultaneously with NEON and be completely shadowed by it. Thus + * we get no performance overhead at all (*). This looks like a very nice + * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher, + * but still can implement some rather advanced prefetch logic in software + * for almost zero cost! + * + * (*) The overhead of the prefetcher is visible when running some trivial + * pixels processing like simple copy. Anyway, having prefetch is a must + * when working with the graphics data. 
+ */ +.macro PF a, x:vararg +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) + a x +.endif +.endm + +.macro cache_preload std_increment, boost_increment +.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) +.if std_increment != 0 + PF add PF_X, PF_X, #std_increment +.endif + PF tst PF_CTL, #0xF + PF beq 71f + PF add PF_X, PF_X, #boost_increment + PF sub PF_CTL, PF_CTL, #1 +71: + PF cmp PF_X, ORIG_W +.if src_bpp_shift >= 0 + PF lsl DUMMY, PF_X, #src_bpp_shift + PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] +.endif +.if dst_r_bpp != 0 + PF lsl DUMMY, PF_X, #dst_bpp_shift + PF prfm PREFETCH_MODE, [PF_DST, DUMMY] +.endif +.if mask_bpp_shift >= 0 + PF lsl DUMMY, PF_X, #mask_bpp_shift + PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] +.endif + PF ble 71f + PF sub PF_X, PF_X, ORIG_W + PF subs PF_CTL, PF_CTL, #0x10 +71: + PF ble 72f +.if src_bpp_shift >= 0 + PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift + PF ldrsb DUMMY, [PF_SRC, DUMMY] + PF add PF_SRC, PF_SRC, #1 +.endif +.if dst_r_bpp != 0 + PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift + PF ldrsb DUMMY, [PF_DST, DUMMY] + PF add PF_DST, PF_DST, #1 +.endif +.if mask_bpp_shift >= 0 + PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift + PF ldrsb DUMMY, [PF_MASK, DUMMY] + PF add PF_MASK, PF_MASK, #1 +.endif +72: +.endif +.endm + +.macro cache_preload_simple +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) +.if src_bpp > 0 + prfm PREFETCH_MODE, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] +.endif +.if dst_r_bpp > 0 + prfm PREFETCH_MODE, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] +.endif +.if mask_bpp > 0 + prfm PREFETCH_MODE, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] +.endif +.endif +.endm + +.macro fetch_mask_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +.endm + +/* + * Macro which is used to process leading pixels until destination + * pointer is properly aligned (at 16 bytes boundary). 
When destination + * buffer uses 16bpp format, this is unnecessary, or even pointless. + */ +.macro ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head +.if dst_w_bpp != 24 + tst DST_R, #0xF + beq 52f + +.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 +.irp lowbit, 1, 2, 4, 8, 16 +local skip1 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if lowbit < 16 /* we don't need more than 16-byte alignment */ + tst DST_R, #lowbit + beq 51f +.endif + pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC + pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK +.if dst_r_bpp > 0 + pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R +.else + add DST_R, DST_R, #lowbit +.endif + PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) + sub W, W, #(lowbit * 8 / dst_w_bpp) +51: +.endif +.endr +.endif + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + + process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple + process_pixblock_tail + + pixinterleave dst_w_bpp, dst_w_basereg + +.irp lowbit, 1, 2, 4, 8, 16 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if lowbit < 16 /* we don't need more than 16-byte alignment */ + tst DST_W, #lowbit + beq 51f +.endif +.if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0 + sub W, W, #(lowbit * 8 / dst_w_bpp) +.endif + pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W +51: +.endif +.endr +.endif +52: +.endm + +/* + * Special code for processing up to (pixblock_size - 1) remaining + * trailing pixels. As SIMD processing performs operation on + * pixblock_size pixels, anything smaller than this has to be loaded + * and stored in a special way. 
Loading and storing of pixel data is + * performed in such a way that we fill some 'slots' in the NEON + * registers (some slots naturally are unused), then perform compositing + * operation as usual. In the end, the data is taken from these 'slots' + * and saved to memory. + * + * cache_preload_flag - allows to suppress prefetch if + * set to 0 + * dst_aligned_flag - selects whether destination buffer + * is aligned + */ +.macro process_trailing_pixels cache_preload_flag, \ + dst_aligned_flag, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + tst W, #(pixblock_size - 1) + beq 52f +.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 +.irp chunk_size, 16, 8, 4, 2, 1 +.if pixblock_size > chunk_size + tst W, #chunk_size + beq 51f + pixld_src chunk_size, src_bpp, src_basereg, SRC + pixld chunk_size, mask_bpp, mask_basereg, MASK +.if dst_aligned_flag != 0 + pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R +.else + pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R +.endif +.if cache_preload_flag != 0 + PF add PF_X, PF_X, #chunk_size +.endif +51: +.endif +.endr +.endif + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + + process_pixblock_head +.if cache_preload_flag != 0 + cache_preload 0, pixblock_size + cache_preload_simple +.endif + process_pixblock_tail + pixinterleave dst_w_bpp, dst_w_basereg +.irp chunk_size, 16, 8, 4, 2, 1 +.if pixblock_size > chunk_size + tst W, #chunk_size + beq 51f +.if dst_aligned_flag != 0 + pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.else + pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.endif +51: +.endif +.endr +52: +.endm + +/* + * Macro, which performs all the needed operations to switch to the next + * scanline and start the next loop iteration unless all the scanlines + * are already processed. 
+ */ +.macro advance_to_next_scanline start_of_loop_label + mov W, ORIG_W + add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift +.if src_bpp != 0 + add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift +.endif +.if mask_bpp != 0 + add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift +.endif +.if (dst_w_bpp != 24) + sub DST_W, DST_W, W, lsl #dst_bpp_shift +.endif +.if (src_bpp != 24) && (src_bpp != 0) + sub SRC, SRC, W, lsl #src_bpp_shift +.endif +.if (mask_bpp != 24) && (mask_bpp != 0) + sub MASK, MASK, W, lsl #mask_bpp_shift +.endif + subs H, H, #1 + mov DST_R, DST_W + bge start_of_loop_label +.endm + +/* + * Registers are allocated in the following way by default: + * v0, v1, v2, v3 - reserved for loading source pixel data + * v4, v5, v6, v7 - reserved for loading destination pixel data + * v24, v25, v26, v27 - reserved for loading mask pixel data + * v28, v29, v30, v31 - final destination pixel data for writeback to memory + */ +.macro generate_composite_function fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags, \ + pixblock_size_, \ + prefetch_distance, \ + init, \ + cleanup, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + + pixman_asm_function fname + stp x29, x30, [sp, -16]! + mov x29, sp + sub sp, sp, 232 /* push all registers */ + sub x29, x29, 64 + st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 + st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 + stp x8, x9, [x29, -80] + stp x10, x11, [x29, -96] + stp x12, x13, [x29, -112] + stp x14, x15, [x29, -128] + stp x16, x17, [x29, -144] + stp x18, x19, [x29, -160] + stp x20, x21, [x29, -176] + stp x22, x23, [x29, -192] + stp x24, x25, [x29, -208] + stp x26, x27, [x29, -224] + str x28, [x29, -232] + +/* + * Select prefetch type for this function. If prefetch distance is + * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch + * has to be used instead of ADVANCED. 
+ */ + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT +.if prefetch_distance == 0 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE +.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ + ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE +.endif + +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set src_bpp, src_bpp_ + .set mask_bpp, mask_bpp_ + .set dst_w_bpp, dst_w_bpp_ + .set pixblock_size, pixblock_size_ + .set dst_w_basereg, dst_w_basereg_ + .set dst_r_basereg, dst_r_basereg_ + .set src_basereg, src_basereg_ + .set mask_basereg, mask_basereg_ + + .macro pixld_src x:vararg + pixld x + .endm + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm +/* + * Assign symbolic names to registers + */ + W .req x0 /* width (is updated during processing) */ + H .req x1 /* height (is updated during processing) */ + DST_W .req x2 /* destination buffer pointer for writes */ + DST_STRIDE .req x3 /* destination image stride */ + SRC .req x4 /* source buffer pointer */ + SRC_STRIDE .req x5 /* source image stride */ + MASK .req x6 /* mask pointer */ + MASK_STRIDE .req x7 /* mask stride */ + + DST_R .req x8 /* destination buffer pointer for reads */ + + PF_CTL .req x9 /* combined lines counter and prefetch */ + /* distance increment counter */ + PF_X .req x10 /* pixel index in a scanline for current */ + /* pretetch position */ + PF_SRC .req x11 /* pointer to source scanline start */ + /* for prefetch purposes */ + PF_DST .req x12 /* pointer to destination scanline start */ + /* for prefetch purposes */ + PF_MASK .req x13 /* pointer to mask scanline start */ + /* for prefetch purposes */ + + ORIG_W .req x14 /* saved original width */ + DUMMY .req x15 /* temporary register */ + + sxtw x0, w0 + sxtw x1, w1 + sxtw x3, w3 + sxtw x5, w5 + sxtw x7, w7 + + .set mask_bpp_shift, -1 +.if src_bpp == 32 + 
.set src_bpp_shift, 2 +.elseif src_bpp == 24 + .set src_bpp_shift, 0 +.elseif src_bpp == 16 + .set src_bpp_shift, 1 +.elseif src_bpp == 8 + .set src_bpp_shift, 0 +.elseif src_bpp == 0 + .set src_bpp_shift, -1 +.else + .error "requested src bpp (src_bpp) is not supported" +.endif +.if mask_bpp == 32 + .set mask_bpp_shift, 2 +.elseif mask_bpp == 24 + .set mask_bpp_shift, 0 +.elseif mask_bpp == 8 + .set mask_bpp_shift, 0 +.elseif mask_bpp == 0 + .set mask_bpp_shift, -1 +.else + .error "requested mask bpp (mask_bpp) is not supported" +.endif +.if dst_w_bpp == 32 + .set dst_bpp_shift, 2 +.elseif dst_w_bpp == 24 + .set dst_bpp_shift, 0 +.elseif dst_w_bpp == 16 + .set dst_bpp_shift, 1 +.elseif dst_w_bpp == 8 + .set dst_bpp_shift, 0 +.else + .error "requested dst bpp (dst_w_bpp) is not supported" +.endif + +.if (((flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp +.else + .set dst_r_bpp, 0 +.endif +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 +.else + .set DEINTERLEAVE_32BPP_ENABLED, 0 +.endif + +.if prefetch_distance < 0 || prefetch_distance > 15 + .error "invalid prefetch distance (prefetch_distance)" +.endif + + PF mov PF_X, #0 + mov DST_R, DST_W + +.if src_bpp == 24 + sub SRC_STRIDE, SRC_STRIDE, W + sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 +.endif +.if mask_bpp == 24 + sub MASK_STRIDE, MASK_STRIDE, W + sub MASK_STRIDE, MASK_STRIDE, W, lsl #1 +.endif +.if dst_w_bpp == 24 + sub DST_STRIDE, DST_STRIDE, W + sub DST_STRIDE, DST_STRIDE, W, lsl #1 +.endif + +/* + * Setup advanced prefetcher initial state + */ + PF mov PF_SRC, SRC + PF mov PF_DST, DST_R + PF mov PF_MASK, MASK + /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ + PF lsl DUMMY, H, #4 + PF mov PF_CTL, DUMMY + PF add PF_CTL, PF_CTL, #(prefetch_distance - 0x10) + + init + subs H, H, #1 + mov ORIG_W, W + blt 9f + cmp W, #(pixblock_size * 2) + blt 800f +/* + * This is the start of the pipelined loop, which if optimized for + * long scanlines + */ +0: + 
ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK + PF add PF_X, PF_X, #pixblock_size + process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple + subs W, W, #(pixblock_size * 2) + blt 200f + +100: + process_pixblock_tail_head + cache_preload_simple + subs W, W, #pixblock_size + bge 100b + +200: + process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 1, 1, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + advance_to_next_scanline 0b + + cleanup +1000: + /* pop all registers */ + sub x29, x29, 64 + ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 + ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 + ldp x8, x9, [x29, -80] + ldp x10, x11, [x29, -96] + ldp x12, x13, [x29, -112] + ldp x14, x15, [x29, -128] + ldp x16, x17, [x29, -144] + ldp x18, x19, [x29, -160] + ldp x20, x21, [x29, -176] + ldp x22, x23, [x29, -192] + ldp x24, x25, [x29, -208] + ldp x26, x27, [x29, -224] + ldr x28, [x29, -232] + mov sp, x29 + ldp x29, x30, [sp], 16 + ret /* exit */ +/* + * This is the start of the loop, designed to process images with small width + * (less than pixblock_size * 2 pixels). In this case neither pipelining + * nor prefetch are used. 
*/
800:
.if src_bpp_shift >= 0
    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
    PF prfm PREFETCH_MODE, [SRC, DUMMY]
.endif
.if dst_r_bpp != 0
    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
    PF prfm PREFETCH_MODE, [DST_R, DUMMY]
.endif
.if mask_bpp_shift >= 0
    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
    PF prfm PREFETCH_MODE, [MASK, DUMMY]
.endif
    /* Process exactly pixblock_size pixels if needed */
    tst         W, #pixblock_size
    beq         100f
    pixld       pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst       pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
100:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 800b
9:
    cleanup
    /* pop all registers */
    sub         x29, x29, 64
    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    ldp         x8, x9, [x29, -80]
    ldp         x10, x11, [x29, -96]
    ldp         x12, x13, [x29, -112]
    ldp         x14, x15, [x29, -128]
    ldp         x16, x17, [x29, -144]
    ldp         x18, x19, [x29, -160]
    ldp         x20, x21, [x29, -176]
    ldp         x22, x23, [x29, -192]
    ldp         x24, x25, [x29, -208]
    ldp         x26, x27, [x29, -224]
    ldr         x28, [x29, -232]
    mov         sp, x29
    ldp         x29, x30, [sp], 16
    ret  /* exit */

    .purgem     fetch_src_pixblock
    .purgem     pixld_src

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      ORIG_W
    .unreq      W
    .unreq      H
    .unreq      SRC_STRIDE
    .unreq      DST_STRIDE
    .unreq      MASK_STRIDE
    .unreq      PF_CTL
    .unreq      PF_X
    .unreq      PF_SRC
    .unreq      PF_DST
    .unreq      PF_MASK
    .unreq      DUMMY
    .endfunc
.endm

/*
 * A simplified variant of function generation template for a single
 * scanline processing (for implementing pixman combine functions)
 *
 * Arguments:
 *   use_nearest_scaling - 0 selects the plain register assignment
 *                         (W = x0, DST_W = x1, SRC = x2, MASK = x3) and
 *                         loads source pixels with 'pixld'; any other value
 *                         selects the nearest scaling register assignment
 *                         (W = x0, DST_W = x1, SRC = x2, VX = x3,
 *                         UNIT_X = x4, SRC_WIDTH_FIXED = x5, MASK = x6) and
 *                         loads source pixels with 'pixld_s'
 *   fname               - name handed to 'pixman_asm_function'
 *   src_bpp_, mask_bpp_, dst_w_bpp_, pixblock_size_, *_basereg_
 *                       - republished with '.set' under the same names
 *                         without the trailing underscore, so that the
 *                         other macros of this file can read them
 *   flags               - FLAG_DST_READWRITE makes dst_r_bpp equal to
 *                         dst_w_bpp (otherwise it is 0);
 *                         FLAG_DEINTERLEAVE_32BPP sets
 *                         DEINTERLEAVE_32BPP_ENABLED to 1 (otherwise 0)
 *   init, cleanup       - macros expanded once after the prologue and once
 *                         in front of every epilogue respectively
 *   process_pixblock_head, process_pixblock_tail,
 *   process_pixblock_tail_head
 *                       - user supplied pixel processing macros
 *
 * Stack frame, addressed through x29 (x29 = sp right after x29/x30 have
 * been pushed):
 *   [x29,   0 ..  15]   saved x29, x30 (x30 doubles as DUMMY scratch)
 *   [x29, -64 ..  -1]   low 64 bits of v8-v15
 *   [x29, -80 .. -65]   x8, x9  (TMP1, TMP2; nearest scaling only)
 *   [x29, -88 .. -81]   x10     (DST_R; nearest scaling only)
 */
.macro generate_composite_function_scanline use_nearest_scaling, \
                                            fname, \
                                            src_bpp_, \
                                            mask_bpp_, \
                                            dst_w_bpp_, \
                                            flags, \
                                            pixblock_size_, \
                                            init, \
                                            cleanup, \
                                            process_pixblock_head, \
                                            process_pixblock_tail, \
                                            process_pixblock_tail_head, \
                                            dst_w_basereg_ = 28, \
                                            dst_r_basereg_ = 4, \
                                            src_basereg_ = 0, \
                                            mask_basereg_ = 24

    pixman_asm_function fname
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_

.if use_nearest_scaling != 0
    /*
     * Assign symbolic names to registers for nearest scaling
     */
    W               .req        x0
    DST_W           .req        x1
    SRC             .req        x2
    VX              .req        x3
    UNIT_X          .req        x4
    SRC_WIDTH_FIXED .req        x5
    MASK            .req        x6
    TMP1            .req        x8
    TMP2            .req        x9
    DST_R           .req        x10
    DUMMY           .req        x30

    .macro pixld_src x:vararg
        pixld_s x
    .endm

    /* Sign-extend the 32-bit arguments which are used as 64-bit values */
    sxtw        x0, w0
    sxtw        x3, w3
    sxtw        x4, w4
    sxtw        x5, w5

    stp         x29, x30, [sp, -16]!
    mov         x29, sp
    /*
     * NOTE(review): 88 is not a multiple of 16, so sp is not 16-byte
     * aligned until the epilogue. Every frame access below goes through
     * x29; code expanded from the user macros must not use sp as a base
     * register either - confirm.
     */
    sub         sp, sp, 88
    /* The two post-incremented stores bring x29 back to the frame base */
    sub         x29, x29, 64
    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    stp         x8, x9, [x29, -80]
    str         x10, [x29, -88]
.else
    /*
     * Assign symbolic names to registers
     */
    W           .req        x0      /* width (is updated during processing) */
    DST_W       .req        x1      /* destination buffer pointer for writes */
    SRC         .req        x2      /* source buffer pointer */
    MASK        .req        x3      /* mask pointer */
    DST_R       .req        x4      /* destination buffer pointer for reads */
    DUMMY       .req        x30

    .macro pixld_src x:vararg
        pixld x
    .endm

    sxtw        x0, w0

    stp         x29, x30, [sp, -16]!
    mov         x29, sp
    sub         sp, sp, 64
    /* The two post-incremented stores bring x29 back to the frame base */
    sub         x29, x29, 64
    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

    .macro fetch_src_pixblock
        pixld_src   pixblock_size, src_bpp, \
                    (src_basereg - pixblock_size * src_bpp / 64), SRC
    .endm

    init
    mov         DST_R, DST_W

    /* Fewer than pixblock_size pixels: only the trailing pixels code runs */
    cmp         W, #pixblock_size
    blt         800f

    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    subs        W, W, #pixblock_size
    blt         700f

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    subs        W, W, #pixblock_size
    blt         200f
100:
    process_pixblock_tail_head
    subs        W, W, #pixblock_size
    bge         100b
200:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
700:
    /* Process the remaining trailing pixels in the scanline (dst aligned) */
    process_trailing_pixels 0, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
.if use_nearest_scaling != 0
    sub         x29, x29, 64
    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    ldp         x8, x9, [x29, -80]
    /*
     * The prologue stored x10 at [x29, -88]; reload it from that very
     * slot. (An offset of -96 would read 8 bytes below sp, outside of
     * the 88 byte frame.)
     */
    ldr         x10, [x29, -88]
    mov         sp, x29
    ldp         x29, x30, [sp], 16
    ret  /* exit */
.else
    sub         x29, x29, 64
    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    mov         sp, x29
    ldp         x29, x30, [sp], 16
    ret  /* exit */
.endif
800:
    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
.if use_nearest_scaling != 0
    sub         x29, x29, 64
    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    ldp         x8, x9, [x29, -80]
    ldr         x10, [x29, -88]
    mov         sp, x29
    ldp         x29, x30, [sp], 16
    ret  /* exit */

    /* Release the register aliases of the nearest scaling variant */
    .unreq      DUMMY
    .unreq      DST_R
    .unreq      SRC
    .unreq      W
    .unreq      VX
    .unreq      UNIT_X
    .unreq      TMP1
    .unreq      TMP2
    .unreq      DST_W
    .unreq      MASK
    .unreq      SRC_WIDTH_FIXED

.else
    sub         x29, x29, 64
    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    mov         sp, x29
    ldp         x29, x30, [sp], 16
    ret  /* exit */

    /* Release the register aliases of the plain variant */
    .unreq      DUMMY
    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      W
.endif

    /* Drop the per-function helper macros so the next expansion can redefine them */
    .purgem     fetch_src_pixblock
    .purgem     pixld_src

    .endfunc
.endm

/* Single scanline template without scaling: forwards all arguments unchanged */
.macro generate_composite_function_single_scanline x:vararg
    generate_composite_function_scanline 0, x
.endm

/* Single scanline template with nearest scaling: forwards all arguments unchanged */
.macro generate_composite_function_nearest_scanline x:vararg
    generate_composite_function_scanline 1, x
.endm

/* Default prologue/epilogue, nothing special needs to be done */

.macro default_init
.endm

.macro default_cleanup
.endm

/*
 * Prologue/epilogue variant for code which needs to use all the NEON
 * registers, v8-v15 included (they need to be saved/restored by callee
 * according to ABI).
 *
 * Both macros are empty in this file: the prologue and epilogue emitted by
 * the function templates already store and reload the low 64 bits of
 * v8-v15 unconditionally, so there is nothing left to do here.
 * NOTE(review): the names are presumably kept so that existing users of
 * the templates do not have to change - confirm.
 */

.macro default_init_need_all_regs
.endm

.macro default_cleanup_need_all_regs
.endm

/******************************************************************************/

/*
 * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
 * into a planar a8r8g8b8 format (with a, r, g, b color components
 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
 *
 * Warning: the conversion is destructive and the original
 * value (in) is lost.
*/
.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
    shrn        &out_r&.8b, &in&.8h, #8         /* bits 15..8: R5 + top 3 bits of G */
    shrn        &out_g&.8b, &in&.8h, #3         /* bits 10..3: G6 + top 2 bits of B */
    sli         &in&.8h, &in&.8h, #5            /* in = (in << 5) | (in & 0x1f): B5 at bits 9..5 and 4..0 */
    movi        &out_a&.8b, #255                /* fully opaque alpha */
    sri         &out_r&.8b, &out_r&.8b, #5      /* keep R5, fill low 3 bits with its top 3 bits */
    sri         &out_g&.8b, &out_g&.8b, #6      /* keep G6, fill low 2 bits with its top 2 bits */
    shrn        &out_b&.8b, &in&.8h, #2         /* bits 9..2: B5 followed by its top 3 bits */
.endm

/*
 * Same as convert_0565_to_8888, but no alpha register is produced.
 * The conversion is destructive as well: the value of (in) is lost.
 */
.macro convert_0565_to_x888 in, out_r, out_g, out_b
    shrn        &out_r&.8b, &in&.8h, #8         /* bits 15..8: R5 + top 3 bits of G */
    shrn        &out_g&.8b, &in&.8h, #3         /* bits 10..3: G6 + top 2 bits of B */
    sli         &in&.8h, &in&.8h, #5            /* in = (in << 5) | (in & 0x1f): B5 at bits 9..5 and 4..0 */
    sri         &out_r&.8b, &out_r&.8b, #5      /* keep R5, fill low 3 bits with its top 3 bits */
    sri         &out_g&.8b, &out_g&.8b, #6      /* keep G6, fill low 2 bits with its top 2 bits */
    shrn        &out_b&.8b, &in&.8h, #2         /* bits 9..2: B5 followed by its top 3 bits */
.endm

/*
 * Conversion from planar r8g8b8 color components (in 64-bit registers
 * in_r, in_g, in_b respectively; there is no alpha input, the r5g6b5
 * format has no room for it) into 8 r5g6b5 pixels packed in 128-bit
 * register (out). Requires two temporary 128-bit registers (tmp1, tmp2)
 *
 * Each component is first widened to 16 bits and moved into the top byte
 * of its lane. 'shll ..., #8' (shift left long by the element size) does
 * this in a single instruction; it yields the same value as the
 * 'ushll ..., #7' + 'shl ..., #1' pair, which is needed only because the
 * immediate of 'ushll' cannot exceed 7 for 8-bit source elements.
 */
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
    shll        &tmp1&.8h, &in_g&.8b, #8        /* tmp1 = G << 8 */
    shll        &out&.8h, &in_r&.8b, #8         /* out  = R << 8 */
    shll        &tmp2&.8h, &in_b&.8b, #8        /* tmp2 = B << 8 */
    sri         &out&.8h, &tmp1&.8h, #5         /* keep R5 in bits 15..11, G below it */
    sri         &out&.8h, &tmp2&.8h, #11        /* keep R5:G6, B5 lands in bits 4..0 */
.endm

/*
 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
 * returned in (out0, out1) registers pair. Requires one temporary
 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
 * value from 'in' is lost
 */
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
    shl         &out0&.4h, &in&.4h, #5          /* G top 6 bits */
    shl         &tmp&.4h, &in&.4h, #11          /* B top 5 bits */
    sri         &in&.4h, &in&.4h, #5            /* R is ready in top bits */
    sri         &out0&.4h, &out0&.4h, #6        /* G is ready in top bits */
    sri         &tmp&.4h, &tmp&.4h, #5          /* B is ready in top bits */
    ushr        &out1&.4h, &in&.4h, #8          /* R is in place */
    sri         &out0&.4h, &tmp&.4h, #8         /* G & B is in place */
    zip1        &tmp&.4h, &out0&.4h, &out1&.4h  /* everything is in place */
    zip2        &out1&.4h, &out0&.4h, &out1&.4h /* interleave the upper halves: pixels 2 and 3 */
    mov         &out0&.d[0], &tmp&.d[0]         /* pixels 0 and 1 move from tmp to out0 */
.endm