/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Ben Avison (bavison@riscosopen.org)
 *
 */

#ifdef __clang__
#define adceqs adcseq
#define ldmnedb ldmdbne
#endif

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

        .text
        .arch armv6
        .object_arch armv4
        .arm
        .altmacro
        .p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"

/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 *   cond           ARM condition code for code block
 *   numbytes       Number of output bytes that should be generated this time
 *   firstreg       First WK register in which to place output
 *   unaligned_src  Whether to use non-wordaligned loads of source image
 *   unaligned_mask Whether to use non-wordaligned loads of mask image
 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether
 *                  an extra preload should be output
 */

.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm

.macro blit_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm

.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, \unaligned_src
        pixld   , 16, 4, SRC, \unaligned_src
        pld     [SRC, SCRATCH]
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        subs    X, X, #32*8/src_bpp
        bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/******************************************************************************/

.macro src_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_0565_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #8
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro fill_process_tail  cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   \cond, \numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail
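/* For reference: a rough C model of what the src_n_*_init macros above do
 * when they replicate a narrow solid colour across a 32-bit register
 * (function names here are illustrative, not part of pixman):
 *
 *   #include <stdint.h>
 *
 *   // 0565 fill: duplicate the 16-bit colour into both halves of a word
 *   uint32_t replicate_0565(uint16_t c)
 *   {
 *       return (uint32_t)c | ((uint32_t)c << 16);       // orr SRC, SRC, lsl #16
 *   }
 *
 *   // 8bpp fill: replicate the byte into all four byte lanes
 *   uint32_t replicate_8(uint8_t c)
 *   {
 *       uint32_t w = (uint32_t)c | ((uint32_t)c << 8);  // orr SRC, SRC, lsl #8
 *       return w | (w << 16);                           // orr SRC, SRC, lsl #16
 *   }
 *
 * The replicated word is then copied into STRIDE_S/MASK/STRIDE_M so that
 * fill_process_tail can store up to 16 bytes per iteration.
 */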
/******************************************************************************/

.macro src_x888_8888_pixel, cond, reg
        orr\()\cond  WK\()\reg, WK\()\reg, #0xFF000000
.endm

.macro pixman_composite_src_x888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   \cond, \numbytes, \firstreg, SRC, \unaligned_src
.endm

.macro pixman_composite_src_x888_8888_process_tail  cond, numbytes, firstreg
        src_x888_8888_pixel  \cond, %(\firstreg+0)
 .if \numbytes >= 8
        src_x888_8888_pixel  \cond, %(\firstreg+1)
  .if \numbytes == 16
        src_x888_8888_pixel  \cond, %(\firstreg+2)
        src_x888_8888_pixel  \cond, %(\firstreg+3)
  .endif
 .endif
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail
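/* For reference: src_x888_8888 just forces the alpha byte on; a one-line
 * C model (illustrative only):
 *
 *   #include <stdint.h>
 *
 *   uint32_t x888_to_8888(uint32_t x)
 *   {
 *       return x | 0xFF000000u;   // orr WKn, WKn, #0xFF000000
 *   }
 */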
/******************************************************************************/

.macro src_0565_8888_init
        /* Hold loop invariants in MASK and STRIDE_M */
        ldr     MASK, =0x07E007E0
        mov     STRIDE_M, #0xFF000000
        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        ldr     SCRATCH, =0x80008000
        uadd8   SCRATCH, SCRATCH, SCRATCH
.endm

.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK\()\reg1, MASK                    @ 00000GGGGGG0000000000gggggg00000
        bic     WK\()\reg2, WK\()\reg1, MASK                 @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6            @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK\()\reg1, WK\()\reg2, lsl #16              @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19                    @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16  @ RRRRR000000BBBBB0000000000000000
        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5   @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5   @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5   @ rrrrrrrr--------bbbbbbbb--------
        sel     WK\()\reg1, WK\()\reg1, SCRATCH              @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16                    @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5   @ RRRRRRRR--------BBBBBBBB--------
        sel     WK\()\reg2, WK\()\reg2, SCRATCH              @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8     @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8     @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* This version doesn't need STRIDE_M, but is one instruction longer.
   It would however be preferable for an XRGB target, since we could
   knock off the last 2 instructions, but is that a common case?
        and     SCRATCH, WK\()\reg1, MASK                    @ 00000GGGGGG0000000000gggggg00000
        bic     WK\()\reg1, WK\()\reg1, MASK                 @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6            @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK\()\reg2, WK\()\reg1, lsr #16              @ 0000000000000000RRRRR000000BBBBB
        mov     SCRATCH, SCRATCH, ror #27                    @ GGGGGGGGGGGG0000ggggggggggg00000
        bic     WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16  @ 0000000000000000rrrrr000000bbbbb
        mov     WK\()\reg2, WK\()\reg2, lsl #3               @ 0000000000000RRRRR000000BBBBB000
        mov     WK\()\reg1, WK\()\reg1, lsl #3               @ 0000000000000rrrrr000000bbbbb000
        orr     WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5   @ 0000000000000RRRRRRRRRR0BBBBBBBB
        orr     WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5   @ 0000000000000rrrrrrrrrr0bbbbbbbb
        pkhbt   WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5   @ --------RRRRRRRR--------BBBBBBBB
        pkhbt   WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5   @ --------rrrrrrrr--------bbbbbbbb
        sel     WK\()\reg2, SCRATCH, WK\()\reg2              @ --------RRRRRRRRGGGGGGGGBBBBBBBB
        sel     WK\()\reg1, SCRATCH, WK\()\reg1              @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK\()\reg2, WK\()\reg2, #0xFF000000          @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
        orr     WK\()\reg1, WK\()\reg1, #0xFF000000          @ 11111111rrrrrrrrggggggggbbbbbbbb
 */

.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK\()\reg, MASK                     @ 0000000000000000rrrrr000000bbbbb
        and     WK\()\reg, WK\()\reg, MASK                   @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3                     @ 0000000000000rrrrr000000bbbbb000
        mov     WK\()\reg, WK\()\reg, lsl #5                 @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5            @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK\()\reg, WK\()\reg, WK\()\reg, lsr #6      @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5            @ --------rrrrrrrr--------bbbbbbbb
        sel     WK\()\reg, WK\()\reg, SCRATCH                @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK\()\reg, WK\()\reg, #0xFF000000            @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm

.macro src_0565_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if \numbytes == 16
        pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
 .elseif \numbytes == 8
        pixld   , 4, \firstreg, SRC, \unaligned_src
 .elseif \numbytes == 4
        pixld   , 2, \firstreg, SRC, \unaligned_src
 .endif
.endm

.macro src_0565_8888_process_tail  cond, numbytes, firstreg
 .if \numbytes == 16
        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
        src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
 .elseif \numbytes == 8
        src_0565_8888_2pixels \firstreg, %(\firstreg+1)
 .else
        src_0565_8888_1pixel \firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail
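/* For reference: a scalar C model of the 0565 -> 8888 conversion performed
 * above. The high bits of each field are replicated into the low bits so
 * that 0 maps to 0x00 and the field maximum maps to 0xFF exactly (function
 * name is illustrative, not part of pixman):
 *
 *   #include <stdint.h>
 *
 *   uint32_t convert_0565_to_8888(uint16_t p)
 *   {
 *       uint32_t r = (p >> 11) & 0x1F;
 *       uint32_t g = (p >> 5)  & 0x3F;
 *       uint32_t b = p         & 0x1F;
 *       r = (r << 3) | (r >> 2);   // 5 -> 8 bits
 *       g = (g << 2) | (g >> 4);   // 6 -> 8 bits
 *       b = (b << 3) | (b >> 2);   // 5 -> 8 bits
 *       return 0xFF000000u | (r << 16) | (g << 8) | b;
 *   }
 */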
/******************************************************************************/

.macro src_x888_0565_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x001F001F
        line_saved_regs  STRIDE_S, ORIG_W
.endm

.macro src_x888_0565_1pixel  s, d
        and     WK\()\d, MASK, WK\()\s, lsr #3               @ 00000000000rrrrr00000000000bbbbb
        and     STRIDE_S, WK\()\s, #0xFC00                   @ 0000000000000000gggggg0000000000
        orr     WK\()\d, WK\()\d, WK\()\d, lsr #5            @ 00000000000-----rrrrr000000bbbbb
        orr     WK\()\d, WK\()\d, STRIDE_S, lsr #5           @ 00000000000-----rrrrrggggggbbbbb
        /* Top 16 bits are discarded during the following STRH */
.endm

.macro src_x888_0565_2pixels  slo, shi, d, tmp
        and     SCRATCH, WK\()\shi, #0xFC00                  @ 0000000000000000GGGGGG0000000000
        and     WK\()\tmp, MASK, WK\()\shi, lsr #3           @ 00000000000RRRRR00000000000BBBBB
        and     WK\()\shi, MASK, WK\()\slo, lsr #3           @ 00000000000rrrrr00000000000bbbbb
        orr     WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5      @ 00000000000-----RRRRR000000BBBBB
        orr     WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5        @ 00000000000-----RRRRRGGGGGGBBBBB
        and     SCRATCH, WK\()\slo, #0xFC00                  @ 0000000000000000gggggg0000000000
        orr     WK\()\shi, WK\()\shi, WK\()\shi, lsr #5      @ 00000000000-----rrrrr000000bbbbb
        orr     WK\()\shi, WK\()\shi, SCRATCH, lsr #5        @ 00000000000-----rrrrrggggggbbbbb
        pkhbt   WK\()\d, WK\()\shi, WK\()\tmp, lsl #16       @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
.endm

.macro src_x888_0565_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_S
    WK5     .req    STRIDE_M
    WK6     .req    WK3
    WK7     .req    ORIG_W
 .if \numbytes == 16
        pixld   , 16, 4, SRC, 0
        src_x888_0565_2pixels  4, 5, 0, 0
        pixld   , 8, 4, SRC, 0
        src_x888_0565_2pixels  6, 7, 1, 1
        pixld   , 8, 6, SRC, 0
 .else
        pixld   , \numbytes*2, 4, SRC, 0
 .endif
.endm

.macro src_x888_0565_process_tail  cond, numbytes, firstreg
 .if \numbytes == 16
        src_x888_0565_2pixels  4, 5, 2, 2
        src_x888_0565_2pixels  6, 7, 3, 4
 .elseif \numbytes == 8
        src_x888_0565_2pixels  4, 5, 1, 1
        src_x888_0565_2pixels  6, 7, 2, 2
 .elseif \numbytes == 4
        src_x888_0565_2pixels  4, 5, 1, 1
 .else
        src_x888_0565_1pixel  4, 1
 .endif
 .if \numbytes == 16
        pixst   , \numbytes, 0, DST
 .else
        pixst   , \numbytes, 1, DST
 .endif
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    src_x888_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_x888_0565_process_head, \
    src_x888_0565_process_tail
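/* For reference: the x888 -> 0565 direction simply truncates each field to
 * its top bits; a scalar C model (illustrative only):
 *
 *   #include <stdint.h>
 *
 *   uint16_t convert_x888_to_0565(uint32_t p)
 *   {
 *       return (uint16_t)(((p >> 8) & 0xF800)    // top 5 bits of red
 *                       | ((p >> 5) & 0x07E0)    // top 6 bits of green
 *                       | ((p >> 3) & 0x001F));  // top 5 bits of blue
 *   }
 *
 * src_x888_0565_2pixels above produces the same result for two pixels at
 * once, packed into one word with PKHBT.
 */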
/******************************************************************************/

.macro add_8_8_8pixels  cond, dst1, dst2
        uqadd8\()\cond  WK\()\dst1, WK\()\dst1, MASK
        uqadd8\()\cond  WK\()\dst2, WK\()\dst2, STRIDE_M
.endm

.macro add_8_8_4pixels  cond, dst
        uqadd8\()\cond  WK\()\dst, WK\()\dst, MASK
.endm

.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
 .if \numbytes == 16
        pixld   \cond, 8, 4, SRC, \unaligned_src
        pixld   \cond, 16, \firstreg, DST, 0
        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
        pixld   \cond, 8, 4, SRC, \unaligned_src
 .else
        pixld   \cond, \numbytes, 4, SRC, \unaligned_src
        pixld   \cond, \numbytes, \firstreg, DST, 0
 .endif
    .unreq  WK4
    .unreq  WK5
.endm

.macro add_8_8_process_tail  cond, numbytes, firstreg
 .if \numbytes == 16
        add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
 .elseif \numbytes == 8
        add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
 .else
        add_8_8_4pixels \cond, \firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail

/******************************************************************************/

.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        pixld   , \numbytes, %(4+\firstreg), SRC, \unaligned_src
        pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
        /* Since these colours are premultiplied by alpha, only 0 indicates
         * transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK\()\reg0, #0
 .if \numbytes > 4
        teqeq   WK\()\reg1, #0
  .if \numbytes > 8
        teqeq   WK\()\reg2, #0
        teqeq   WK\()\reg3, #0
  .endif
 .endif
.endm

.macro over_8888_8888_prepare  next
        mov     WK\()\next, WK\()\next, lsr #24
.endm

.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK\()\src, WK\()\src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK\()\dst
        uxtb16  WK\()\dst, WK\()\dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK\()\src, MASK
        mla     WK\()\dst, WK\()\dst, WK\()\src, MASK
        /* Where we would have had a stall between the result of the first MLA
         * and the shifter input, reload the complete source pixel */
        ldr     WK\()\src, [SRC, #\offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
 .if \offset < -4
        mov     WK\()\next, WK\()\next, lsr #24
 .endif
        uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK\()\dst, SCRATCH, WK\()\dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
.endm

.macro over_8888_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
        beq     10f
        over_8888_8888_prepare %(4+\firstreg)
 .set PROCESS_REG, \firstreg
 .set PROCESS_OFF, -\numbytes
 .rept \numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
        pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail
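/* For reference: a scalar C model of the OVER operation implemented above,
 * including the 257/256 trick used in place of a true division by 255
 * (names are illustrative, not part of pixman):
 *
 *   #include <stdint.h>
 *
 *   // round(d * a / 255) for d, a in [0,255]: add 0.5, then multiply by
 *   // 257/65536, which the macros do with MLA (bias) + UXTAB16 ROR #8
 *   static uint8_t mul_div_255(uint32_t d, uint32_t a)
 *   {
 *       uint32_t t = d * a + 0x80;
 *       return (uint8_t)((t + (t >> 8)) >> 8);
 *   }
 *
 *   // OVER: result = src + dst * (255 - src_alpha) / 255, per channel
 *   uint32_t over_pixel(uint32_t src, uint32_t dst)
 *   {
 *       uint32_t ia = 255 - (src >> 24);   // inverse source alpha
 *       uint32_t out = 0;
 *       for (int sh = 0; sh < 32; sh += 8) {
 *           uint32_t c = ((src >> sh) & 0xFF)
 *                      + mul_div_255((dst >> sh) & 0xFF, ia);
 *           out |= (c > 255 ? 255 : c) << sh;   // UQADD8 saturates
 *       }
 *       return out;
 *   }
 */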
/******************************************************************************/

/* Multiply each byte of a word by a byte.
 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 * word  Register containing 4 bytes
 * byte  Register containing byte multiplier (bits 8-31 must be 0)
 * tmp   Scratch register
 * half  Register containing the constant 0x00800080
 * GE[3:0] bits must contain 0101
 */
.macro mul_8888_8  word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  \tmp, \word
        uxtb16  \word, \word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     \tmp, \tmp, \byte, \half
        mla     \word, \word, \byte, \half /* 1 stall follows */
        uxtab16 \tmp, \tmp, \tmp, ror #8   /* 1 stall follows */
        uxtab16 \word, \word, \word, ror #8
        /* Recombine bytes */
        mov     \tmp, \tmp, ror #8
        sel     \word, \tmp, \word
.endm

/******************************************************************************/

.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        pixld   , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
        pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_n_8888_1pixel  src, dst
        mul_8888_8  WK\()\src, MASK, SCRATCH, STRIDE_M
        sub     WK7, WK6, WK\()\src, lsr #24
        mul_8888_8  WK\()\dst, WK7, SCRATCH, STRIDE_M
        uqadd8  WK\()\dst, WK\()\dst, WK\()\src
.endm

.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
        beq     10f
        mov     WK6, #255
 .set PROCESS_REG, \firstreg
 .rept \numbytes / 4
  .if \numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
  .endif
        over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail
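/* For reference: over_8888_n_8888 above computes (src IN mask) OVER dest,
 * where the mask contributes only its constant alpha. A scalar C model,
 * reusing the mul_div_255() and over_pixel() helpers sketched after
 * over_8888_8888 (names are illustrative, not part of pixman):
 *
 *   uint32_t over_8888_n_8888(uint32_t src, uint32_t dst, uint32_t mask_a)
 *   {
 *       uint32_t s = 0;
 *       for (int sh = 0; sh < 32; sh += 8)   // src IN mask: scale all lanes
 *           s |= (uint32_t)mul_div_255((src >> sh) & 0xFF, mask_a) << sh;
 *       return over_pixel(s, dst);           // then a plain OVER
 *   }
 *
 * over_n_8_8888 below is the mirror image: the source is constant and the
 * per-pixel alpha comes from an 8bpp mask instead.
 */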
/******************************************************************************/

.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        b       1f
 .ltorg
1:
.endm

.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
        pixld   , \numbytes/4, 4, MASK, \unaligned_mask
        pixld   , \numbytes, \firstreg, DST, 0
    .unreq  WK4
.endm

.macro over_n_8_8888_1pixel  src, dst
        uxtb    Y, WK4, ror #\src*8
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
        mla     Y, SRC, Y, STRIDE_D
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sub     ORIG_W, ORIG_W, Y, lsr #24
        sel     Y, SCRATCH, Y
        /* Then multiply the destination */
        mul_8888_8  WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
        uqadd8  WK\()\dst, WK\()\dst, Y
.endm

.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_M
        teq     WK4, #0
        beq     10f
 .set PROCESS_REG, \firstreg
 .rept \numbytes / 4
        over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , \numbytes, \firstreg, DST
10:
    .unreq  WK4
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail

/******************************************************************************/

.macro over_reverse_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     MASK, =0x00800080
        /* Split source pixel into RB/AG parts */
        uxtb16  STRIDE_S, SRC
        uxtb16  STRIDE_M, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, ORIG_W
.endm

.macro over_reverse_n_8888_newline
        mov     STRIDE_D, #0xFF
.endm

.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , \numbytes, \firstreg, DST, 0
.endm

.macro over_reverse_n_8888_1pixel  d, is_only
        teq     WK\()\d, #0
        beq     8f       /* replace with source */
        bics    ORIG_W, STRIDE_D, WK\()\d, lsr #24
 .if \is_only == 1
        beq     49f      /* skip store */
 .else
        beq     9f       /* write same value back */
 .endif
        mla     SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
        mla     ORIG_W, STRIDE_M, ORIG_W, MASK  /* alpha/green */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sel     ORIG_W, SCRATCH, ORIG_W
        uqadd8  WK\()\d, WK\()\d, ORIG_W
        b       9f
8:      mov     WK\()\d, SRC
9:
.endm

.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
 .if \numbytes == 4
        over_reverse_n_8888_1pixel \reg1, 1
 .else
        and     SCRATCH, WK\()\reg1, WK\()\reg2
  .if \numbytes == 16
        and     SCRATCH, SCRATCH, WK\()\reg3
        and     SCRATCH, SCRATCH, WK\()\reg4
  .endif
        mvns    SCRATCH, SCRATCH, asr #24
        beq     49f      /* skip store if all opaque */
        over_reverse_n_8888_1pixel \reg1, 0
        over_reverse_n_8888_1pixel \reg2, 0
  .if \numbytes == 16
        over_reverse_n_8888_1pixel \reg3, 0
        over_reverse_n_8888_1pixel \reg4, 0
  .endif
 .endif
        pixst   , \numbytes, \reg1, DST
49:
.endm

.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
        over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    over_reverse_n_8888_init, \
    over_reverse_n_8888_newline, \
    nop_macro, /* cleanup */ \
    over_reverse_n_8888_process_head, \
    over_reverse_n_8888_process_tail
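/* For reference: OVER_REVERSE keeps the destination on top, so the source
 * only shows through where the destination is not already opaque. A scalar
 * C model, reusing mul_div_255() from the earlier sketch (illustrative
 * only):
 *
 *   uint32_t over_reverse_pixel(uint32_t src, uint32_t dst)
 *   {
 *       uint32_t ia = 255 - (dst >> 24);   // inverse *destination* alpha
 *       uint32_t out = 0;
 *       for (int sh = 0; sh < 32; sh += 8) {
 *           uint32_t c = ((dst >> sh) & 0xFF)
 *                      + mul_div_255((src >> sh) & 0xFF, ia);
 *           out |= (c > 255 ? 255 : c) << sh;
 *       }
 *       return out;
 *   }
 *
 * This is why the tail tests above can skip the store entirely when all
 * destination pixels are opaque (ia == 0 leaves dest unchanged).
 */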
/******************************************************************************/

.macro over_white_8888_8888_ca_init
    HALF    .req    SRC
    TMP0    .req    STRIDE_D
    TMP1    .req    STRIDE_S
    TMP2    .req    STRIDE_M
    TMP3    .req    ORIG_W
    WK4     .req    SCRATCH
        line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
        ldr     SCRATCH, =0x800080
        mov     HALF, #0x80
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
 .set DST_PRELOAD_BIAS, 8
.endm

.macro over_white_8888_8888_ca_cleanup
 .set DST_PRELOAD_BIAS, 0
    .unreq  HALF
    .unreq  TMP0
    .unreq  TMP1
    .unreq  TMP2
    .unreq  TMP3
    .unreq  WK4
.endm

.macro over_white_8888_8888_ca_combine  m, d
        uxtb16  TMP1, TMP0                /* rb_notmask */
        uxtb16  TMP2, \d                  /* rb_dest; 1 stall follows */
        smlatt  TMP3, TMP2, TMP1, HALF    /* red */
        smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
        uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
        uxtb16  TMP1, \d, ror #8          /* ag_dest; 1 stall follows */
        smlatt  \d, TMP1, TMP0, HALF      /* alpha */
        smlabb  TMP1, TMP1, TMP0, HALF    /* green */
        pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
        pkhbt   TMP1, TMP1, \d, lsl #16   /* ag */
        uxtab16 TMP0, TMP0, TMP0, ror #8
        uxtab16 TMP1, TMP1, TMP1, ror #8
        mov     TMP0, TMP0, ror #8
        sel     \d, TMP0, TMP1
        uqadd8  \d, \d, \m                /* d is a late result */
.endm

.macro over_white_8888_8888_ca_1pixel_head
        pixld   , 4, 1, MASK, 0
        pixld   , 4, 3, DST, 0
.endm

.macro over_white_8888_8888_ca_1pixel_tail
        mvn     TMP0, WK1
        teq     WK1, WK1, asr #32
        bne     01f
        bcc     03f
        mov     WK3, WK1
        b       02f
01:     over_white_8888_8888_ca_combine WK1, WK3
02:     pixst   , 4, 3, DST
03:
.endm

.macro over_white_8888_8888_ca_2pixels_head
        pixld   , 8, 1, MASK, 0
.endm

.macro over_white_8888_8888_ca_2pixels_tail
        pixld   , 8, 3, DST
        mvn     TMP0, WK1
        teq     WK1, WK1, asr #32
        bne     01f
        movcs   WK3, WK1
        bcs     02f
        teq     WK2, #0
        beq     05f
        b       02f
01:     over_white_8888_8888_ca_combine WK1, WK3
02:     mvn     TMP0, WK2
        teq     WK2, WK2, asr #32
        bne     03f
        movcs   WK4, WK2
        b       04f
03:     over_white_8888_8888_ca_combine WK2, WK4
04:     pixst   , 8, 3, DST
05:
.endm

.macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if \numbytes == 4
        over_white_8888_8888_ca_1pixel_head
 .else
  .if \numbytes == 16
        over_white_8888_8888_ca_2pixels_head
        over_white_8888_8888_ca_2pixels_tail
  .endif
        over_white_8888_8888_ca_2pixels_head
 .endif
.endm

.macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
 .if \numbytes == 4
        over_white_8888_8888_ca_1pixel_tail
 .else
        over_white_8888_8888_ca_2pixels_tail
 .endif
.endm

generate_composite_function \
    pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH \
    2, /* prefetch distance */ \
    over_white_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_white_8888_8888_ca_cleanup, \
    over_white_8888_8888_ca_process_head, \
    over_white_8888_8888_ca_process_tail
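/* For reference: with a white source, component-alpha OVER degenerates to a
 * per-channel blend between the mask and the destination. A scalar C model
 * of one channel, reusing mul_div_255() as before (illustrative only):
 *
 *   // m, d are single 8-bit channels of mask and dest
 *   static uint8_t over_white_channel(uint8_t m, uint8_t d)
 *   {
 *       return (uint8_t)(m + mul_div_255(d, 255 - m));
 *   }
 *
 * The TEQ/ASR #32 test above detects the two easy cases first: mask all
 * zeroes (dest unchanged, store skipped) and mask all ones (dest becomes
 * 0xFFFFFFFF, i.e. white).
 */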
.macro over_n_8888_8888_ca_init
        /* Set up constants. RB_SRC and AG_SRC are in registers;
         * RB_FLDS, A_SRC, and the two HALF values need to go on the
         * stack (and the full SRC value is already there) */
        ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET]
        mov     WK0, #0x00FF0000
        orr     WK0, WK0, #0xFF          /* RB_FLDS (0x00FF00FF) */
        mov     WK1, #0x80               /* HALF default value */
        mov     WK2, SCRATCH, lsr #24    /* A_SRC */
        orr     WK3, WK1, WK1, lsl #16   /* HALF alternate value (0x00800080) */
        push    {WK0-WK3}
 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
        uxtb16  SRC, SCRATCH
        uxtb16  STRIDE_S, SCRATCH, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, WK3, WK3
    .unreq  WK0
    .unreq  WK1
    .unreq  WK2
    .unreq  WK3
    WK0     .req    Y
    WK1     .req    STRIDE_D
    RB_SRC  .req    SRC
    AG_SRC  .req    STRIDE_S
    WK2     .req    STRIDE_M
    RB_FLDS .req    r8   /* the reloaded constants have to be at consecutive registers starting at an even one */
    A_SRC   .req    r8
    HALF    .req    r9
    WK3     .req    r10
    WK4     .req    r11
    WK5     .req    SCRATCH
    WK6     .req    ORIG_W
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

.macro over_n_8888_8888_ca_cleanup
        add     sp, sp, #16
 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
    .unreq  WK0
    .unreq  WK1
    .unreq  RB_SRC
    .unreq  AG_SRC
    .unreq  WK2
    .unreq  RB_FLDS
    .unreq  A_SRC
    .unreq  HALF
    .unreq  WK3
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    WK0     .req    r8
    WK1     .req    r9
    WK2     .req    r10
    WK3     .req    r11
.endm

.macro over_n_8888_8888_ca_1pixel_head
        pixld   , 4, 6, MASK, 0
        pixld   , 4, 0, DST, 0
.endm

.macro over_n_8888_8888_ca_1pixel_tail
        ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
        uxtb16  WK1, WK6                 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
        teq     WK6, WK6, asr #32        /* Zc if transparent, ZC if opaque */
        bne     20f
        bcc     40f
        /* Mask is fully opaque (all channels) */
        ldr     WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
        eors    A_SRC, A_SRC, #0xFF
        bne     10f
        /* Source is also opaque - same as src_8888_8888 */
        mov     WK0, WK6
        b       30f
10:     /* Same as over_8888_8888 */
        mul_8888_8  WK0, A_SRC, WK5, HALF
        uqadd8  WK0, WK0, WK6
        b       30f
20:     /* No simplifications possible - do it the hard way */
        uxtb16  WK2, WK6, ror #8         /* ag_mask */
        mla     WK3, WK1, A_SRC, HALF    /* rb_mul; 2 cycles */
        mla     WK4, WK2, A_SRC, HALF    /* ag_mul; 2 cycles */
        ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
        uxtb16  WK5, WK0                 /* rb_dest */
        uxtab16 WK3, WK3, WK3, ror #8
        uxtb16  WK6, WK0, ror #8         /* ag_dest */
        uxtab16 WK4, WK4, WK4, ror #8
        smlatt  WK0, RB_SRC, WK1, HALF   /* red1 */
        smlabb  WK1, RB_SRC, WK1, HALF   /* blue1 */
        bic     WK3, RB_FLDS, WK3, lsr #8
        bic     WK4, RB_FLDS, WK4, lsr #8
        pkhbt   WK1, WK1, WK0, lsl #16   /* rb1 */
        smlatt  WK0, WK5, WK3, HALF      /* red2 */
        smlabb  WK3, WK5, WK3, HALF      /* blue2 */
        uxtab16 WK1, WK1, WK1, ror #8
        smlatt  WK5, AG_SRC, WK2, HALF   /* alpha1 */
        pkhbt   WK3, WK3, WK0, lsl #16   /* rb2 */
        smlabb  WK0, AG_SRC, WK2, HALF   /* green1 */
        smlatt  WK2, WK6, WK4, HALF      /* alpha2 */
        smlabb  WK4, WK6, WK4, HALF      /* green2 */
        pkhbt   WK0, WK0, WK5, lsl #16   /* ag1 */
        uxtab16 WK3, WK3, WK3, ror #8
        pkhbt   WK4, WK4, WK2, lsl #16   /* ag2 */
        uxtab16 WK0, WK0, WK0, ror #8
        uxtab16 WK4, WK4, WK4, ror #8
        mov     WK1, WK1, ror #8
        mov     WK3, WK3, ror #8
        sel     WK2, WK1, WK0            /* recombine source*mask */
        sel     WK1, WK3, WK4            /* recombine dest*(1-source_alpha*mask) */
        uqadd8  WK0, WK1, WK2            /* followed by 1 stall */
30:     /* The destination buffer is already in the L1 cache, so
         * there's little point in amalgamating writes */
        pixst   , 4, 0, DST
40:
.endm

.macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .rept (\numbytes / 4) - 1
        over_n_8888_8888_ca_1pixel_head
        over_n_8888_8888_ca_1pixel_tail
 .endr
        over_n_8888_8888_ca_1pixel_head
.endm

.macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg
        over_n_8888_8888_ca_1pixel_tail
.endm

pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
        ldr     ip, [sp]
        cmp     ip, #-1
        beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
        /* else drop through... */
 pixman_end_asm_function

generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
    2, /* prefetch distance */ \
    over_n_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_n_8888_8888_ca_cleanup, \
    over_n_8888_8888_ca_process_head, \
    over_n_8888_8888_ca_process_tail
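/* For reference: the general component-alpha OVER computed above, as a
 * scalar C model (illustrative only; mul_div_255() as sketched earlier):
 *
 *   uint32_t over_n_8888_8888_ca(uint32_t src, uint32_t mask, uint32_t dst)
 *   {
 *       uint32_t sa = src >> 24;
 *       uint32_t out = 0;
 *       for (int sh = 0; sh < 32; sh += 8) {
 *           uint32_t s = (src >> sh) & 0xFF;
 *           uint32_t m = (mask >> sh) & 0xFF;
 *           uint32_t d = (dst >> sh) & 0xFF;
 *           uint32_t c = mul_div_255(s, m)
 *                      + mul_div_255(d, 255 - mul_div_255(sa, m));
 *           out |= (c > 255 ? 255 : c) << sh;
 *       }
 *       return out;
 *   }
 *
 * The dispatcher above branches to the specialised white-source version
 * when the source is 0xFFFFFFFF, since that case simplifies so much.
 */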
/******************************************************************************/

.macro in_reverse_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        /* Offset the source pointer: we only need the alpha bytes */
        add     SRC, SRC, #3
        line_saved_regs ORIG_W
.endm

.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
        ldrb    ORIG_W, [SRC], #4
 .if \numbytes >= 8
        ldrb    WK\()\reg1, [SRC], #4
  .if \numbytes == 16
        ldrb    WK\()\reg2, [SRC], #4
        ldrb    WK\()\reg3, [SRC], #4
  .endif
 .endif
        add     DST, DST, #\numbytes
.endm

.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        in_reverse_8888_8888_head  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
.endm

.macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
 .if \is_only != 1
        movs    \s, ORIG_W
  .if \offset != 0
        ldrb    ORIG_W, [SRC, #\offset]
  .endif
        beq     01f
        teq     STRIDE_M, #0xFF
        beq     02f
 .endif
        uxtb16  SCRATCH, \d              /* rb_dest */
        uxtb16  \d, \d, ror #8           /* ag_dest */
        mla     SCRATCH, SCRATCH, \s, MASK
        mla     \d, \d, \s, MASK
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 \d, \d, \d, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sel     \d, SCRATCH, \d
        b       02f
 .if \offset == 0
48:     /* Last mov d,#0 of the set - used as part of shortcut for
         * source values all 0 */
 .endif
01:     mov     \d, #0
02:
.endm

.macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
 .if \numbytes == 4
        teq     ORIG_W, ORIG_W, asr #32
        ldrne   WK\()\reg1, [DST, #-4]
 .elseif \numbytes == 8
        teq     ORIG_W, WK\()\reg1
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmnedb DST, {WK\()\reg1-WK\()\reg2}
 .else
        teq     ORIG_W, WK\()\reg1
        teqeq   ORIG_W, WK\()\reg2
        teqeq   ORIG_W, WK\()\reg3
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmnedb DST, {WK\()\reg1-WK\()\reg4}
 .endif
        cmnne   DST, #0   /* clear C if NE */
        bcs     49f       /* no writes to dest if source all -1 */
        beq     48f       /* set dest to all 0 if source all 0 */
 .if \numbytes == 4
        in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
        str     WK\()\reg1, [DST, #-4]
 .elseif \numbytes == 8
        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
        stmdb   DST, {WK\()\reg1-WK\()\reg2}
 .else
        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
        in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
        stmdb   DST, {WK\()\reg1-WK\()\reg4}
 .endif
49:
.endm

.macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
        in_reverse_8888_8888_tail  \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
.endm

generate_composite_function \
    pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
    2, /* prefetch distance */ \
    in_reverse_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    in_reverse_8888_8888_process_head, \
    in_reverse_8888_8888_process_tail
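/* For reference: IN_REVERSE scales every destination channel by the source
 * alpha, which is why the head above fetches only the alpha bytes (SRC is
 * pre-offset by 3). A scalar C model, reusing mul_div_255() from the
 * earlier sketch (illustrative only):
 *
 *   uint32_t in_reverse_pixel(uint32_t src, uint32_t dst)
 *   {
 *       uint32_t sa = src >> 24;
 *       uint32_t out = 0;
 *       for (int sh = 0; sh < 32; sh += 8)
 *           out |= (uint32_t)mul_div_255((dst >> sh) & 0xFF, sa) << sh;
 *       return out;
 *   }
 *
 * Source alpha 0xFF leaves the destination untouched (store skipped) and
 * 0 clears it, which the tail above detects without any multiplies.
 */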
/******************************************************************************/

.macro over_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Hold multiplier for destination in STRIDE_M */
        mov     STRIDE_M, #255
        sub     STRIDE_M, STRIDE_M, SRC, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
.endm

.macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , \numbytes, \firstreg, DST, 0
.endm

.macro over_n_8888_1pixel  dst
        mul_8888_8  WK\()\dst, STRIDE_M, SCRATCH, MASK
        uqadd8  WK\()\dst, WK\()\dst, SRC
.endm

.macro over_n_8888_process_tail  cond, numbytes, firstreg
 .set PROCESS_REG, \firstreg
 .rept \numbytes / 4
        over_n_8888_1pixel %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , \numbytes, \firstreg, DST
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \
    2, /* prefetch distance */ \
    over_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_n_8888_process_head, \
    over_n_8888_process_tail

/******************************************************************************/