/*
 * path: root/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
 * blob: a06b5964ef1092f3cdee96ee779e721fc9e45611
 */
/*
 * Copyright © 2008 Mozilla Corporation
 * Copyright © 2010 Nokia Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 *
 */

/*
 * The code below spells the conditional, flag-setting subtract as
 * "subpls" (condition before the S suffix).  When building with clang
 * the preprocessor rewrites it to "subspl" (S suffix before the
 * condition) -- presumably because clang's assembler only accepts the
 * unified-syntax ordering; confirm against the toolchains in use.
 */
#ifdef __clang__
#define subpls subspl
#endif

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

/*
 * Assembler state for the rest of the file:
 *  - code goes into .text and is assembled as ARM (not Thumb) code,
 *  - instructions are accepted as for armv6, while .object_arch asks the
 *    assembler to record armv4 as the architecture in the object file,
 *  - .altmacro switches on the alternate macro syntax,
 *  - .p2align 2 aligns what follows to a 4-byte (1 << 2) boundary.
 */
	.text
	.arch armv6
	.object_arch armv4
	.arm
	.altmacro
	.p2align 2

#include "pixman-arm-asm.h"

/*
 * Note: This code uses only armv5te instructions (not even armv6),
 *       but is scheduled for the ARM Cortex-A8 pipeline. So it might need
 *       to be split into a few variants, tuned for each microarchitecture.
 *
 * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
 * have efficient write combining), it needs to be changed to use 16-byte
 * aligned writes using the STM instruction.
 *
 * Nearest scanline scaler macro template uses the following arguments:
 *  fname                     - name of the function to generate
 *  bpp_shift                 - (1 << bpp_shift) is the size of pixel in bytes
 *  t                         - type suffix for LDR/STR instructions
 *  prefetch_distance         - prefetch in the source image by that many
 *                              pixels ahead
 *  prefetch_braking_distance - stop prefetching when that many pixels are
 *                              remaining before the end of scanline
 */

/*
 * generate_nearest_scanline_func
 *
 * Emits one function, named fname, that fills a destination scanline by
 * picking source pixels at a stepping fixed-point position (no blending
 * or filtering: each output pixel is a plain load followed by a store).
 *
 * Register / stack inputs of the generated function, as read by the code:
 *   r0        W               - number of destination pixels to write
 *   r1        DST             - destination pointer, advanced by one pixel
 *                               after every store
 *   r2        SRC             - base address the pixel offsets are added to
 *   r3        VX              - current source position; the integer pixel
 *                               index sits in bits 16 and up
 *   [sp]      UNIT_X          - per-pixel increment of VX (first stack word
 *                               at entry)
 *   [sp, #4]  SRC_WIDTH_FIXED - amount subtracted from VX whenever VX is
 *                               non-negative (second stack word at entry)
 *
 * VX is kept negative throughout (see the "9:" loops), so the byte
 * offsets added to SRC are negative.  NOTE(review): this implies the
 * caller passes SRC already biased past the pixels to be read -- confirm
 * against the C caller.
 *
 * Clobbers: r0-r3, ip and the flags; r4-r8 and r10 are saved and restored.
 */
.macro generate_nearest_scanline_func fname, bpp_shift, t,      \
                                      prefetch_distance,        \
                                      prefetch_braking_distance

pixman_asm_function \fname
	/* symbolic register names; released again with .unreq below */
	W		.req	r0
	DST		.req	r1
	SRC		.req	r2
	VX		.req	r3
	UNIT_X		.req	ip
	TMP1		.req	r4
	TMP2		.req	r5
	VXMASK		.req	r6
	PF_OFFS		.req	r7
	SRC_WIDTH_FIXED	.req	r8

	/* fetch the first stack argument before sp is moved by the push */
	ldr	UNIT_X, [sp]
	/*
	 * Six registers = 24 bytes.  r10 is not used by the visible code;
	 * presumably it is included only to keep the stack adjustment a
	 * multiple of 8 bytes -- TODO confirm.
	 */
	push	{r4, r5, r6, r7, r8, r10}
	/* VXMASK = ~(pixel_size - 1): clears the sub-pixel bits of an offset */
	mvn	VXMASK, #((1 << \bpp_shift) - 1)
	/* 24 bytes of saved registers + 4 = second stack word at entry */
	ldr	SRC_WIDTH_FIXED, [sp, #28]

	/*
	 * Helper macro: copy two pixels.
	 *
	 * On entry TMP1 holds the byte offset of the next pixel to copy; on
	 * exit it holds the byte offset of the pixel after the two that were
	 * copied.  The offset for the following pixel is computed between
	 * each load and its store, so TMP1 and TMP2 alternate roles.
	 *
	 * Offset computation: (VX asr (16 - bpp_shift)) & VXMASK, i.e. the
	 * integer pixel index scaled to bytes with the fractional bits
	 * masked off.
	 *
	 * Position update: VX += UNIT_X, then, while the result is not
	 * negative, VX -= SRC_WIDTH_FIXED (the "9:" loop; both the
	 * conditional subtract and the branch test the N flag).
	 */
	.macro	scale_2_pixels
		ldr\()\t	TMP1, [SRC, TMP1]
		and	TMP2, VXMASK, VX, asr #(16 - \bpp_shift)
		adds	VX, VX, UNIT_X
		str\()\t	TMP1, [DST], #(1 << \bpp_shift)
9:		subpls	VX, VX, SRC_WIDTH_FIXED
		bpl	9b

		ldr\()\t	TMP2, [SRC, TMP2]
		and	TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
		adds	VX, VX, UNIT_X
		str\()\t	TMP2, [DST], #(1 << \bpp_shift)
9:		subpls	VX, VX, SRC_WIDTH_FIXED
		bpl	9b
	.endm

	/*
	 * Now do the scaling.  Prime the pipeline first: compute the byte
	 * offset of the first pixel into TMP1 and step VX once, as
	 * scale_2_pixels expects.
	 */
	and	TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
	adds	VX, VX, UNIT_X
9:	subpls	VX, VX, SRC_WIDTH_FIXED
	bpl	9b
	/*
	 * Bias W by (8 + braking distance) so that the prefetching loop
	 * stops while at least the braking distance of pixels is left; skip
	 * the loop entirely if there are not that many pixels.
	 */
	subs	W, W, #(8 + \prefetch_braking_distance)
	blt	2f
	/* calculate prefetch offset: PF_OFFS = UNIT_X * prefetch_distance + VX */
	mov	PF_OFFS, #\prefetch_distance
	mla	PF_OFFS, UNIT_X, PF_OFFS, VX
1:	/* main loop, process 8 pixels per iteration with prefetch */
	/*
	 * PF_OFFS advances by 8 * UNIT_X per iteration and, unlike VX, is
	 * not reduced by SRC_WIDTH_FIXED.
	 */
	pld	[SRC, PF_OFFS, asr #(16 - \bpp_shift)]
	add	PF_OFFS, UNIT_X, lsl #3
	scale_2_pixels
	scale_2_pixels
	scale_2_pixels
	scale_2_pixels
	subs	W, W, #8
	bge	1b
2:
	/*
	 * Undo the (8 + braking distance) bias and subtract 4 in a single
	 * instruction: W = remaining pixels - 4.
	 */
	subs	W, W, #(4 - 8 - \prefetch_braking_distance)
	blt	2f
1:	/* process the remaining pixels, 4 per iteration, without prefetch */
	scale_2_pixels
	scale_2_pixels
	subs	W, W, #4
	bge	1b
2:
	/*
	 * W is negative here, but only multiples of 4 have been subtracted
	 * from the remaining count, so its two low bits still equal the
	 * number of pixels left (0..3).
	 */
	tst	W, #2
	beq	2f
	scale_2_pixels
2:
	/*
	 * Final odd pixel, copied conditionally; DST is not advanced.  The
	 * two variants differ only in where the condition code sits relative
	 * to the size suffix t.
	 */
	tst	W, #1
#ifdef __clang__
	ldr\()\t\()ne	TMP1, [SRC, TMP1]
	str\()\t\()ne	TMP1, [DST]
#else
	ldrne\()\t	TMP1, [SRC, TMP1]
	strne\()\t	TMP1, [DST]
#endif
	/*
	 * cleanup helper macro and register aliases so that this template
	 * can be expanded again for another function
	 */
	.purgem	scale_2_pixels
	.unreq	DST
	.unreq	SRC
	.unreq	W
	.unreq	VX
	.unreq	UNIT_X
	.unreq	TMP1
	.unreq	TMP2
	.unreq	VXMASK
	.unreq	PF_OFFS
	.unreq  SRC_WIDTH_FIXED
	/* return: restore the registers saved on entry */
	pop	{r4, r5, r6, r7, r8, r10}
	bx	lr
pixman_end_asm_function
.endm

/*
 * 2-byte pixels (bpp_shift = 1), copied with halfword loads/stores
 * (t = h); prefetch 80 pixels ahead, stop prefetching 32 pixels before
 * the end of the scanline.
 */
generate_nearest_scanline_func \
    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32

/*
 * 4-byte pixels (bpp_shift = 2), copied with word loads/stores (the t
 * argument is intentionally left empty); prefetch 48 pixels ahead, stop
 * prefetching 32 pixels before the end of the scanline.
 */
generate_nearest_scanline_func \
    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32