path: root/src/third-party/base64/lib/arch/neon32/enc_loop.c
#ifdef BASE64_NEON32_USE_ASM
static inline void
enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
{
	// This function duplicates the functionality of enc_loop_neon32_inner,
	// but entirely with inline assembly. This gives a significant speedup
	// over using NEON intrinsics, which do not always generate very good
	// code. The logic of the assembly is directly lifted from the
	// intrinsics version, so it can be used as a guide to this code.

	// Temporary registers, used as scratch space.
	uint8x16_t tmp0, tmp1, tmp2, tmp3;
	uint8x16_t mask0, mask1, mask2, mask3;

	// A lookup table containing the absolute offsets for all ranges.
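	// Index 0 maps values 0..25 to 'A'..'Z' (offset +65), index 1 maps
	// 26..51 to 'a'..'z' (+71), indices 2..11 map 52..61 to '0'..'9'
	// (+252, i.e. -4 mod 256), index 12 maps 62 to '+' (+237, i.e. -19)
	// and index 13 maps 63 to '/' (+240, i.e. -16).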
	const uint8x16_t lut = {
		  65U,  71U, 252U, 252U,
		 252U, 252U, 252U, 252U,
		 252U, 252U, 252U, 252U,
		 237U, 240U,   0U,   0U
	};

	// Numeric constants.
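	// n63 masks results down to 6 bits; n25 and n51 are the upper bounds
	// of the uppercase and lowercase ranges used to form the LUT indices.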
	const uint8x16_t n51 = vdupq_n_u8(51);
	const uint8x16_t n25 = vdupq_n_u8(25);
	const uint8x16_t n63 = vdupq_n_u8(63);

	__asm__ (

		// Load 48 bytes and deinterleave. The bytes are loaded to
		// hard-coded registers q12, q13 and q14, to ensure that they
		// are contiguous. Increment the source pointer.
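		// Each q register is a pair of d registers (q12 = {d24, d25},
		// q13 = {d26, d27}, q14 = {d28, d29}), so the first vld3 fills
		// the low halves with 24 bytes and the second fills the high
		// halves with the next 24.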
		"vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
		"vld3.8 {d25, d27, d29}, [%[src]]! \n\t"

		// Reshuffle the bytes using temporaries.
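		// For each input triple (a, b, c) this computes the four
		// 6-bit values t0 = a >> 2, t1 = ((a << 4) | (b >> 4)) & 63,
		// t2 = ((b << 2) | (c >> 6)) & 63 and t3 = c & 63.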
		"vshr.u8 %q[t0], q12,    #2      \n\t"
		"vshr.u8 %q[t1], q13,    #4      \n\t"
		"vshr.u8 %q[t2], q14,    #6      \n\t"
		"vsli.8  %q[t1], q12,    #4      \n\t"
		"vsli.8  %q[t2], q13,    #2      \n\t"
		"vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
		"vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
		"vand.u8 %q[t3], q14,    %q[n63] \n\t"

		// t0..t3 are the reshuffled inputs. Create LUT indices.
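		// vqsub saturates at zero, so values 0..51 all collapse to
		// index 0 here, while 52..63 become indices 1..12. The mask
		// created below adds 1 to every lane whose value exceeds 25,
		// separating the uppercase and lowercase ranges.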
		"vqsub.u8 q12, %q[t0], %q[n51] \n\t"
		"vqsub.u8 q13, %q[t1], %q[n51] \n\t"
		"vqsub.u8 q14, %q[t2], %q[n51] \n\t"
		"vqsub.u8 q15, %q[t3], %q[n51] \n\t"

		// Create the mask for range #0.
		"vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
		"vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
		"vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
		"vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"

		// The mask lanes are 0xFF (-1) wherever the value exceeds 25;
		// subtracting them adds 1 to those lanes, correcting the LUT
		// indices.
		"vsub.u8 q12, %q[m0] \n\t"
		"vsub.u8 q13, %q[m1] \n\t"
		"vsub.u8 q14, %q[m2] \n\t"
		"vsub.u8 q15, %q[m3] \n\t"

		// Lookup the delta values.
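		// vtbl takes 64-bit d registers for its index and result
		// operands, so each 128-bit q register of indices needs two
		// lookups into the 16-byte LUT.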
		"vtbl.u8 d24, {%q[lut]}, d24 \n\t"
		"vtbl.u8 d25, {%q[lut]}, d25 \n\t"
		"vtbl.u8 d26, {%q[lut]}, d26 \n\t"
		"vtbl.u8 d27, {%q[lut]}, d27 \n\t"
		"vtbl.u8 d28, {%q[lut]}, d28 \n\t"
		"vtbl.u8 d29, {%q[lut]}, d29 \n\t"
		"vtbl.u8 d30, {%q[lut]}, d30 \n\t"
		"vtbl.u8 d31, {%q[lut]}, d31 \n\t"

		// Add the delta values.
		"vadd.u8 q12, %q[t0] \n\t"
		"vadd.u8 q13, %q[t1] \n\t"
		"vadd.u8 q14, %q[t2] \n\t"
		"vadd.u8 q15, %q[t3] \n\t"

		// Store 64 bytes and interleave. Increment the dest pointer.
		"vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
		"vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"

		// Outputs (modified).
		: [src] "+r"  (*s),
		  [dst] "+r"  (*o),
		  [t0]  "=&w" (tmp0),
		  [t1]  "=&w" (tmp1),
		  [t2]  "=&w" (tmp2),
		  [t3]  "=&w" (tmp3),
		  [m0]  "=&w" (mask0),
		  [m1]  "=&w" (mask1),
		  [m2]  "=&w" (mask2),
		  [m3]  "=&w" (mask3)

		// Inputs (not modified).
		: [lut] "w" (lut),
		  [n25] "w" (n25),
		  [n51] "w" (n51),
		  [n63] "w" (n63)

		// Clobbers.
		: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
	);
}
#endif
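
// The following helper is illustrative only and not part of the upstream
// library: a minimal scalar sketch of what one 48-byte round above computes,
// assuming the standard Base64 alphabet. The function name and the plain
// char table are hypothetical; it exists purely as a readable reference for
// the vector code and is never called.
static inline void
enc_round_scalar_reference (const uint8_t *src, uint8_t *dst)
{
	static const char b64[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"abcdefghijklmnopqrstuvwxyz"
		"0123456789+/";

	int i;

	// 48 input bytes -> 16 groups of 3 bytes -> 64 output characters.
	for (i = 0; i < 16; i++) {
		const uint8_t a = src[3 * i + 0];
		const uint8_t b = src[3 * i + 1];
		const uint8_t c = src[3 * i + 2];

		dst[4 * i + 0] = (uint8_t) b64[a >> 2];
		dst[4 * i + 1] = (uint8_t) b64[((a << 4) | (b >> 4)) & 63];
		dst[4 * i + 2] = (uint8_t) b64[((b << 2) | (c >> 6)) & 63];
		dst[4 * i + 3] = (uint8_t) b64[c & 63];
	}
}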

static inline void
enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
{
#ifdef BASE64_NEON32_USE_ASM
	enc_loop_neon32_inner_asm(s, o);
#else
	// Load 48 bytes and deinterleave:
	uint8x16x3_t src = vld3q_u8(*s);

	// Reshuffle:
	uint8x16x4_t out = enc_reshuffle(src);

	// Translate reshuffled bytes to the Base64 alphabet:
	out = enc_translate(out);

	// Interleave and store output:
	vst4q_u8(*o, out);

	*s += 48;
	*o += 64;
#endif
}

static inline void
enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t rounds = *slen / 48;

	*slen -= rounds * 48;	// 48 bytes consumed per round
	*olen += rounds * 64;	// 64 bytes produced per round

	while (rounds > 0) {
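		// Manually unrolled: handle blocks of eight, four and two
		// rounds before falling back to single rounds, so the loop
		// overhead is spread over as many 48-byte rounds as possible.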
		if (rounds >= 8) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 2;
			continue;
		}
		enc_loop_neon32_inner(s, o);
		break;
	}
}
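
// Illustrative usage sketch, not part of the library: `enc_neon32_example`
// and its parameter names are hypothetical. It shows how a caller might
// drive the loop above over the whole-48-byte prefix of a buffer; the
// remaining (inlen % 48) bytes would be left for the encoder's scalar tail,
// which is outside the scope of this file.
static inline size_t
enc_neon32_example (const uint8_t *in, size_t inlen, uint8_t *out)
{
	const uint8_t *s = in;
	uint8_t *o = out;
	size_t slen = inlen;
	size_t olen = 0;

	// Consumes 48 bytes and produces 64 bytes per round; `slen` is left
	// holding the number of unprocessed tail bytes.
	enc_loop_neon32(&s, &slen, &o, &olen);

	return olen;
}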