path: root/src/third-party/base64/lib/arch/neon64/enc_loop.c
blob: d1862f7a3aadf2c7fc74a428d742a68dba08db3e
#ifdef BASE64_NEON64_USE_ASM
static inline void
enc_loop_neon64_inner_asm (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
{
	// This function duplicates the functionality of enc_loop_neon64_inner,
	// but entirely with inline assembly. This gives a significant speedup
	// over using NEON intrinsics, which do not always generate very good
	// code. The logic of the assembly is directly lifted from the
	// intrinsics version, so it can be used as a guide to this code.

	// Temporary registers, used as scratch space.
	uint8x16_t tmp0, tmp1, tmp2, tmp3;

	// Constant for masking values to the low six bits (0..63).
	const uint8x16_t n63 = vdupq_n_u8(63);

	__asm__ (

		// Load 48 bytes and deinterleave. The bytes are loaded to
		// hard-coded registers v12, v13 and v14, to ensure that they
		// are contiguous. Increment the source pointer.
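		// After ld3, v12 holds input bytes 0, 3, 6, ..., v13 holds
		// bytes 1, 4, 7, ..., and v14 holds bytes 2, 5, 8, ..., so
		// lane k of (v12, v13, v14) is one three-byte input group.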
		"ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t"

		// Reshuffle the bytes using temporaries.
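		// For each input group (a, b, c) this computes the four
		// 6-bit indices, equivalent to the scalar expressions:
		//   t0 =   a >> 2
		//   t1 = ((a << 4) | (b >> 4)) & 63
		//   t2 = ((b << 2) | (c >> 6)) & 63
		//   t3 =   c & 63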
		"ushr %[t0].16b, v12.16b,   #2         \n\t"
		"ushr %[t1].16b, v13.16b,   #4         \n\t"
		"ushr %[t2].16b, v14.16b,   #6         \n\t"
		"sli  %[t1].16b, v12.16b,   #4         \n\t"
		"sli  %[t2].16b, v13.16b,   #2         \n\t"
		"and  %[t1].16b, %[t1].16b, %[n63].16b \n\t"
		"and  %[t2].16b, %[t2].16b, %[n63].16b \n\t"
		"and  %[t3].16b, v14.16b,   %[n63].16b \n\t"

		// Translate the values to the Base64 alphabet.
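		// The four-register form of tbl acts as a single 64-byte
		// lookup table: each 6-bit index selects one byte from the
		// concatenated l0..l3 vectors holding the Base64 alphabet.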
		"tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t"
		"tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t"
		"tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t"
		"tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t"

		// Store 64 bytes and interleave. Increment the dest pointer.
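		// st4 writes v12..v15 byte-interleaved, so the four output
		// characters of each input group land consecutively.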
		"st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t"

		// Outputs (modified).
		: [src] "+r"  (*s),
		  [dst] "+r"  (*o),
		  [t0]  "=&w" (tmp0),
		  [t1]  "=&w" (tmp1),
		  [t2]  "=&w" (tmp2),
		  [t3]  "=&w" (tmp3)

		// Inputs (not modified).
		: [n63] "w" (n63),
		  [l0]  "w" (tbl_enc.val[0]),
		  [l1]  "w" (tbl_enc.val[1]),
		  [l2]  "w" (tbl_enc.val[2]),
		  [l3]  "w" (tbl_enc.val[3])

		// Clobbers.
		: "v12", "v13", "v14", "v15"
	);
}
#endif

static inline void
enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
{
#ifdef BASE64_NEON64_USE_ASM
	enc_loop_neon64_inner_asm(s, o, tbl_enc);
#else
	// Load 48 bytes and deinterleave:
	uint8x16x3_t src = vld3q_u8(*s);

	// Divide bits of three input bytes over four output bytes:
	uint8x16x4_t out = enc_reshuffle(src);

	// The bits have now been shifted to the right locations;
	// translate their values 0..63 to the Base64 alphabet.
	// Use a 64-byte table lookup:
	out.val[0] = vqtbl4q_u8(tbl_enc, out.val[0]);
	out.val[1] = vqtbl4q_u8(tbl_enc, out.val[1]);
	out.val[2] = vqtbl4q_u8(tbl_enc, out.val[2]);
	out.val[3] = vqtbl4q_u8(tbl_enc, out.val[3]);

	// Interleave and store output:
	vst4q_u8(*o, out);

	*s += 48;
	*o += 64;
#endif
}

static inline void
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t rounds = *slen / 48;

	*slen -= rounds * 48;	// 48 bytes consumed per round
	*olen += rounds * 64;	// 64 bytes produced per round

	// Load the encoding table:
	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
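	// (base64_table_enc_6bit holds the 64 characters of the Base64
	// alphabet; splitting it across four 16-byte vectors lets the
	// tbl/vqtbl4q_u8 lookup cover all indices 0..63.)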

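	// Unroll the loop by hand: encode up to eight 48-byte blocks per
	// iteration to amortize loop overhead, falling through to smaller
	// batches as the remaining round count shrinks.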
	while (rounds > 0) {
		if (rounds >= 8) {
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			rounds -= 2;
			continue;
		}
		enc_loop_neon64_inner(s, o, tbl_enc);
		break;
	}
}