4 files changed, 385 insertions, 0 deletions
diff --git a/src/third-party/base64/lib/arch/neon64/codec.c b/src/third-party/base64/lib/arch/neon64/codec.c
new file mode 100644
index 0000000..fc953b2
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon64/codec.c
@@ -0,0 +1,92 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#ifdef __aarch64__	// only consider this codec on AArch64 targets
+#  if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON64	// NEON advertised and HAVE_NEON64 nonzero
+#    define BASE64_USE_NEON64	// gates all NEON64 code in this file
+#  endif
+#endif	// __aarch64__
+
+#ifdef BASE64_USE_NEON64
+#include <arm_neon.h>
+
+// Only enable inline assembly on supported compilers (GCC and Clang).
+#if defined(__GNUC__) || defined(__clang__)
+#define BASE64_NEON64_USE_ASM	// when unset, the intrinsics fallbacks are used
+#endif
+
+static inline uint8x16x4_t
+load_64byte_table (const uint8_t *p)
+{
+	// Load the 64-byte lookup table at `p' into four 16-byte vectors.
+#ifdef BASE64_NEON64_USE_ASM
+	// Force the table to be loaded into contiguous registers. GCC will not
+	// normally allocate contiguous registers for a `uint8x16x4_t'. These
+	// registers are chosen to not conflict with the ones in the enc loop.
+	register uint8x16_t t0 __asm__ ("v8");
+	register uint8x16_t t1 __asm__ ("v9");
+	register uint8x16_t t2 __asm__ ("v10");
+	register uint8x16_t t3 __asm__ ("v11");
+	__asm__ (
+		"ld1 {%[t0].16b, %[t1].16b, %[t2].16b, %[t3].16b}, [%[src]], #64 \n\t"
+		: [src] "+r" (p),
+		  [t0]  "=w" (t0),
+		  [t1]  "=w" (t1),
+		  [t2]  "=w" (t2),
+		  [t3]  "=w" (t3)
+		: : "memory"	// no inputs; the asm reads the table through `src'
+	);
+
+	return (uint8x16x4_t) {
+		.val[0] = t0,
+		.val[1] = t1,
+		.val[2] = t2,
+		.val[3] = t3,
+	};
+#else
+	return vld1q_u8_x4(p);
+#endif
+}
+
+#include "../generic/32/dec_loop.c"
+#include "../generic/64/enc_loop.c"
+#include "dec_loop.c"
+#include "enc_reshuffle.c"
+#include "enc_loop.c"
+
+#endif	// BASE64_USE_NEON64
+
+// The stride of the NEON 64-bit loops is so large (48 bytes per round when
+// encoding, 64 bytes when decoding) that the generic 64-bit (encode) and
+// 32-bit (decode) loops are also run inline to stay performant on small inputs.
+
+BASE64_ENC_FUNCTION(neon64)
+{
+#ifdef BASE64_USE_NEON64
+	#include "../generic/enc_head.c"
+	enc_loop_neon64(&s, &slen, &o, &olen);	// whole 48-byte rounds
+	enc_loop_generic_64(&s, &slen, &o, &olen);	// generic 64-bit loop, called on the same state
+	#include "../generic/enc_tail.c"
+#else
+	BASE64_ENC_STUB	// used when NEON64 support is not compiled in
+#endif
+}
+
+BASE64_DEC_FUNCTION(neon64)
+{
+#ifdef BASE64_USE_NEON64
+	#include "../generic/dec_head.c"
+	dec_loop_neon64(&s, &slen, &o, &olen);	// whole 64-byte rounds; stops at a block with invalid input
+	dec_loop_generic_32(&s, &slen, &o, &olen);	// generic 32-bit loop, called on the same state
+	#include "../generic/dec_tail.c"
+#else
+	BASE64_DEC_STUB	// used when NEON64 support is not compiled in
+#endif
+}
diff --git a/src/third-party/base64/lib/arch/neon64/dec_loop.c b/src/third-party/base64/lib/arch/neon64/dec_loop.c
new file mode 100644
index 0000000..48232f2
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon64/dec_loop.c
@@ -0,0 +1,129 @@
+// The input consists of five valid character sets in the Base64 alphabet,
+// which we need to map back to the 6-bit values they represent.
+// There are three ranges, two singles, and then there's the rest.
+//
+//   #  From       To        LUT  Characters
+//   1  [0..42]    [255]      #1  invalid input
+//   2  [43]       [62]       #1  +
+//   3  [44..46]   [255]      #1  invalid input
+//   4  [47]       [63]       #1  /
+//   5  [48..57]   [52..61]   #1  0..9
+//   6  [58..63]   [255]      #1  invalid input
+//   7  [64]       [255]      #2  invalid input
+//   8  [65..90]   [0..25]    #2  A..Z
+//   9  [91..96]   [255]      #2  invalid input
+//  10  [97..122]  [26..51]   #2  a..z
+//  11  [123..126] [255]      #2  invalid input
+// (12) Everything else => invalid input
+
+// The first LUT is indexed directly by the input byte and is looked up with
+// the TBL instruction (vqtbl4q_u8): out of range indices yield 0.
+static const uint8_t dec_lut1[] = {
+	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
+	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
+	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,  62U, 255U, 255U, 255U,  63U,
+	 52U,  53U,  54U,  55U,  56U,  57U,  58U,  59U,  60U,  61U, 255U, 255U, 255U, 255U, 255U, 255U,
+};
+
+// The second LUT is looked up with the TBX instruction (vqtbx4q_u8): out of
+// range indices leave the destination unchanged. Input [64..126] is mapped to
+// index [1..63] in this LUT. Index 0 means that the value comes from LUT #1.
+static const uint8_t dec_lut2[] = {
+	  0U, 255U,   0U,   1U,   2U,   3U,   4U,   5U,   6U,   7U,   8U,   9U,  10U,  11U,  12U,  13U,
+	 14U,  15U,  16U,  17U,  18U,  19U,  20U,  21U,  22U,  23U,  24U,  25U, 255U, 255U, 255U, 255U,
+	255U, 255U,  26U,  27U,  28U,  29U,  30U,  31U,  32U,  33U,  34U,  35U,  36U,  37U,  38U,  39U,
+	 40U,  41U,  42U,  43U,  44U,  45U,  46U,  47U,  48U,  49U,  50U,  51U, 255U, 255U, 255U, 255U,
+};
+
+// All input values in range for the first look-up will be 0U in the second
+// look-up result. All input values out of range for the first look-up will be
+// 0U in the first look-up result. Thus, the two results can be ORed without
+// conflicts.
+//
+// Invalid characters that are in the valid range for either look-up will be
+// set to 255U in the combined result. Other invalid characters will just be
+// passed through with the second look-up result (using the VTBX instruction).
+// Since the second LUT is 64 bytes, those passed-through values are guaranteed
+// to have a value greater than 63U. Therefore, valid characters will be mapped
+// to the valid [0..63] range and all invalid characters will be mapped to
+// values greater than 63.
+
+static inline void
+dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	// Decode whole 64-byte blocks from `*s' into 48-byte blocks at `*o',
+	// stopping before the first block that contains an invalid character.
+	if (*slen < 64) {
+		return;
+	}
+
+	// Unlike the SSE codecs, no extra trailing zero bytes are written, so
+	// it is not necessary to reserve extra input bytes:
+	size_t rounds = *slen / 64;
+
+	*slen -= rounds * 64;	// 64 bytes consumed per round
+	*olen += rounds * 48;	// 48 bytes produced per round
+
+	const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1);
+	const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2);
+
+	do {
+		const uint8x16_t n63 = vdupq_n_u8(63U);	// LUT #2 offset, max valid value
+		uint8x16x4_t dec1, dec2;
+		uint8x16x3_t dec;
+
+		// Load 64 bytes and deinterleave:
+		uint8x16x4_t str = vld4q_u8(*s);
+
+		// Get indices for second LUT (input 0..63 saturates to index 0):
+		dec2.val[0] = vqsubq_u8(str.val[0], n63);
+		dec2.val[1] = vqsubq_u8(str.val[1], n63);
+		dec2.val[2] = vqsubq_u8(str.val[2], n63);
+		dec2.val[3] = vqsubq_u8(str.val[3], n63);
+
+		// Get values from first LUT:
+		dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
+		dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
+		dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
+		dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);
+
+		// Get values from second LUT (out of range indices pass through):
+		dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
+		dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
+		dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
+		dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);
+
+		// Get final values:
+		str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
+		str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
+		str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
+		str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);
+
+		// Check for invalid input, any value larger than 63:
+		const uint8x16_t classified = vorrq_u8(
+			vorrq_u8(vcgtq_u8(str.val[0], n63), vcgtq_u8(str.val[1], n63)),
+			vorrq_u8(vcgtq_u8(str.val[2], n63), vcgtq_u8(str.val[3], n63))
+		);
+
+		// Stop before this block if any byte was flagged as invalid:
+		if (vmaxvq_u8(classified) != 0U) {
+			break;
+		}
+
+		// Compress four bytes into three:
+		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
+		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
+		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
+
+		// Interleave and store decoded result:
+		vst3q_u8(*o, dec);
+
+		*s += 64;
+		*o += 48;
+
+	} while (--rounds > 0);
+
+	// Undo the accounting for rounds skipped because of an early break:
+	*slen += rounds * 64;
+	*olen -= rounds * 48;
+}
diff --git a/src/third-party/base64/lib/arch/neon64/enc_loop.c b/src/third-party/base64/lib/arch/neon64/enc_loop.c
new file mode 100644
index 0000000..d1862f7
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon64/enc_loop.c
@@ -0,0 +1,133 @@
+#ifdef BASE64_NEON64_USE_ASM
+static inline void
+enc_loop_neon64_inner_asm (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
+{
+	// This function duplicates the functionality of enc_loop_neon64_inner,
+	// but entirely with inline assembly. This gives a significant speedup
+	// over using NEON intrinsics, which do not always generate very good
+	// code. The logic of the assembly is directly lifted from the
+	// intrinsics version, so it can be used as a guide to this code.
+
+	// Temporary registers, used as scratch space.
+	uint8x16_t tmp0, tmp1, tmp2, tmp3;
+
+	// Mask that keeps the low six bits of every byte.
+	const uint8x16_t n63 = vdupq_n_u8(63);
+
+	__asm__ (
+		// Load 48 bytes and deinterleave. The bytes are loaded to
+		// hard-coded registers v12, v13 and v14, to ensure that they
+		// are contiguous. Increment the source pointer.
+		"ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t"
+
+		// Reshuffle the bytes using temporaries.
+		"ushr %[t0].16b, v12.16b,   #2         \n\t"
+		"ushr %[t1].16b, v13.16b,   #4         \n\t"
+		"ushr %[t2].16b, v14.16b,   #6         \n\t"
+		"sli  %[t1].16b, v12.16b,   #4         \n\t"
+		"sli  %[t2].16b, v13.16b,   #2         \n\t"
+		"and  %[t1].16b, %[t1].16b, %[n63].16b \n\t"
+		"and  %[t2].16b, %[t2].16b, %[n63].16b \n\t"
+		"and  %[t3].16b, v14.16b,   %[n63].16b \n\t"
+
+		// Translate the values to the Base64 alphabet.
+		"tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t"
+		"tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t"
+		"tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t"
+		"tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t"
+
+		// Store 64 bytes and interleave. Increment the dest pointer.
+		"st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t"
+
+		// Outputs (modified).
+		: [src] "+r"  (*s),
+		  [dst] "+r"  (*o),
+		  [t0]  "=&w" (tmp0),
+		  [t1]  "=&w" (tmp1),
+		  [t2]  "=&w" (tmp2),
+		  [t3]  "=&w" (tmp3)
+
+		// Inputs (not modified).
+		: [n63] "w" (n63),
+		  [l0]  "w" (tbl_enc.val[0]),
+		  [l1]  "w" (tbl_enc.val[1]),
+		  [l2]  "w" (tbl_enc.val[2]),
+		  [l3]  "w" (tbl_enc.val[3])
+
+		// Clobbers. "memory" is needed because the asm reads through
+		// `src' and writes through `dst', which are not memory operands.
+		: "v12", "v13", "v14", "v15", "memory"
+	);
+}
+#endif
+
+static inline void
+enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
+{
+	// Encode one round: 48 input bytes at `*s' become 64 output bytes at
+	// `*o', and both pointers are advanced past the data processed.
+#ifndef BASE64_NEON64_USE_ASM
+	// Load 48 bytes, deinterleaved, and divide the bits of every three
+	// input bytes over four output bytes:
+	const uint8x16x4_t idx = enc_reshuffle(vld3q_u8(*s));
+	uint8x16x4_t enc;
+
+	// Translate the values 0..63 to the Base64 alphabet with a 64-byte
+	// table lookup:
+	enc.val[0] = vqtbl4q_u8(tbl_enc, idx.val[0]);
+	enc.val[1] = vqtbl4q_u8(tbl_enc, idx.val[1]);
+	enc.val[2] = vqtbl4q_u8(tbl_enc, idx.val[2]);
+	enc.val[3] = vqtbl4q_u8(tbl_enc, idx.val[3]);
+
+	// Interleave and store the 64 output bytes, then advance:
+	vst4q_u8(*o, enc);
+	*s += 48;
+	*o += 64;
+#else
+	// The inline assembly version performs the same steps.
+	enc_loop_neon64_inner_asm(s, o, tbl_enc);
+#endif
+}
+
+static inline void
+enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	// Encode every whole 48-byte block of the input and leave the
+	// remaining (fewer than 48) bytes unprocessed.
+	size_t todo = *slen / 48;
+
+	// Account up front for everything the rounds below will process:
+	*slen -= todo * 48;	// 48 bytes consumed per round
+	*olen += todo * 64;	// 64 bytes produced per round
+
+	// Load the encoding table:
+	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
+
+	// The rounds are unrolled by hand: first in batches of eight, then
+	// at most one batch of four, one of two, and one single round.
+	for (; todo >= 8; todo -= 8) {
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+	}
+	if (todo >= 4) {
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		todo -= 4;
+	}
+	if (todo >= 2) {
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		todo -= 2;
+	}
+	if (todo > 0) {
+		enc_loop_neon64_inner(s, o, tbl_enc);
+	}
+}
diff --git a/src/third-party/base64/lib/arch/neon64/enc_reshuffle.c b/src/third-party/base64/lib/arch/neon64/enc_reshuffle.c
new file mode 100644
index 0000000..ea543e0
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon64/enc_reshuffle.c
@@ -0,0 +1,31 @@
+static inline uint8x16x4_t
+enc_reshuffle (const uint8x16x3_t in)
+{
+	// Redistribute the 3x8 bits of every input byte triple (a, b, c)
+	// over four 6-bit values, one per output vector.
+	//
+	// Input:
+	// in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
+	// in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
+	// in[2]  = c7 c6 c5 c4 c3 c2 c1 c0
+	//
+	// Output:
+	// out[0] = 00 00 a7 a6 a5 a4 a3 a2
+	// out[1] = 00 00 a1 a0 b7 b6 b5 b4
+	// out[2] = 00 00 b3 b2 b1 b0 c7 c6
+	// out[3] = 00 00 c5 c4 c3 c2 c1 c0
+
+	const uint8x16_t low6 = vdupq_n_u8(0x3F);
+	uint8x16x4_t out;
+
+	out.val[0] = vshrq_n_u8(in.val[0], 2);	// top two bits already clear
+
+	// Shift the low part into place, insert the bits of the previous
+	// input byte above it, then clear the high two bits:
+	out.val[1] = vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), low6);
+	out.val[2] = vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), low6);
+
+	// The last value is simply the low six bits of the third byte:
+	out.val[3] = vandq_u8(in.val[2], low6);
+	return out;
+}