Diffstat (limited to 'src/third-party/base64/lib/arch/avx2')
-rw-r--r--  src/third-party/base64/lib/arch/avx2/codec.c           42
-rw-r--r--  src/third-party/base64/lib/arch/avx2/dec_loop.c       110
-rw-r--r--  src/third-party/base64/lib/arch/avx2/dec_reshuffle.c   34
-rw-r--r--  src/third-party/base64/lib/arch/avx2/enc_loop.c        89
-rw-r--r--  src/third-party/base64/lib/arch/avx2/enc_reshuffle.c   83
-rw-r--r--  src/third-party/base64/lib/arch/avx2/enc_translate.c   30
6 files changed, 388 insertions, 0 deletions
diff --git a/src/third-party/base64/lib/arch/avx2/codec.c b/src/third-party/base64/lib/arch/avx2/codec.c
new file mode 100644
index 0000000..0498548
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_AVX2
+#include <immintrin.h>
+
+#include "dec_reshuffle.c"
+#include "dec_loop.c"
+#include "enc_translate.c"
+#include "enc_reshuffle.c"
+#include "enc_loop.c"
+
+#endif // HAVE_AVX2
+
+BASE64_ENC_FUNCTION(avx2)
+{
+#if HAVE_AVX2
+ #include "../generic/enc_head.c"
+ enc_loop_avx2(&s, &slen, &o, &olen);
+ #include "../generic/enc_tail.c"
+#else
+ BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(avx2)
+{
+#if HAVE_AVX2
+ #include "../generic/dec_head.c"
+ dec_loop_avx2(&s, &slen, &o, &olen);
+ #include "../generic/dec_tail.c"
+#else
+ BASE64_DEC_STUB
+#endif
+}
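
Note: codec.c above only wires the AVX2 loops into the library's codec table; applications reach them through the public entry points. A minimal usage sketch, assuming the base64_encode() prototype declared in include/libbase64.h (flags = 0 lets the runtime feature check in codecs.c select the AVX2 codec on capable CPUs):

#include <stdio.h>
#include <string.h>
#include "libbase64.h"

int main(void)
{
	char out[64];
	size_t outlen;
	const char *src = "hello world";

	// flags = 0: let runtime CPU-feature detection pick a codec:
	base64_encode(src, strlen(src), out, &outlen, 0);
	printf("%.*s\n", (int) outlen, out);
	return 0;
}
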
diff --git a/src/third-party/base64/lib/arch/avx2/dec_loop.c b/src/third-party/base64/lib/arch/avx2/dec_loop.c
new file mode 100644
index 0000000..f959fc4
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/dec_loop.c
@@ -0,0 +1,110 @@
+static inline int
+dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+{
+ const __m256i lut_lo = _mm256_setr_epi8(
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+ const __m256i lut_hi = _mm256_setr_epi8(
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+ const __m256i lut_roll = _mm256_setr_epi8(
+ 0, 16, 19, 4, -65, -65, -71, -71,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 16, 19, 4, -65, -65, -71, -71,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+
+ const __m256i mask_2F = _mm256_set1_epi8(0x2F);
+
+ // Load input:
+ __m256i str = _mm256_loadu_si256((__m256i *) *s);
+
+ // See the SSSE3 decoder for an explanation of the algorithm.
+ const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
+ const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
+ const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+ const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
+
+ if (!_mm256_testz_si256(lo, hi)) {
+ return 0;
+ }
+
+ const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
+ const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
+
+ // Now simply add the delta values to the input:
+ str = _mm256_add_epi8(str, roll);
+
+	// Reshuffle the input to the packed output format (12 data bytes
+	// per 128-bit lane, 24 in total):
+ str = dec_reshuffle(str);
+
+	// Store the output (a full 32-byte store; the last 8 bytes are zeros):
+ _mm256_storeu_si256((__m256i *) *o, str);
+
+ *s += 32;
+ *o += 24;
+ *rounds -= 1;
+
+ return 1;
+}
+
+static inline void
+dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+ if (*slen < 45) {
+ return;
+ }
+
+ // Process blocks of 32 bytes per round. Because 8 extra zero bytes are
+ // written after the output, ensure that there will be at least 13
+ // bytes of input data left to cover the gap. (11 data bytes and up to
+ // two end-of-string markers.)
+ size_t rounds = (*slen - 13) / 32;
+
+ *slen -= rounds * 32; // 32 bytes consumed per round
+ *olen += rounds * 24; // 24 bytes produced per round
+
+ do {
+ if (rounds >= 8) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ if (rounds >= 4) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ if (rounds >= 2) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ dec_loop_avx2_inner(s, o, &rounds);
+ break;
+
+ } while (rounds > 0);
+
+ // Adjust for any rounds that were skipped:
+ *slen += rounds * 32;
+ *olen -= rounds * 24;
+}
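
Note: the nibble-based validation and delta lookup in dec_loop_avx2_inner are easier to follow one byte at a time. Below is a scalar model (a reading aid, not part of this change; dec_byte_scalar is a hypothetical name, table values copied from above). Each lut_lo/lut_hi entry is a bitmask of invalid character classes, so a byte belongs to the Base64 alphabet only if the two masks share no set bits; the decoded value is then the byte plus a delta selected by its high nibble, with '/' (0x2F) nudged into its own slot:

#include <stdint.h>

// Returns the 6-bit value of Base64 character c, or -1 if invalid:
static int dec_byte_scalar(uint8_t c)
{
	static const uint8_t lut_lo[16] = {
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
	};
	static const uint8_t lut_hi[16] = {
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	};
	static const int8_t lut_roll[16] = {
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0,
	};

	const uint8_t hi = c >> 4;
	const uint8_t lo = c & 0x0F;

	// Mirrors the _mm256_testz_si256(lo, hi) early-out:
	if (lut_lo[lo] & lut_hi[hi]) {
		return -1;
	}

	// Mirrors _mm256_add_epi8(eq_2F, hi_nibbles) as an index, where
	// eq_2F is -1 exactly when c == '/':
	return c + lut_roll[hi - (c == 0x2F)];
}

For example, 'a' (0x61) passes the mask test and gets roll -71, decoding to 26; '=' (0x3D) hits overlapping mask bits and is rejected, which is why padding stops the vector loop.
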
diff --git a/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c b/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c
new file mode 100644
index 0000000..f351809
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c
@@ -0,0 +1,34 @@
+static inline __m256i
+dec_reshuffle (const __m256i in)
+{
+	// in, lower 128-bit lane (the upper lane is laid out identically);
+	// upper case are most significant bits, lower case are least
+	// significant bits:
+ // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+ // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+ // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+ // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+ const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
+ // 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+ // 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+ // 0000eeee FFffffff 0000DDDD DDddEEEE
+ // 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+ __m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
+ // 00000000 JJJJJJjj KKKKkkkk LLllllll
+ // 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+ // 00000000 DDDDDDdd EEEEeeee FFffffff
+ // 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+ // Pack bytes together in each lane:
+ out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
+ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
+ // 00000000 00000000 00000000 00000000
+ // LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+ // HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+ // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+
+ // Pack lanes:
+ return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
+}
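
Note: the two multiply-add steps in dec_reshuffle are, per 32-bit group, nothing more than shifts and ORs applied to all lanes at once. A scalar model of one 4-sextet group (illustrative only; reshuffle_group is a hypothetical name, and v[0] is the sextet from the lowest-addressed input byte):

#include <stdint.h>

static void reshuffle_group(const uint8_t v[4], uint8_t out[3])
{
	// maddubs with 0x01400140: per byte pair, first * 64 + second:
	const uint16_t ab = (uint16_t) ((v[0] << 6) | v[1]); // 12 bits
	const uint16_t cd = (uint16_t) ((v[2] << 6) | v[3]); // 12 bits

	// madd with 0x00011000: per word pair, first * 4096 + second:
	const uint32_t abcd = ((uint32_t) ab << 12) | cd;    // 24 bits

	// The byte shuffle (2, 1, 0, ...) then emits the three payload
	// bytes most significant first:
	out[0] = (uint8_t) (abcd >> 16);
	out[1] = (uint8_t) (abcd >> 8);
	out[2] = (uint8_t) abcd;
}
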
diff --git a/src/third-party/base64/lib/arch/avx2/enc_loop.c b/src/third-party/base64/lib/arch/avx2/enc_loop.c
new file mode 100644
index 0000000..b9e2736
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/enc_loop.c
@@ -0,0 +1,89 @@
+static inline void
+enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
+{
+	// The first load is done at s - 0 (not s - 4) so that the read
+	// cannot start before the input buffer:
+ __m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+ // Shift by 4 bytes, as required by enc_reshuffle:
+ src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
+
+ // Reshuffle, translate, store:
+ src = enc_reshuffle(src);
+ src = enc_translate(src);
+ _mm256_storeu_si256((__m256i *) *o, src);
+
+	// Subsequent loads will be done at s - 4; set the pointer for the next round:
+ *s += 20;
+ *o += 32;
+}
+
+static inline void
+enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
+{
+ // Load input:
+ __m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+ // Reshuffle, translate, store:
+ src = enc_reshuffle(src);
+ src = enc_translate(src);
+ _mm256_storeu_si256((__m256i *) *o, src);
+
+ *s += 24;
+ *o += 32;
+}
+
+static inline void
+enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+ if (*slen < 32) {
+ return;
+ }
+
+	// Process blocks of 24 bytes at a time. Because blocks are loaded
+	// 32 bytes at a time at an offset of -4, ensure that at least 4
+	// bytes remain after the last round, so that the final read does
+	// not pass beyond the bounds of the input buffer:
+ size_t rounds = (*slen - 4) / 24;
+
+ *slen -= rounds * 24; // 24 bytes consumed per round
+ *olen += rounds * 32; // 32 bytes produced per round
+
+ // The first loop iteration requires special handling to ensure that
+ // the read, which is done at an offset, does not underflow the buffer:
+ enc_loop_avx2_inner_first(s, o);
+ rounds--;
+
+ while (rounds > 0) {
+ if (rounds >= 8) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 8;
+ continue;
+ }
+ if (rounds >= 4) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 4;
+ continue;
+ }
+ if (rounds >= 2) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 2;
+ continue;
+ }
+ enc_loop_avx2_inner(s, o);
+ break;
+ }
+
+ // Add the offset back:
+ *s += 4;
+}
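
Note: the rounds formula above encodes the bounds argument directly: the special first round reads bytes [0, 32), each later round i reads [24*i - 4, 24*i + 28), so the final read ends at 24*rounds + 4, and rounds = (slen - 4) / 24 keeps that inside the buffer. A small self-check of the arithmetic (hypothetical test code, not part of this change):

#include <assert.h>
#include <stddef.h>

static void check_enc_loop_bounds(size_t slen)
{
	if (slen < 32) {
		return; // enc_loop_avx2 takes no vector rounds below 32 bytes
	}
	const size_t rounds = (slen - 4) / 24;

	// End of the last 32-byte load; the first round reads [0, 32),
	// inner round i reads [24 * i - 4, 24 * i + 28):
	const size_t last_load_end = (rounds == 1) ? 32 : 24 * rounds + 4;
	assert(last_load_end <= slen);
}
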
diff --git a/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c b/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c
new file mode 100644
index 0000000..ba16690
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c
@@ -0,0 +1,83 @@
+static inline __m256i
+enc_reshuffle (const __m256i input)
+{
+	// Translation of the SSSE3 reshuffling algorithm to AVX2. This
+	// version takes its input shifted by 4 bytes so that it can operate
+	// efficiently within the two 128-bit lanes.
+
+ // Input, bytes MSB to LSB:
+ // 0 0 0 0 x w v u t s r q p o n m
+ // l k j i h g f e d c b a 0 0 0 0
+
+ const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
+ 10, 11, 9, 10,
+ 7, 8, 6, 7,
+ 4, 5, 3, 4,
+ 1, 2, 0, 1,
+
+ 14, 15, 13, 14,
+ 11, 12, 10, 11,
+ 8, 9, 7, 8,
+ 5, 6, 4, 5));
+ // in, bytes MSB to LSB:
+ // w x v w
+ // t u s t
+ // q r p q
+ // n o m n
+ // k l j k
+ // h i g h
+ // e f d e
+ // b c a b
+
+ const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0FC0FC00));
+ // bits, upper case are most significant bits, lower case are least
+ // significant bits.
+ // 0000wwww XX000000 VVVVVV00 00000000
+ // 0000tttt UU000000 SSSSSS00 00000000
+ // 0000qqqq RR000000 PPPPPP00 00000000
+ // 0000nnnn OO000000 MMMMMM00 00000000
+ // 0000kkkk LL000000 JJJJJJ00 00000000
+ // 0000hhhh II000000 GGGGGG00 00000000
+ // 0000eeee FF000000 DDDDDD00 00000000
+ // 0000bbbb CC000000 AAAAAA00 00000000
+
+ const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+ // 00000000 00wwwwXX 00000000 00VVVVVV
+ // 00000000 00ttttUU 00000000 00SSSSSS
+ // 00000000 00qqqqRR 00000000 00PPPPPP
+ // 00000000 00nnnnOO 00000000 00MMMMMM
+ // 00000000 00kkkkLL 00000000 00JJJJJJ
+ // 00000000 00hhhhII 00000000 00GGGGGG
+ // 00000000 00eeeeFF 00000000 00DDDDDD
+ // 00000000 00bbbbCC 00000000 00AAAAAA
+
+ const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003F03F0));
+ // 00000000 00xxxxxx 000000vv WWWW0000
+ // 00000000 00uuuuuu 000000ss TTTT0000
+ // 00000000 00rrrrrr 000000pp QQQQ0000
+ // 00000000 00oooooo 000000mm NNNN0000
+ // 00000000 00llllll 000000jj KKKK0000
+ // 00000000 00iiiiii 000000gg HHHH0000
+ // 00000000 00ffffff 000000dd EEEE0000
+ // 00000000 00cccccc 000000aa BBBB0000
+
+ const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+ // 00xxxxxx 00000000 00vvWWWW 00000000
+ // 00uuuuuu 00000000 00ssTTTT 00000000
+ // 00rrrrrr 00000000 00ppQQQQ 00000000
+ // 00oooooo 00000000 00mmNNNN 00000000
+ // 00llllll 00000000 00jjKKKK 00000000
+ // 00iiiiii 00000000 00ggHHHH 00000000
+ // 00ffffff 00000000 00ddEEEE 00000000
+ // 00cccccc 00000000 00aaBBBB 00000000
+
+ return _mm256_or_si256(t1, t3);
+ // 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
+ // 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
+ // 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
+ // 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
+ // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+ // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+ // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+ // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+}
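
Note: multiplying by a power of two and keeping either the high half (mulhi) or the low half (mullo) of each 16-bit word is just a per-word shift, so the masked-multiply steps above reduce to plain shifting in scalar code. A model of one 32-bit lane (illustrative only; enc_split_sextets is a hypothetical name, and the input dword uses the post-shuffle byte layout from the first diagram):

#include <stdint.h>

static uint32_t enc_split_sextets(uint32_t in)
{
	// t1: fields moving right. mulhi by 0x0400 (high word) and
	// 0x0040 (low word) shifts them right by 6 and 10 respectively:
	const uint32_t t0 = in & 0x0FC0FC00u;
	const uint32_t t1 = (((t0 >> 16) >> 6) << 16) | ((t0 & 0xFFFFu) >> 10);

	// t3: fields moving left. mullo by 0x0100 (high word) and
	// 0x0010 (low word) shifts them left by 8 and 4 respectively:
	const uint32_t t2 = in & 0x003F03F0u;
	const uint32_t t3 = (((t2 >> 16) << 8) << 16) | ((t2 & 0xFFFFu) << 4);

	// Each output byte now holds one 6-bit index (00xxxxxx):
	return t1 | t3;
}
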
diff --git a/src/third-party/base64/lib/arch/avx2/enc_translate.c b/src/third-party/base64/lib/arch/avx2/enc_translate.c
new file mode 100644
index 0000000..46173cd
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/enc_translate.c
@@ -0,0 +1,30 @@
+static inline __m256i
+enc_translate (const __m256i in)
+{
+ // A lookup table containing the absolute offsets for all ranges:
+ const __m256i lut = _mm256_setr_epi8(
+ 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
+ 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
+
+ // Translate values 0..63 to the Base64 alphabet. There are five sets:
+ // # From To Abs Index Characters
+ // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ
+ // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz
+ // 2 [52..61] [48..57] -4 [2..11] 0123456789
+ // 3 [62] [43] -19 12 +
+ // 4 [63] [47] -16 13 /
+
+	// Create LUT indices from the input. The index for range #0 is
+	// already correct; the others are one less than expected:
+ __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
+
+ // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+ const __m256i mask = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));
+
+	// Subtract the mask (-1), i.e. add 1 to the indices for ranges
+	// #[1..4]. All indices are now correct:
+ indices = _mm256_sub_epi8(indices, mask);
+
+ // Add offsets to input values:
+ return _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
+}
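
Note: the same translation, one value at a time (a scalar reading aid; enc_translate_scalar is a hypothetical name, lut entries copied from above). The saturating subtract and the signed compare together map each of the five ranges onto its own LUT slot:

#include <stdint.h>

// v is a 6-bit value in [0..63]; returns the Base64 character:
static char enc_translate_scalar(uint8_t v)
{
	static const int8_t lut[16] = {
		65, 71, -4, -4, -4, -4, -4, -4,
		-4, -4, -4, -4, -19, -16, 0, 0,
	};

	// _mm256_subs_epu8(in, 51): 0 for v <= 51, else v - 51:
	uint8_t index = (v <= 51) ? 0 : (uint8_t) (v - 51);

	// The mask from _mm256_cmpgt_epi8(in, 25), subtracted as -1,
	// adds 1 for every v > 25:
	index += (v > 25);

	return (char) (v + lut[index]);
}

For example, v = 62 gives index 12 and offset -19, producing '+' (43); v = 0 stays at index 0 and offset +65, producing 'A'.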