From 62e4c68907d8d33709c2c1f92a161dff00b3d5f2 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 15 Apr 2024 22:01:36 +0200 Subject: Adding upstream version 0.11.2. Signed-off-by: Daniel Baumann --- src/third-party/base64/lib/arch/avx/codec.c | 42 +++++ src/third-party/base64/lib/arch/avx2/codec.c | 42 +++++ src/third-party/base64/lib/arch/avx2/dec_loop.c | 110 +++++++++++++ .../base64/lib/arch/avx2/dec_reshuffle.c | 34 ++++ src/third-party/base64/lib/arch/avx2/enc_loop.c | 89 +++++++++++ .../base64/lib/arch/avx2/enc_reshuffle.c | 83 ++++++++++ .../base64/lib/arch/avx2/enc_translate.c | 30 ++++ .../base64/lib/arch/generic/32/dec_loop.c | 86 ++++++++++ .../base64/lib/arch/generic/32/enc_loop.c | 73 +++++++++ .../base64/lib/arch/generic/64/enc_loop.c | 77 +++++++++ src/third-party/base64/lib/arch/generic/codec.c | 39 +++++ src/third-party/base64/lib/arch/generic/dec_head.c | 37 +++++ src/third-party/base64/lib/arch/generic/dec_tail.c | 91 +++++++++++ src/third-party/base64/lib/arch/generic/enc_head.c | 24 +++ src/third-party/base64/lib/arch/generic/enc_tail.c | 34 ++++ src/third-party/base64/lib/arch/neon32/codec.c | 77 +++++++++ src/third-party/base64/lib/arch/neon32/dec_loop.c | 106 +++++++++++++ src/third-party/base64/lib/arch/neon32/enc_loop.c | 169 ++++++++++++++++++++ .../base64/lib/arch/neon32/enc_reshuffle.c | 31 ++++ .../base64/lib/arch/neon32/enc_translate.c | 57 +++++++ src/third-party/base64/lib/arch/neon64/codec.c | 92 +++++++++++ src/third-party/base64/lib/arch/neon64/dec_loop.c | 129 +++++++++++++++ src/third-party/base64/lib/arch/neon64/enc_loop.c | 133 ++++++++++++++++ .../base64/lib/arch/neon64/enc_reshuffle.c | 31 ++++ src/third-party/base64/lib/arch/sse41/codec.c | 42 +++++ src/third-party/base64/lib/arch/sse42/codec.c | 42 +++++ src/third-party/base64/lib/arch/ssse3/codec.c | 42 +++++ src/third-party/base64/lib/arch/ssse3/dec_loop.c | 173 +++++++++++++++++++++ .../base64/lib/arch/ssse3/dec_reshuffle.c | 33 ++++ src/third-party/base64/lib/arch/ssse3/enc_loop.c | 67 ++++++++ .../base64/lib/arch/ssse3/enc_reshuffle.c | 48 ++++++ .../base64/lib/arch/ssse3/enc_translate.c | 33 ++++ 32 files changed, 2196 insertions(+) create mode 100644 src/third-party/base64/lib/arch/avx/codec.c create mode 100644 src/third-party/base64/lib/arch/avx2/codec.c create mode 100644 src/third-party/base64/lib/arch/avx2/dec_loop.c create mode 100644 src/third-party/base64/lib/arch/avx2/dec_reshuffle.c create mode 100644 src/third-party/base64/lib/arch/avx2/enc_loop.c create mode 100644 src/third-party/base64/lib/arch/avx2/enc_reshuffle.c create mode 100644 src/third-party/base64/lib/arch/avx2/enc_translate.c create mode 100644 src/third-party/base64/lib/arch/generic/32/dec_loop.c create mode 100644 src/third-party/base64/lib/arch/generic/32/enc_loop.c create mode 100644 src/third-party/base64/lib/arch/generic/64/enc_loop.c create mode 100644 src/third-party/base64/lib/arch/generic/codec.c create mode 100644 src/third-party/base64/lib/arch/generic/dec_head.c create mode 100644 src/third-party/base64/lib/arch/generic/dec_tail.c create mode 100644 src/third-party/base64/lib/arch/generic/enc_head.c create mode 100644 src/third-party/base64/lib/arch/generic/enc_tail.c create mode 100644 src/third-party/base64/lib/arch/neon32/codec.c create mode 100644 src/third-party/base64/lib/arch/neon32/dec_loop.c create mode 100644 src/third-party/base64/lib/arch/neon32/enc_loop.c create mode 100644 src/third-party/base64/lib/arch/neon32/enc_reshuffle.c create mode 100644 
src/third-party/base64/lib/arch/neon32/enc_translate.c create mode 100644 src/third-party/base64/lib/arch/neon64/codec.c create mode 100644 src/third-party/base64/lib/arch/neon64/dec_loop.c create mode 100644 src/third-party/base64/lib/arch/neon64/enc_loop.c create mode 100644 src/third-party/base64/lib/arch/neon64/enc_reshuffle.c create mode 100644 src/third-party/base64/lib/arch/sse41/codec.c create mode 100644 src/third-party/base64/lib/arch/sse42/codec.c create mode 100644 src/third-party/base64/lib/arch/ssse3/codec.c create mode 100644 src/third-party/base64/lib/arch/ssse3/dec_loop.c create mode 100644 src/third-party/base64/lib/arch/ssse3/dec_reshuffle.c create mode 100644 src/third-party/base64/lib/arch/ssse3/enc_loop.c create mode 100644 src/third-party/base64/lib/arch/ssse3/enc_reshuffle.c create mode 100644 src/third-party/base64/lib/arch/ssse3/enc_translate.c (limited to 'src/third-party/base64/lib/arch') diff --git a/src/third-party/base64/lib/arch/avx/codec.c b/src/third-party/base64/lib/arch/avx/codec.c new file mode 100644 index 0000000..a7a963d --- /dev/null +++ b/src/third-party/base64/lib/arch/avx/codec.c @@ -0,0 +1,42 @@ +#include +#include +#include + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if HAVE_AVX +#include + +#include "../ssse3/dec_reshuffle.c" +#include "../ssse3/dec_loop.c" +#include "../ssse3/enc_translate.c" +#include "../ssse3/enc_reshuffle.c" +#include "../ssse3/enc_loop.c" + +#endif // HAVE_AVX + +BASE64_ENC_FUNCTION(avx) +{ +#if HAVE_AVX + #include "../generic/enc_head.c" + enc_loop_ssse3(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(avx) +{ +#if HAVE_AVX + #include "../generic/dec_head.c" + dec_loop_ssse3(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/src/third-party/base64/lib/arch/avx2/codec.c b/src/third-party/base64/lib/arch/avx2/codec.c new file mode 100644 index 0000000..0498548 --- /dev/null +++ b/src/third-party/base64/lib/arch/avx2/codec.c @@ -0,0 +1,42 @@ +#include +#include +#include + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if HAVE_AVX2 +#include + +#include "dec_reshuffle.c" +#include "dec_loop.c" +#include "enc_translate.c" +#include "enc_reshuffle.c" +#include "enc_loop.c" + +#endif // HAVE_AVX2 + +BASE64_ENC_FUNCTION(avx2) +{ +#if HAVE_AVX2 + #include "../generic/enc_head.c" + enc_loop_avx2(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(avx2) +{ +#if HAVE_AVX2 + #include "../generic/dec_head.c" + dec_loop_avx2(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/src/third-party/base64/lib/arch/avx2/dec_loop.c b/src/third-party/base64/lib/arch/avx2/dec_loop.c new file mode 100644 index 0000000..f959fc4 --- /dev/null +++ b/src/third-party/base64/lib/arch/avx2/dec_loop.c @@ -0,0 +1,110 @@ +static inline int +dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds) +{ + const __m256i lut_lo = _mm256_setr_epi8( + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A, + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); + + const __m256i lut_hi = _mm256_setr_epi8( + 0x10, 
0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); + + const __m256i lut_roll = _mm256_setr_epi8( + 0, 16, 19, 4, -65, -65, -71, -71, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 16, 19, 4, -65, -65, -71, -71, + 0, 0, 0, 0, 0, 0, 0, 0); + + const __m256i mask_2F = _mm256_set1_epi8(0x2F); + + // Load input: + __m256i str = _mm256_loadu_si256((__m256i *) *s); + + // See the SSSE3 decoder for an explanation of the algorithm. + const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F); + const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F); + const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles); + const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles); + + if (!_mm256_testz_si256(lo, hi)) { + return 0; + } + + const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F); + const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles)); + + // Now simply add the delta values to the input: + str = _mm256_add_epi8(str, roll); + + // Reshuffle the input to packed 12-byte output format: + str = dec_reshuffle(str); + + // Store the output: + _mm256_storeu_si256((__m256i *) *o, str); + + *s += 32; + *o += 24; + *rounds -= 1; + + return 1; +} + +static inline void +dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 45) { + return; + } + + // Process blocks of 32 bytes per round. Because 8 extra zero bytes are + // written after the output, ensure that there will be at least 13 + // bytes of input data left to cover the gap. (11 data bytes and up to + // two end-of-string markers.) + size_t rounds = (*slen - 13) / 32; + + *slen -= rounds * 32; // 32 bytes consumed per round + *olen += rounds * 24; // 24 bytes produced per round + + do { + if (rounds >= 8) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 4) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 2) { + if (dec_loop_avx2_inner(s, o, &rounds) && + dec_loop_avx2_inner(s, o, &rounds)) { + continue; + } + break; + } + dec_loop_avx2_inner(s, o, &rounds); + break; + + } while (rounds > 0); + + // Adjust for any rounds that were skipped: + *slen += rounds * 32; + *olen -= rounds * 24; +} diff --git a/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c b/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c new file mode 100644 index 0000000..f351809 --- /dev/null +++ b/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c @@ -0,0 +1,34 @@ +static inline __m256i +dec_reshuffle (const __m256i in) +{ + // in, lower lane, bits, upper case are most significant bits, lower + // case are least significant bits: + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA + + const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140)); + // 0000kkkk LLllllll 0000JJJJ JJjjKKKK + // 0000hhhh IIiiiiii 0000GGGG GGggHHHH + // 0000eeee FFffffff 
0000DDDD DDddEEEE + // 0000bbbb CCcccccc 0000AAAA AAaaBBBB + + __m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000)); + // 00000000 JJJJJJjj KKKKkkkk LLllllll + // 00000000 GGGGGGgg HHHHhhhh IIiiiiii + // 00000000 DDDDDDdd EEEEeeee FFffffff + // 00000000 AAAAAAaa BBBBbbbb CCcccccc + + // Pack bytes together in each lane: + out = _mm256_shuffle_epi8(out, _mm256_setr_epi8( + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1)); + // 00000000 00000000 00000000 00000000 + // LLllllll KKKKkkkk JJJJJJjj IIiiiiii + // HHHHhhhh GGGGGGgg FFffffff EEEEeeee + // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa + + // Pack lanes: + return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1)); +} diff --git a/src/third-party/base64/lib/arch/avx2/enc_loop.c b/src/third-party/base64/lib/arch/avx2/enc_loop.c new file mode 100644 index 0000000..b9e2736 --- /dev/null +++ b/src/third-party/base64/lib/arch/avx2/enc_loop.c @@ -0,0 +1,89 @@ +static inline void +enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o) +{ + // First load is done at s - 0 to not get a segfault: + __m256i src = _mm256_loadu_si256((__m256i *) *s); + + // Shift by 4 bytes, as required by enc_reshuffle: + src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6)); + + // Reshuffle, translate, store: + src = enc_reshuffle(src); + src = enc_translate(src); + _mm256_storeu_si256((__m256i *) *o, src); + + // Subsequent loads will be done at s - 4, set pointer for next round: + *s += 20; + *o += 32; +} + +static inline void +enc_loop_avx2_inner (const uint8_t **s, uint8_t **o) +{ + // Load input: + __m256i src = _mm256_loadu_si256((__m256i *) *s); + + // Reshuffle, translate, store: + src = enc_reshuffle(src); + src = enc_translate(src); + _mm256_storeu_si256((__m256i *) *o, src); + + *s += 24; + *o += 32; +} + +static inline void +enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 32) { + return; + } + + // Process blocks of 24 bytes at a time. 
Because blocks are loaded 32 + // bytes at a time an offset of -4, ensure that there will be at least + // 4 remaining bytes after the last round, so that the final read will + // not pass beyond the bounds of the input buffer: + size_t rounds = (*slen - 4) / 24; + + *slen -= rounds * 24; // 24 bytes consumed per round + *olen += rounds * 32; // 32 bytes produced per round + + // The first loop iteration requires special handling to ensure that + // the read, which is done at an offset, does not underflow the buffer: + enc_loop_avx2_inner_first(s, o); + rounds--; + + while (rounds > 0) { + if (rounds >= 8) { + enc_loop_avx2_inner(s, o); + enc_loop_avx2_inner(s, o); + enc_loop_avx2_inner(s, o); + enc_loop_avx2_inner(s, o); + enc_loop_avx2_inner(s, o); + enc_loop_avx2_inner(s, o); + enc_loop_avx2_inner(s, o); + enc_loop_avx2_inner(s, o); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_avx2_inner(s, o); + enc_loop_avx2_inner(s, o); + enc_loop_avx2_inner(s, o); + enc_loop_avx2_inner(s, o); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_avx2_inner(s, o); + enc_loop_avx2_inner(s, o); + rounds -= 2; + continue; + } + enc_loop_avx2_inner(s, o); + break; + } + + // Add the offset back: + *s += 4; +} diff --git a/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c b/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c new file mode 100644 index 0000000..ba16690 --- /dev/null +++ b/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c @@ -0,0 +1,83 @@ +static inline __m256i +enc_reshuffle (const __m256i input) +{ + // Translation of the SSSE3 reshuffling algorithm to AVX2. This one + // works with shifted (4 bytes) input in order to be able to work + // efficiently in the two 128-bit lanes. + + // Input, bytes MSB to LSB: + // 0 0 0 0 x w v u t s r q p o n m + // l k j i h g f e d c b a 0 0 0 0 + + const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8( + 10, 11, 9, 10, + 7, 8, 6, 7, + 4, 5, 3, 4, + 1, 2, 0, 1, + + 14, 15, 13, 14, + 11, 12, 10, 11, + 8, 9, 7, 8, + 5, 6, 4, 5)); + // in, bytes MSB to LSB: + // w x v w + // t u s t + // q r p q + // n o m n + // k l j k + // h i g h + // e f d e + // b c a b + + const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0FC0FC00)); + // bits, upper case are most significant bits, lower case are least + // significant bits. 
+ // 0000wwww XX000000 VVVVVV00 00000000 + // 0000tttt UU000000 SSSSSS00 00000000 + // 0000qqqq RR000000 PPPPPP00 00000000 + // 0000nnnn OO000000 MMMMMM00 00000000 + // 0000kkkk LL000000 JJJJJJ00 00000000 + // 0000hhhh II000000 GGGGGG00 00000000 + // 0000eeee FF000000 DDDDDD00 00000000 + // 0000bbbb CC000000 AAAAAA00 00000000 + + const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); + // 00000000 00wwwwXX 00000000 00VVVVVV + // 00000000 00ttttUU 00000000 00SSSSSS + // 00000000 00qqqqRR 00000000 00PPPPPP + // 00000000 00nnnnOO 00000000 00MMMMMM + // 00000000 00kkkkLL 00000000 00JJJJJJ + // 00000000 00hhhhII 00000000 00GGGGGG + // 00000000 00eeeeFF 00000000 00DDDDDD + // 00000000 00bbbbCC 00000000 00AAAAAA + + const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003F03F0)); + // 00000000 00xxxxxx 000000vv WWWW0000 + // 00000000 00uuuuuu 000000ss TTTT0000 + // 00000000 00rrrrrr 000000pp QQQQ0000 + // 00000000 00oooooo 000000mm NNNN0000 + // 00000000 00llllll 000000jj KKKK0000 + // 00000000 00iiiiii 000000gg HHHH0000 + // 00000000 00ffffff 000000dd EEEE0000 + // 00000000 00cccccc 000000aa BBBB0000 + + const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); + // 00xxxxxx 00000000 00vvWWWW 00000000 + // 00uuuuuu 00000000 00ssTTTT 00000000 + // 00rrrrrr 00000000 00ppQQQQ 00000000 + // 00oooooo 00000000 00mmNNNN 00000000 + // 00llllll 00000000 00jjKKKK 00000000 + // 00iiiiii 00000000 00ggHHHH 00000000 + // 00ffffff 00000000 00ddEEEE 00000000 + // 00cccccc 00000000 00aaBBBB 00000000 + + return _mm256_or_si256(t1, t3); + // 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV + // 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS + // 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP + // 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA +} diff --git a/src/third-party/base64/lib/arch/avx2/enc_translate.c b/src/third-party/base64/lib/arch/avx2/enc_translate.c new file mode 100644 index 0000000..46173cd --- /dev/null +++ b/src/third-party/base64/lib/arch/avx2/enc_translate.c @@ -0,0 +1,30 @@ +static inline __m256i +enc_translate (const __m256i in) +{ + // A lookup table containing the absolute offsets for all ranges: + const __m256i lut = _mm256_setr_epi8( + 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0, + 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0); + + // Translate values 0..63 to the Base64 alphabet. There are five sets: + // # From To Abs Index Characters + // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ + // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz + // 2 [52..61] [48..57] -4 [2..11] 0123456789 + // 3 [62] [43] -19 12 + + // 4 [63] [47] -16 13 / + + // Create LUT indices from the input. The index for range #0 is right, + // others are 1 less than expected: + __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51)); + + // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0: + const __m256i mask = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25)); + + // Subtract -1, so add 1 to indices for range #[1..4]. 
All indices are + // now correct: + indices = _mm256_sub_epi8(indices, mask); + + // Add offsets to input values: + return _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices)); +} diff --git a/src/third-party/base64/lib/arch/generic/32/dec_loop.c b/src/third-party/base64/lib/arch/generic/32/dec_loop.c new file mode 100644 index 0000000..8a8260f --- /dev/null +++ b/src/third-party/base64/lib/arch/generic/32/dec_loop.c @@ -0,0 +1,86 @@ +static inline int +dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds) +{ + const uint32_t str + = base64_table_dec_32bit_d0[(*s)[0]] + | base64_table_dec_32bit_d1[(*s)[1]] + | base64_table_dec_32bit_d2[(*s)[2]] + | base64_table_dec_32bit_d3[(*s)[3]]; + +#if BASE64_LITTLE_ENDIAN + + // LUTs for little-endian set MSB in case of invalid character: + if (str & UINT32_C(0x80000000)) { + return 0; + } +#else + // LUTs for big-endian set LSB in case of invalid character: + if (str & UINT32_C(1)) { + return 0; + } +#endif + // Store the output: + memcpy(*o, &str, sizeof (str)); + + *s += 4; + *o += 3; + *rounds -= 1; + + return 1; +} + +static inline void +dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 8) { + return; + } + + // Process blocks of 4 bytes per round. Because one extra zero byte is + // written after the output, ensure that there will be at least 4 bytes + // of input data left to cover the gap. (Two data bytes and up to two + // end-of-string markers.) + size_t rounds = (*slen - 4) / 4; + + *slen -= rounds * 4; // 4 bytes consumed per round + *olen += rounds * 3; // 3 bytes produced per round + + do { + if (rounds >= 8) { + if (dec_loop_generic_32_inner(s, o, &rounds) && + dec_loop_generic_32_inner(s, o, &rounds) && + dec_loop_generic_32_inner(s, o, &rounds) && + dec_loop_generic_32_inner(s, o, &rounds) && + dec_loop_generic_32_inner(s, o, &rounds) && + dec_loop_generic_32_inner(s, o, &rounds) && + dec_loop_generic_32_inner(s, o, &rounds) && + dec_loop_generic_32_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 4) { + if (dec_loop_generic_32_inner(s, o, &rounds) && + dec_loop_generic_32_inner(s, o, &rounds) && + dec_loop_generic_32_inner(s, o, &rounds) && + dec_loop_generic_32_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 2) { + if (dec_loop_generic_32_inner(s, o, &rounds) && + dec_loop_generic_32_inner(s, o, &rounds)) { + continue; + } + break; + } + dec_loop_generic_32_inner(s, o, &rounds); + break; + + } while (rounds > 0); + + // Adjust for any rounds that were skipped: + *slen += rounds * 4; + *olen -= rounds * 3; +} diff --git a/src/third-party/base64/lib/arch/generic/32/enc_loop.c b/src/third-party/base64/lib/arch/generic/32/enc_loop.c new file mode 100644 index 0000000..f4870a7 --- /dev/null +++ b/src/third-party/base64/lib/arch/generic/32/enc_loop.c @@ -0,0 +1,73 @@ +static inline void +enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o) +{ + uint32_t src; + + // Load input: + memcpy(&src, *s, sizeof (src)); + + // Reorder to 32-bit big-endian, if not already in that format. 
The + // workset must be in big-endian, otherwise the shifted bits do not + // carry over properly among adjacent bytes: + src = BASE64_HTOBE32(src); + + // Two indices for the 12-bit lookup table: + const size_t index0 = (src >> 20) & 0xFFFU; + const size_t index1 = (src >> 8) & 0xFFFU; + + // Table lookup and store: + memcpy(*o + 0, base64_table_enc_12bit + index0, 2); + memcpy(*o + 2, base64_table_enc_12bit + index1, 2); + + *s += 3; + *o += 4; +} + +static inline void +enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 4) { + return; + } + + // Process blocks of 3 bytes at a time. Because blocks are loaded 4 + // bytes at a time, ensure that there will be at least one remaining + // byte after the last round, so that the final read will not pass + // beyond the bounds of the input buffer: + size_t rounds = (*slen - 1) / 3; + + *slen -= rounds * 3; // 3 bytes consumed per round + *olen += rounds * 4; // 4 bytes produced per round + + do { + if (rounds >= 8) { + enc_loop_generic_32_inner(s, o); + enc_loop_generic_32_inner(s, o); + enc_loop_generic_32_inner(s, o); + enc_loop_generic_32_inner(s, o); + enc_loop_generic_32_inner(s, o); + enc_loop_generic_32_inner(s, o); + enc_loop_generic_32_inner(s, o); + enc_loop_generic_32_inner(s, o); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_generic_32_inner(s, o); + enc_loop_generic_32_inner(s, o); + enc_loop_generic_32_inner(s, o); + enc_loop_generic_32_inner(s, o); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_generic_32_inner(s, o); + enc_loop_generic_32_inner(s, o); + rounds -= 2; + continue; + } + enc_loop_generic_32_inner(s, o); + break; + + } while (rounds > 0); +} diff --git a/src/third-party/base64/lib/arch/generic/64/enc_loop.c b/src/third-party/base64/lib/arch/generic/64/enc_loop.c new file mode 100644 index 0000000..0840bc7 --- /dev/null +++ b/src/third-party/base64/lib/arch/generic/64/enc_loop.c @@ -0,0 +1,77 @@ +static inline void +enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o) +{ + uint64_t src; + + // Load input: + memcpy(&src, *s, sizeof (src)); + + // Reorder to 64-bit big-endian, if not already in that format. The + // workset must be in big-endian, otherwise the shifted bits do not + // carry over properly among adjacent bytes: + src = BASE64_HTOBE64(src); + + // Four indices for the 12-bit lookup table: + const size_t index0 = (src >> 52) & 0xFFFU; + const size_t index1 = (src >> 40) & 0xFFFU; + const size_t index2 = (src >> 28) & 0xFFFU; + const size_t index3 = (src >> 16) & 0xFFFU; + + // Table lookup and store: + memcpy(*o + 0, base64_table_enc_12bit + index0, 2); + memcpy(*o + 2, base64_table_enc_12bit + index1, 2); + memcpy(*o + 4, base64_table_enc_12bit + index2, 2); + memcpy(*o + 6, base64_table_enc_12bit + index3, 2); + + *s += 6; + *o += 8; +} + +static inline void +enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 8) { + return; + } + + // Process blocks of 6 bytes at a time. 
Because blocks are loaded 8 + // bytes at a time, ensure that there will be at least 2 remaining + // bytes after the last round, so that the final read will not pass + // beyond the bounds of the input buffer: + size_t rounds = (*slen - 2) / 6; + + *slen -= rounds * 6; // 6 bytes consumed per round + *olen += rounds * 8; // 8 bytes produced per round + + do { + if (rounds >= 8) { + enc_loop_generic_64_inner(s, o); + enc_loop_generic_64_inner(s, o); + enc_loop_generic_64_inner(s, o); + enc_loop_generic_64_inner(s, o); + enc_loop_generic_64_inner(s, o); + enc_loop_generic_64_inner(s, o); + enc_loop_generic_64_inner(s, o); + enc_loop_generic_64_inner(s, o); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_generic_64_inner(s, o); + enc_loop_generic_64_inner(s, o); + enc_loop_generic_64_inner(s, o); + enc_loop_generic_64_inner(s, o); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_generic_64_inner(s, o); + enc_loop_generic_64_inner(s, o); + rounds -= 2; + continue; + } + enc_loop_generic_64_inner(s, o); + break; + + } while (rounds > 0); +} diff --git a/src/third-party/base64/lib/arch/generic/codec.c b/src/third-party/base64/lib/arch/generic/codec.c new file mode 100644 index 0000000..8dd5af2 --- /dev/null +++ b/src/third-party/base64/lib/arch/generic/codec.c @@ -0,0 +1,39 @@ +#include +#include +#include + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if BASE64_WORDSIZE == 32 +# include "32/enc_loop.c" +#elif BASE64_WORDSIZE == 64 +# include "64/enc_loop.c" +#endif + +#if BASE64_WORDSIZE >= 32 +# include "32/dec_loop.c" +#endif + +BASE64_ENC_FUNCTION(plain) +{ + #include "enc_head.c" +#if BASE64_WORDSIZE == 32 + enc_loop_generic_32(&s, &slen, &o, &olen); +#elif BASE64_WORDSIZE == 64 + enc_loop_generic_64(&s, &slen, &o, &olen); +#endif + #include "enc_tail.c" +} + +BASE64_DEC_FUNCTION(plain) +{ + #include "dec_head.c" +#if BASE64_WORDSIZE >= 32 + dec_loop_generic_32(&s, &slen, &o, &olen); +#endif + #include "dec_tail.c" +} diff --git a/src/third-party/base64/lib/arch/generic/dec_head.c b/src/third-party/base64/lib/arch/generic/dec_head.c new file mode 100644 index 0000000..179a31b --- /dev/null +++ b/src/third-party/base64/lib/arch/generic/dec_head.c @@ -0,0 +1,37 @@ +int ret = 0; +const uint8_t *s = (const uint8_t *) src; +uint8_t *o = (uint8_t *) out; +uint8_t q; + +// Use local temporaries to avoid cache thrashing: +size_t olen = 0; +size_t slen = srclen; +struct base64_state st; +st.eof = state->eof; +st.bytes = state->bytes; +st.carry = state->carry; + +// If we previously saw an EOF or an invalid character, bail out: +if (st.eof) { + *outlen = 0; + ret = 0; + // If there was a trailing '=' to check, check it: + if (slen && (st.eof == BASE64_AEOF)) { + state->bytes = 0; + state->eof = BASE64_EOF; + ret = ((base64_table_dec_8bit[*s++] == 254) && (slen == 1)) ? 
1 : 0; + } + return ret; +} + +// Turn four 6-bit numbers into three bytes: +// out[0] = 11111122 +// out[1] = 22223333 +// out[2] = 33444444 + +// Duff's device again: +switch (st.bytes) +{ + for (;;) + { + case 0: diff --git a/src/third-party/base64/lib/arch/generic/dec_tail.c b/src/third-party/base64/lib/arch/generic/dec_tail.c new file mode 100644 index 0000000..e64f724 --- /dev/null +++ b/src/third-party/base64/lib/arch/generic/dec_tail.c @@ -0,0 +1,91 @@ + if (slen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*s++]) >= 254) { + st.eof = BASE64_EOF; + // Treat character '=' as invalid for byte 0: + break; + } + st.carry = q << 2; + st.bytes++; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 1: if (slen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*s++]) >= 254) { + st.eof = BASE64_EOF; + // Treat character '=' as invalid for byte 1: + break; + } + *o++ = st.carry | (q >> 4); + st.carry = q << 4; + st.bytes++; + olen++; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 2: if (slen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*s++]) >= 254) { + st.bytes++; + // When q == 254, the input char is '='. + // Check if next byte is also '=': + if (q == 254) { + if (slen-- != 0) { + st.bytes = 0; + // EOF: + st.eof = BASE64_EOF; + q = base64_table_dec_8bit[*s++]; + ret = ((q == 254) && (slen == 0)) ? 1 : 0; + break; + } + else { + // Almost EOF + st.eof = BASE64_AEOF; + ret = 1; + break; + } + } + // If we get here, there was an error: + break; + } + *o++ = st.carry | (q >> 2); + st.carry = q << 6; + st.bytes++; + olen++; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 3: if (slen-- == 0) { + ret = 1; + break; + } + if ((q = base64_table_dec_8bit[*s++]) >= 254) { + st.bytes = 0; + st.eof = BASE64_EOF; + // When q == 254, the input char is '='. Return 1 and EOF. + // When q == 255, the input char is invalid. Return 0 and EOF. + ret = ((q == 254) && (slen == 0)) ? 1 : 0; + break; + } + *o++ = st.carry | q; + st.carry = 0; + st.bytes = 0; + olen++; + } +} + +state->eof = st.eof; +state->bytes = st.bytes; +state->carry = st.carry; +*outlen = olen; +return ret; diff --git a/src/third-party/base64/lib/arch/generic/enc_head.c b/src/third-party/base64/lib/arch/generic/enc_head.c new file mode 100644 index 0000000..38d60b2 --- /dev/null +++ b/src/third-party/base64/lib/arch/generic/enc_head.c @@ -0,0 +1,24 @@ +// Assume that *out is large enough to contain the output. +// Theoretically it should be 4/3 the length of src. +const uint8_t *s = (const uint8_t *) src; +uint8_t *o = (uint8_t *) out; + +// Use local temporaries to avoid cache thrashing: +size_t olen = 0; +size_t slen = srclen; +struct base64_state st; +st.bytes = state->bytes; +st.carry = state->carry; + +// Turn three bytes into four 6-bit numbers: +// in[0] = 00111111 +// in[1] = 00112222 +// in[2] = 00222233 +// in[3] = 00333333 + +// Duff's device, a for() loop inside a switch() statement. Legal! 
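+// For reference, a worked example of the 3-to-4 split described above,
+// using the familiar "Man" example: the bytes 0x4D 0x61 0x6E produce
+//   in[0] = 0x4D >> 2                          = 19 -> 'T'
+//   in[1] = ((0x4D & 0x03) << 4) | (0x61 >> 4) = 22 -> 'W'
+//   in[2] = ((0x61 & 0x0F) << 2) | (0x6E >> 6) =  5 -> 'F'
+//   in[3] = 0x6E & 0x3F                        = 46 -> 'u'
+// so the three bytes "Man" encode to the four characters "TWFu".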
+switch (st.bytes) +{ + for (;;) + { + case 0: diff --git a/src/third-party/base64/lib/arch/generic/enc_tail.c b/src/third-party/base64/lib/arch/generic/enc_tail.c new file mode 100644 index 0000000..cbd5733 --- /dev/null +++ b/src/third-party/base64/lib/arch/generic/enc_tail.c @@ -0,0 +1,34 @@ + if (slen-- == 0) { + break; + } + *o++ = base64_table_enc_6bit[*s >> 2]; + st.carry = (*s++ << 4) & 0x30; + st.bytes++; + olen += 1; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 1: if (slen-- == 0) { + break; + } + *o++ = base64_table_enc_6bit[st.carry | (*s >> 4)]; + st.carry = (*s++ << 2) & 0x3C; + st.bytes++; + olen += 1; + + // Deliberate fallthrough: + BASE64_FALLTHROUGH + + case 2: if (slen-- == 0) { + break; + } + *o++ = base64_table_enc_6bit[st.carry | (*s >> 6)]; + *o++ = base64_table_enc_6bit[*s++ & 0x3F]; + st.bytes = 0; + olen += 2; + } +} +state->bytes = st.bytes; +state->carry = st.carry; +*outlen = olen; diff --git a/src/third-party/base64/lib/arch/neon32/codec.c b/src/third-party/base64/lib/arch/neon32/codec.c new file mode 100644 index 0000000..a0b27f9 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon32/codec.c @@ -0,0 +1,77 @@ +#include +#include +#include + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#ifdef __arm__ +# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32 +# define BASE64_USE_NEON32 +# endif +#endif + +#ifdef BASE64_USE_NEON32 +#include + +// Only enable inline assembly on supported compilers. +#if defined(__GNUC__) || defined(__clang__) +#define BASE64_NEON32_USE_ASM +#endif + +static inline uint8x16_t +vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices) +{ + // NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate + // the NEON64 `vqtbl1q_u8` intrinsic to do 128-bit wide lookups. + uint8x8x2_t lut2; + uint8x8x2_t result; + + lut2.val[0] = vget_low_u8(lut); + lut2.val[1] = vget_high_u8(lut); + + result.val[0] = vtbl2_u8(lut2, vget_low_u8(indices)); + result.val[1] = vtbl2_u8(lut2, vget_high_u8(indices)); + + return vcombine_u8(result.val[0], result.val[1]); +} + +#include "../generic/32/dec_loop.c" +#include "../generic/32/enc_loop.c" +#include "dec_loop.c" +#include "enc_reshuffle.c" +#include "enc_translate.c" +#include "enc_loop.c" + +#endif // BASE64_USE_NEON32 + +// Stride size is so large on these NEON 32-bit functions +// (48 bytes encode, 32 bytes decode) that we inline the +// uint32 codec to stay performant on smaller inputs. 
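+// Concretely, each function below runs the wide NEON loop first, hands any
+// remainder that is too short for a full NEON stride to the generic 32-bit
+// loop, and leaves the last few bytes (and any '=' padding) to the scalar
+// head/tail code included from ../generic.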
+ +BASE64_ENC_FUNCTION(neon32) +{ +#ifdef BASE64_USE_NEON32 + #include "../generic/enc_head.c" + enc_loop_neon32(&s, &slen, &o, &olen); + enc_loop_generic_32(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(neon32) +{ +#ifdef BASE64_USE_NEON32 + #include "../generic/dec_head.c" + dec_loop_neon32(&s, &slen, &o, &olen); + dec_loop_generic_32(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/src/third-party/base64/lib/arch/neon32/dec_loop.c b/src/third-party/base64/lib/arch/neon32/dec_loop.c new file mode 100644 index 0000000..2216b39 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon32/dec_loop.c @@ -0,0 +1,106 @@ +static inline int +is_nonzero (const uint8x16_t v) +{ + uint64_t u64; + const uint64x2_t v64 = vreinterpretq_u64_u8(v); + const uint32x2_t v32 = vqmovn_u64(v64); + + vst1_u64(&u64, vreinterpret_u64_u32(v32)); + return u64 != 0; +} + +static inline uint8x16_t +delta_lookup (const uint8x16_t v) +{ + const uint8x8_t lut = { + 0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71, + }; + + return vcombine_u8( + vtbl1_u8(lut, vget_low_u8(v)), + vtbl1_u8(lut, vget_high_u8(v))); +} + +static inline uint8x16_t +dec_loop_neon32_lane (uint8x16_t *lane) +{ + // See the SSSE3 decoder for an explanation of the algorithm. + const uint8x16_t lut_lo = { + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A + }; + + const uint8x16_t lut_hi = { + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10 + }; + + const uint8x16_t mask_0F = vdupq_n_u8(0x0F); + const uint8x16_t mask_2F = vdupq_n_u8(0x2F); + + const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4); + const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F); + const uint8x16_t eq_2F = vceqq_u8(*lane, mask_2F); + + const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles); + const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles); + + // Now simply add the delta values to the input: + *lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles))); + + // Return the validity mask: + return vandq_u8(lo, hi); +} + +static inline void +dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 64) { + return; + } + + // Process blocks of 64 bytes per round. 
Unlike the SSE codecs, no + // extra trailing zero bytes are written, so it is not necessary to + // reserve extra input bytes: + size_t rounds = *slen / 64; + + *slen -= rounds * 64; // 64 bytes consumed per round + *olen += rounds * 48; // 48 bytes produced per round + + do { + uint8x16x3_t dec; + + // Load 64 bytes and deinterleave: + uint8x16x4_t str = vld4q_u8(*s); + + // Decode each lane, collect a mask of invalid inputs: + const uint8x16_t classified + = dec_loop_neon32_lane(&str.val[0]) + | dec_loop_neon32_lane(&str.val[1]) + | dec_loop_neon32_lane(&str.val[2]) + | dec_loop_neon32_lane(&str.val[3]); + + // Check for invalid input: if any of the delta values are + // zero, fall back on bytewise code to do error checking and + // reporting: + if (is_nonzero(classified)) { + break; + } + + // Compress four bytes into three: + dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4)); + dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2)); + dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]); + + // Interleave and store decoded result: + vst3q_u8(*o, dec); + + *s += 64; + *o += 48; + + } while (--rounds > 0); + + // Adjust for any rounds that were skipped: + *slen += rounds * 64; + *olen -= rounds * 48; +} diff --git a/src/third-party/base64/lib/arch/neon32/enc_loop.c b/src/third-party/base64/lib/arch/neon32/enc_loop.c new file mode 100644 index 0000000..e9e8e28 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon32/enc_loop.c @@ -0,0 +1,169 @@ +#ifdef BASE64_NEON32_USE_ASM +static inline void +enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o) +{ + // This function duplicates the functionality of enc_loop_neon32_inner, + // but entirely with inline assembly. This gives a significant speedup + // over using NEON intrinsics, which do not always generate very good + // code. The logic of the assembly is directly lifted from the + // intrinsics version, so it can be used as a guide to this code. + + // Temporary registers, used as scratch space. + uint8x16_t tmp0, tmp1, tmp2, tmp3; + uint8x16_t mask0, mask1, mask2, mask3; + + // A lookup table containing the absolute offsets for all ranges. + const uint8x16_t lut = { + 65U, 71U, 252U, 252U, + 252U, 252U, 252U, 252U, + 252U, 252U, 252U, 252U, + 237U, 240U, 0U, 0U + }; + + // Numeric constants. + const uint8x16_t n51 = vdupq_n_u8(51); + const uint8x16_t n25 = vdupq_n_u8(25); + const uint8x16_t n63 = vdupq_n_u8(63); + + __asm__ ( + + // Load 48 bytes and deinterleave. The bytes are loaded to + // hard-coded registers q12, q13 and q14, to ensure that they + // are contiguous. Increment the source pointer. + "vld3.8 {d24, d26, d28}, [%[src]]! \n\t" + "vld3.8 {d25, d27, d29}, [%[src]]! \n\t" + + // Reshuffle the bytes using temporaries. + "vshr.u8 %q[t0], q12, #2 \n\t" + "vshr.u8 %q[t1], q13, #4 \n\t" + "vshr.u8 %q[t2], q14, #6 \n\t" + "vsli.8 %q[t1], q12, #4 \n\t" + "vsli.8 %q[t2], q13, #2 \n\t" + "vand.u8 %q[t1], %q[t1], %q[n63] \n\t" + "vand.u8 %q[t2], %q[t2], %q[n63] \n\t" + "vand.u8 %q[t3], q14, %q[n63] \n\t" + + // t0..t3 are the reshuffled inputs. Create LUT indices. + "vqsub.u8 q12, %q[t0], %q[n51] \n\t" + "vqsub.u8 q13, %q[t1], %q[n51] \n\t" + "vqsub.u8 q14, %q[t2], %q[n51] \n\t" + "vqsub.u8 q15, %q[t3], %q[n51] \n\t" + + // Create the mask for range #0. + "vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t" + "vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t" + "vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t" + "vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t" + + // Subtract -1 to correct the LUT indices. 
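+		// (vqsub.u8 saturated at zero above, so inputs 0..51 all
+		// produced index 0; subtracting the 0xFF masks here adds 1
+		// for every value greater than 25, giving index 1 for 26..51
+		// and 2..13 for 52..63, matching the offsets in the LUT.)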
+ "vsub.u8 q12, %q[m0] \n\t" + "vsub.u8 q13, %q[m1] \n\t" + "vsub.u8 q14, %q[m2] \n\t" + "vsub.u8 q15, %q[m3] \n\t" + + // Lookup the delta values. + "vtbl.u8 d24, {%q[lut]}, d24 \n\t" + "vtbl.u8 d25, {%q[lut]}, d25 \n\t" + "vtbl.u8 d26, {%q[lut]}, d26 \n\t" + "vtbl.u8 d27, {%q[lut]}, d27 \n\t" + "vtbl.u8 d28, {%q[lut]}, d28 \n\t" + "vtbl.u8 d29, {%q[lut]}, d29 \n\t" + "vtbl.u8 d30, {%q[lut]}, d30 \n\t" + "vtbl.u8 d31, {%q[lut]}, d31 \n\t" + + // Add the delta values. + "vadd.u8 q12, %q[t0] \n\t" + "vadd.u8 q13, %q[t1] \n\t" + "vadd.u8 q14, %q[t2] \n\t" + "vadd.u8 q15, %q[t3] \n\t" + + // Store 64 bytes and interleave. Increment the dest pointer. + "vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t" + "vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t" + + // Outputs (modified). + : [src] "+r" (*s), + [dst] "+r" (*o), + [t0] "=&w" (tmp0), + [t1] "=&w" (tmp1), + [t2] "=&w" (tmp2), + [t3] "=&w" (tmp3), + [m0] "=&w" (mask0), + [m1] "=&w" (mask1), + [m2] "=&w" (mask2), + [m3] "=&w" (mask3) + + // Inputs (not modified). + : [lut] "w" (lut), + [n25] "w" (n25), + [n51] "w" (n51), + [n63] "w" (n63) + + // Clobbers. + : "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" + ); +} +#endif + +static inline void +enc_loop_neon32_inner (const uint8_t **s, uint8_t **o) +{ +#ifdef BASE64_NEON32_USE_ASM + enc_loop_neon32_inner_asm(s, o); +#else + // Load 48 bytes and deinterleave: + uint8x16x3_t src = vld3q_u8(*s); + + // Reshuffle: + uint8x16x4_t out = enc_reshuffle(src); + + // Translate reshuffled bytes to the Base64 alphabet: + out = enc_translate(out); + + // Interleave and store output: + vst4q_u8(*o, out); + + *s += 48; + *o += 64; +#endif +} + +static inline void +enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + size_t rounds = *slen / 48; + + *slen -= rounds * 48; // 48 bytes consumed per round + *olen += rounds * 64; // 64 bytes produced per round + + while (rounds > 0) { + if (rounds >= 8) { + enc_loop_neon32_inner(s, o); + enc_loop_neon32_inner(s, o); + enc_loop_neon32_inner(s, o); + enc_loop_neon32_inner(s, o); + enc_loop_neon32_inner(s, o); + enc_loop_neon32_inner(s, o); + enc_loop_neon32_inner(s, o); + enc_loop_neon32_inner(s, o); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_neon32_inner(s, o); + enc_loop_neon32_inner(s, o); + enc_loop_neon32_inner(s, o); + enc_loop_neon32_inner(s, o); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_neon32_inner(s, o); + enc_loop_neon32_inner(s, o); + rounds -= 2; + continue; + } + enc_loop_neon32_inner(s, o); + break; + } +} diff --git a/src/third-party/base64/lib/arch/neon32/enc_reshuffle.c b/src/third-party/base64/lib/arch/neon32/enc_reshuffle.c new file mode 100644 index 0000000..d6e97cb --- /dev/null +++ b/src/third-party/base64/lib/arch/neon32/enc_reshuffle.c @@ -0,0 +1,31 @@ +static inline uint8x16x4_t +enc_reshuffle (uint8x16x3_t in) +{ + uint8x16x4_t out; + + // Input: + // in[0] = a7 a6 a5 a4 a3 a2 a1 a0 + // in[1] = b7 b6 b5 b4 b3 b2 b1 b0 + // in[2] = c7 c6 c5 c4 c3 c2 c1 c0 + + // Output: + // out[0] = 00 00 a7 a6 a5 a4 a3 a2 + // out[1] = 00 00 a1 a0 b7 b6 b5 b4 + // out[2] = 00 00 b3 b2 b1 b0 c7 c6 + // out[3] = 00 00 c5 c4 c3 c2 c1 c0 + + // Move the input bits to where they need to be in the outputs. Except + // for the first output, the high two bits are not cleared. 
+ out.val[0] = vshrq_n_u8(in.val[0], 2); + out.val[1] = vshrq_n_u8(in.val[1], 4); + out.val[2] = vshrq_n_u8(in.val[2], 6); + out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 4); + out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 2); + + // Clear the high two bits in the second, third and fourth output. + out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F)); + out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F)); + out.val[3] = vandq_u8(in.val[2], vdupq_n_u8(0x3F)); + + return out; +} diff --git a/src/third-party/base64/lib/arch/neon32/enc_translate.c b/src/third-party/base64/lib/arch/neon32/enc_translate.c new file mode 100644 index 0000000..e616d54 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon32/enc_translate.c @@ -0,0 +1,57 @@ +static inline uint8x16x4_t +enc_translate (const uint8x16x4_t in) +{ + // A lookup table containing the absolute offsets for all ranges: + const uint8x16_t lut = { + 65U, 71U, 252U, 252U, + 252U, 252U, 252U, 252U, + 252U, 252U, 252U, 252U, + 237U, 240U, 0U, 0U + }; + + const uint8x16_t offset = vdupq_n_u8(51); + + uint8x16x4_t indices, mask, delta, out; + + // Translate values 0..63 to the Base64 alphabet. There are five sets: + // # From To Abs Index Characters + // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ + // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz + // 2 [52..61] [48..57] -4 [2..11] 0123456789 + // 3 [62] [43] -19 12 + + // 4 [63] [47] -16 13 / + + // Create LUT indices from input: + // the index for range #0 is right, others are 1 less than expected: + indices.val[0] = vqsubq_u8(in.val[0], offset); + indices.val[1] = vqsubq_u8(in.val[1], offset); + indices.val[2] = vqsubq_u8(in.val[2], offset); + indices.val[3] = vqsubq_u8(in.val[3], offset); + + // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0: + mask.val[0] = vcgtq_u8(in.val[0], vdupq_n_u8(25)); + mask.val[1] = vcgtq_u8(in.val[1], vdupq_n_u8(25)); + mask.val[2] = vcgtq_u8(in.val[2], vdupq_n_u8(25)); + mask.val[3] = vcgtq_u8(in.val[3], vdupq_n_u8(25)); + + // Subtract -1, so add 1 to indices for range #[1..4], All indices are + // now correct: + indices.val[0] = vsubq_u8(indices.val[0], mask.val[0]); + indices.val[1] = vsubq_u8(indices.val[1], mask.val[1]); + indices.val[2] = vsubq_u8(indices.val[2], mask.val[2]); + indices.val[3] = vsubq_u8(indices.val[3], mask.val[3]); + + // Lookup delta values: + delta.val[0] = vqtbl1q_u8(lut, indices.val[0]); + delta.val[1] = vqtbl1q_u8(lut, indices.val[1]); + delta.val[2] = vqtbl1q_u8(lut, indices.val[2]); + delta.val[3] = vqtbl1q_u8(lut, indices.val[3]); + + // Add delta values: + out.val[0] = vaddq_u8(in.val[0], delta.val[0]); + out.val[1] = vaddq_u8(in.val[1], delta.val[1]); + out.val[2] = vaddq_u8(in.val[2], delta.val[2]); + out.val[3] = vaddq_u8(in.val[3], delta.val[3]); + + return out; +} diff --git a/src/third-party/base64/lib/arch/neon64/codec.c b/src/third-party/base64/lib/arch/neon64/codec.c new file mode 100644 index 0000000..fc953b2 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon64/codec.c @@ -0,0 +1,92 @@ +#include +#include +#include + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#ifdef __aarch64__ +# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON64 +# define BASE64_USE_NEON64 +# endif +#endif + +#ifdef BASE64_USE_NEON64 +#include + +// Only enable inline assembly on supported compilers. 
+#if defined(__GNUC__) || defined(__clang__) +#define BASE64_NEON64_USE_ASM +#endif + +static inline uint8x16x4_t +load_64byte_table (const uint8_t *p) +{ +#ifdef BASE64_NEON64_USE_ASM + + // Force the table to be loaded into contiguous registers. GCC will not + // normally allocate contiguous registers for a `uint8x16x4_t'. These + // registers are chosen to not conflict with the ones in the enc loop. + register uint8x16_t t0 __asm__ ("v8"); + register uint8x16_t t1 __asm__ ("v9"); + register uint8x16_t t2 __asm__ ("v10"); + register uint8x16_t t3 __asm__ ("v11"); + + __asm__ ( + "ld1 {%[t0].16b, %[t1].16b, %[t2].16b, %[t3].16b}, [%[src]], #64 \n\t" + : [src] "+r" (p), + [t0] "=w" (t0), + [t1] "=w" (t1), + [t2] "=w" (t2), + [t3] "=w" (t3) + ); + + return (uint8x16x4_t) { + .val[0] = t0, + .val[1] = t1, + .val[2] = t2, + .val[3] = t3, + }; +#else + return vld1q_u8_x4(p); +#endif +} + +#include "../generic/32/dec_loop.c" +#include "../generic/64/enc_loop.c" +#include "dec_loop.c" +#include "enc_reshuffle.c" +#include "enc_loop.c" + +#endif // BASE64_USE_NEON64 + +// Stride size is so large on these NEON 64-bit functions +// (48 bytes encode, 64 bytes decode) that we inline the +// uint64 codec to stay performant on smaller inputs. + +BASE64_ENC_FUNCTION(neon64) +{ +#ifdef BASE64_USE_NEON64 + #include "../generic/enc_head.c" + enc_loop_neon64(&s, &slen, &o, &olen); + enc_loop_generic_64(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(neon64) +{ +#ifdef BASE64_USE_NEON64 + #include "../generic/dec_head.c" + dec_loop_neon64(&s, &slen, &o, &olen); + dec_loop_generic_32(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/src/third-party/base64/lib/arch/neon64/dec_loop.c b/src/third-party/base64/lib/arch/neon64/dec_loop.c new file mode 100644 index 0000000..48232f2 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon64/dec_loop.c @@ -0,0 +1,129 @@ +// The input consists of five valid character sets in the Base64 alphabet, +// which we need to map back to the 6-bit values they represent. +// There are three ranges, two singles, and then there's the rest. +// +// # From To LUT Characters +// 1 [0..42] [255] #1 invalid input +// 2 [43] [62] #1 + +// 3 [44..46] [255] #1 invalid input +// 4 [47] [63] #1 / +// 5 [48..57] [52..61] #1 0..9 +// 6 [58..63] [255] #1 invalid input +// 7 [64] [255] #2 invalid input +// 8 [65..90] [0..25] #2 A..Z +// 9 [91..96] [255] #2 invalid input +// 10 [97..122] [26..51] #2 a..z +// 11 [123..126] [255] #2 invalid input +// (12) Everything else => invalid input + +// The first LUT will use the VTBL instruction (out of range indices are set to +// 0 in destination). +static const uint8_t dec_lut1[] = { + 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, + 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, + 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 62U, 255U, 255U, 255U, 63U, + 52U, 53U, 54U, 55U, 56U, 57U, 58U, 59U, 60U, 61U, 255U, 255U, 255U, 255U, 255U, 255U, +}; + +// The second LUT will use the VTBX instruction (out of range indices will be +// unchanged in destination). Input [64..126] will be mapped to index [1..63] +// in this LUT. Index 0 means that value comes from LUT #1. 
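+// Worked example, using the table just below: for the byte 'b' (98) the
+// first lookup returns 0 (VTBL yields 0 for indices >= 64), the second index
+// is 98 - 63 = 35, and dec_lut2[35] = 27, the correct 6-bit value for 'b'.
+// For '@' (64) the second index is 1 and dec_lut2[1] = 255, so the
+// invalid-input check in the decode loop below flags it.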
+static const uint8_t dec_lut2[] = { + 0U, 255U, 0U, 1U, 2U, 3U, 4U, 5U, 6U, 7U, 8U, 9U, 10U, 11U, 12U, 13U, + 14U, 15U, 16U, 17U, 18U, 19U, 20U, 21U, 22U, 23U, 24U, 25U, 255U, 255U, 255U, 255U, + 255U, 255U, 26U, 27U, 28U, 29U, 30U, 31U, 32U, 33U, 34U, 35U, 36U, 37U, 38U, 39U, + 40U, 41U, 42U, 43U, 44U, 45U, 46U, 47U, 48U, 49U, 50U, 51U, 255U, 255U, 255U, 255U, +}; + +// All input values in range for the first look-up will be 0U in the second +// look-up result. All input values out of range for the first look-up will be +// 0U in the first look-up result. Thus, the two results can be ORed without +// conflicts. +// +// Invalid characters that are in the valid range for either look-up will be +// set to 255U in the combined result. Other invalid characters will just be +// passed through with the second look-up result (using the VTBX instruction). +// Since the second LUT is 64 bytes, those passed-through values are guaranteed +// to have a value greater than 63U. Therefore, valid characters will be mapped +// to the valid [0..63] range and all invalid characters will be mapped to +// values greater than 63. + +static inline void +dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 64) { + return; + } + + // Process blocks of 64 bytes per round. Unlike the SSE codecs, no + // extra trailing zero bytes are written, so it is not necessary to + // reserve extra input bytes: + size_t rounds = *slen / 64; + + *slen -= rounds * 64; // 64 bytes consumed per round + *olen += rounds * 48; // 48 bytes produced per round + + const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1); + const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2); + + do { + const uint8x16_t offset = vdupq_n_u8(63U); + uint8x16x4_t dec1, dec2; + uint8x16x3_t dec; + + // Load 64 bytes and deinterleave: + uint8x16x4_t str = vld4q_u8((uint8_t *) *s); + + // Get indices for second LUT: + dec2.val[0] = vqsubq_u8(str.val[0], offset); + dec2.val[1] = vqsubq_u8(str.val[1], offset); + dec2.val[2] = vqsubq_u8(str.val[2], offset); + dec2.val[3] = vqsubq_u8(str.val[3], offset); + + // Get values from first LUT: + dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]); + dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]); + dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]); + dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]); + + // Get values from second LUT: + dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]); + dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]); + dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]); + dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]); + + // Get final values: + str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]); + str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]); + str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]); + str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]); + + // Check for invalid input, any value larger than 63: + const uint8x16_t classified + = vcgtq_u8(str.val[0], vdupq_n_u8(63)) + | vcgtq_u8(str.val[1], vdupq_n_u8(63)) + | vcgtq_u8(str.val[2], vdupq_n_u8(63)) + | vcgtq_u8(str.val[3], vdupq_n_u8(63)); + + // Check that all bits are zero: + if (vmaxvq_u8(classified) != 0U) { + break; + } + + // Compress four bytes into three: + dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4); + dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2); + dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3]; + + // Interleave and store decoded result: + vst3q_u8((uint8_t *) *o, dec); + + *s += 64; + *o += 48; + + } 
while (--rounds > 0); + + // Adjust for any rounds that were skipped: + *slen += rounds * 64; + *olen -= rounds * 48; +} diff --git a/src/third-party/base64/lib/arch/neon64/enc_loop.c b/src/third-party/base64/lib/arch/neon64/enc_loop.c new file mode 100644 index 0000000..d1862f7 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon64/enc_loop.c @@ -0,0 +1,133 @@ +#ifdef BASE64_NEON64_USE_ASM +static inline void +enc_loop_neon64_inner_asm (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc) +{ + // This function duplicates the functionality of enc_loop_neon64_inner, + // but entirely with inline assembly. This gives a significant speedup + // over using NEON intrinsics, which do not always generate very good + // code. The logic of the assembly is directly lifted from the + // intrinsics version, so it can be used as a guide to this code. + + // Temporary registers, used as scratch space. + uint8x16_t tmp0, tmp1, tmp2, tmp3; + + // Numeric constant. + const uint8x16_t n63 = vdupq_n_u8(63); + + __asm__ ( + + // Load 48 bytes and deinterleave. The bytes are loaded to + // hard-coded registers v12, v13 and v14, to ensure that they + // are contiguous. Increment the source pointer. + "ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t" + + // Reshuffle the bytes using temporaries. + "ushr %[t0].16b, v12.16b, #2 \n\t" + "ushr %[t1].16b, v13.16b, #4 \n\t" + "ushr %[t2].16b, v14.16b, #6 \n\t" + "sli %[t1].16b, v12.16b, #4 \n\t" + "sli %[t2].16b, v13.16b, #2 \n\t" + "and %[t1].16b, %[t1].16b, %[n63].16b \n\t" + "and %[t2].16b, %[t2].16b, %[n63].16b \n\t" + "and %[t3].16b, v14.16b, %[n63].16b \n\t" + + // Translate the values to the Base64 alphabet. + "tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t" + "tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t" + "tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t" + "tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t" + + // Store 64 bytes and interleave. Increment the dest pointer. + "st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t" + + // Outputs (modified). + : [src] "+r" (*s), + [dst] "+r" (*o), + [t0] "=&w" (tmp0), + [t1] "=&w" (tmp1), + [t2] "=&w" (tmp2), + [t3] "=&w" (tmp3) + + // Inputs (not modified). + : [n63] "w" (n63), + [l0] "w" (tbl_enc.val[0]), + [l1] "w" (tbl_enc.val[1]), + [l2] "w" (tbl_enc.val[2]), + [l3] "w" (tbl_enc.val[3]) + + // Clobbers. + : "v12", "v13", "v14", "v15" + ); +} +#endif + +static inline void +enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc) +{ +#ifdef BASE64_NEON64_USE_ASM + enc_loop_neon64_inner_asm(s, o, tbl_enc); +#else + // Load 48 bytes and deinterleave: + uint8x16x3_t src = vld3q_u8(*s); + + // Divide bits of three input bytes over four output bytes: + uint8x16x4_t out = enc_reshuffle(src); + + // The bits have now been shifted to the right locations; + // translate their values 0..63 to the Base64 alphabet. 
+ // Use a 64-byte table lookup: + out.val[0] = vqtbl4q_u8(tbl_enc, out.val[0]); + out.val[1] = vqtbl4q_u8(tbl_enc, out.val[1]); + out.val[2] = vqtbl4q_u8(tbl_enc, out.val[2]); + out.val[3] = vqtbl4q_u8(tbl_enc, out.val[3]); + + // Interleave and store output: + vst4q_u8(*o, out); + + *s += 48; + *o += 64; +#endif +} + +static inline void +enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + size_t rounds = *slen / 48; + + *slen -= rounds * 48; // 48 bytes consumed per round + *olen += rounds * 64; // 64 bytes produced per round + + // Load the encoding table: + const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit); + + while (rounds > 0) { + if (rounds >= 8) { + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + rounds -= 2; + continue; + } + enc_loop_neon64_inner(s, o, tbl_enc); + break; + } +} diff --git a/src/third-party/base64/lib/arch/neon64/enc_reshuffle.c b/src/third-party/base64/lib/arch/neon64/enc_reshuffle.c new file mode 100644 index 0000000..ea543e0 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon64/enc_reshuffle.c @@ -0,0 +1,31 @@ +static inline uint8x16x4_t +enc_reshuffle (const uint8x16x3_t in) +{ + uint8x16x4_t out; + + // Input: + // in[0] = a7 a6 a5 a4 a3 a2 a1 a0 + // in[1] = b7 b6 b5 b4 b3 b2 b1 b0 + // in[2] = c7 c6 c5 c4 c3 c2 c1 c0 + + // Output: + // out[0] = 00 00 a7 a6 a5 a4 a3 a2 + // out[1] = 00 00 a1 a0 b7 b6 b5 b4 + // out[2] = 00 00 b3 b2 b1 b0 c7 c6 + // out[3] = 00 00 c5 c4 c3 c2 c1 c0 + + // Move the input bits to where they need to be in the outputs. Except + // for the first output, the high two bits are not cleared. + out.val[0] = vshrq_n_u8(in.val[0], 2); + out.val[1] = vshrq_n_u8(in.val[1], 4); + out.val[2] = vshrq_n_u8(in.val[2], 6); + out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 4); + out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 2); + + // Clear the high two bits in the second, third and fourth output. 
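+	// (out[0] needs no mask: the plain right shift by 2 already left its
+	// two high bits zero, whereas out[1] and out[2] picked up stray high
+	// bits from the vsli inserts above, and out[3] is masked directly
+	// from in[2].)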
+ out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F)); + out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F)); + out.val[3] = vandq_u8(in.val[2], vdupq_n_u8(0x3F)); + + return out; +} diff --git a/src/third-party/base64/lib/arch/sse41/codec.c b/src/third-party/base64/lib/arch/sse41/codec.c new file mode 100644 index 0000000..00645fe --- /dev/null +++ b/src/third-party/base64/lib/arch/sse41/codec.c @@ -0,0 +1,42 @@ +#include +#include +#include + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if HAVE_SSE41 +#include + +#include "../ssse3/dec_reshuffle.c" +#include "../ssse3/dec_loop.c" +#include "../ssse3/enc_translate.c" +#include "../ssse3/enc_reshuffle.c" +#include "../ssse3/enc_loop.c" + +#endif // HAVE_SSE41 + +BASE64_ENC_FUNCTION(sse41) +{ +#if HAVE_SSE41 + #include "../generic/enc_head.c" + enc_loop_ssse3(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(sse41) +{ +#if HAVE_SSE41 + #include "../generic/dec_head.c" + dec_loop_ssse3(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/src/third-party/base64/lib/arch/sse42/codec.c b/src/third-party/base64/lib/arch/sse42/codec.c new file mode 100644 index 0000000..cf5d97c --- /dev/null +++ b/src/third-party/base64/lib/arch/sse42/codec.c @@ -0,0 +1,42 @@ +#include +#include +#include + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if HAVE_SSE42 +#include + +#include "../ssse3/dec_reshuffle.c" +#include "../ssse3/dec_loop.c" +#include "../ssse3/enc_translate.c" +#include "../ssse3/enc_reshuffle.c" +#include "../ssse3/enc_loop.c" + +#endif // HAVE_SSE42 + +BASE64_ENC_FUNCTION(sse42) +{ +#if HAVE_SSE42 + #include "../generic/enc_head.c" + enc_loop_ssse3(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(sse42) +{ +#if HAVE_SSE42 + #include "../generic/dec_head.c" + dec_loop_ssse3(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/src/third-party/base64/lib/arch/ssse3/codec.c b/src/third-party/base64/lib/arch/ssse3/codec.c new file mode 100644 index 0000000..ad14a45 --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/codec.c @@ -0,0 +1,42 @@ +#include +#include +#include + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if HAVE_SSSE3 +#include + +#include "dec_reshuffle.c" +#include "dec_loop.c" +#include "enc_reshuffle.c" +#include "enc_translate.c" +#include "enc_loop.c" + +#endif // HAVE_SSSE3 + +BASE64_ENC_FUNCTION(ssse3) +{ +#if HAVE_SSSE3 + #include "../generic/enc_head.c" + enc_loop_ssse3(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(ssse3) +{ +#if HAVE_SSSE3 + #include "../generic/dec_head.c" + dec_loop_ssse3(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/src/third-party/base64/lib/arch/ssse3/dec_loop.c b/src/third-party/base64/lib/arch/ssse3/dec_loop.c new file mode 100644 index 0000000..9da71ab --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/dec_loop.c @@ -0,0 +1,173 @@ +// The input consists of six character sets in the Base64 alphabet, which 
we +// need to map back to the 6-bit values they represent. There are three ranges, +// two singles, and then there's the rest. +// +// # From To Add Characters +// 1 [43] [62] +19 + +// 2 [47] [63] +16 / +// 3 [48..57] [52..61] +4 0..9 +// 4 [65..90] [0..25] -65 A..Z +// 5 [97..122] [26..51] -71 a..z +// (6) Everything else => invalid input +// +// We will use lookup tables for character validation and offset computation. +// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this +// allows to mask with 0x2F instead of 0x0F and thus save one constant +// declaration (register and/or memory access). +// +// For offsets: +// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00) +// 0000 = garbage +// 0001 = / +// 0010 = + +// 0011 = 0-9 +// 0100 = A-Z +// 0101 = A-Z +// 0110 = a-z +// 0111 = a-z +// 1000 >= garbage +// +// For validation, here's the table. +// A character is valid if and only if the AND of the 2 lookups equals 0: +// +// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 +// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A +// +// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI +// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// +// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US +// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// +// 0010 0x01 char ! " # $ % & ' ( ) * + , - . / +// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00 +// +// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ? +// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02 +// +// 0100 0x04 char @ A B C D E F G H I J K L M N O +// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +// +// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _ +// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08 +// +// 0110 0x04 char ` a b c d e f g h i j k l m n o +// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +// 0111 0x08 char p q r s t u v w x y z { | } ~ +// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08 +// +// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + +static inline int +dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds) +{ + const __m128i lut_lo = _mm_setr_epi8( + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); + + const __m128i lut_hi = _mm_setr_epi8( + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 
0x10, 0x10, 0x10); + + const __m128i lut_roll = _mm_setr_epi8( + 0, 16, 19, 4, -65, -65, -71, -71, + 0, 0, 0, 0, 0, 0, 0, 0); + + const __m128i mask_2F = _mm_set1_epi8(0x2F); + + // Load input: + __m128i str = _mm_loadu_si128((__m128i *) *s); + + // Table lookups: + const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F); + const __m128i lo_nibbles = _mm_and_si128(str, mask_2F); + const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles); + const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles); + + // Check for invalid input: if any "and" values from lo and hi are not + // zero, fall back on bytewise code to do error checking and reporting: + if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) { + return 0; + } + + const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F); + const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles)); + + // Now simply add the delta values to the input: + str = _mm_add_epi8(str, roll); + + // Reshuffle the input to packed 12-byte output format: + str = dec_reshuffle(str); + + // Store the output: + _mm_storeu_si128((__m128i *) *o, str); + + *s += 16; + *o += 12; + *rounds -= 1; + + return 1; +} + +static inline void +dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 24) { + return; + } + + // Process blocks of 16 bytes per round. Because 4 extra zero bytes are + // written after the output, ensure that there will be at least 8 bytes + // of input data left to cover the gap. (6 data bytes and up to two + // end-of-string markers.) + size_t rounds = (*slen - 8) / 16; + + *slen -= rounds * 16; // 16 bytes consumed per round + *olen += rounds * 12; // 12 bytes produced per round + + do { + if (rounds >= 8) { + if (dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 4) { + if (dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 2) { + if (dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds)) { + continue; + } + break; + } + dec_loop_ssse3_inner(s, o, &rounds); + break; + + } while (rounds > 0); + + // Adjust for any rounds that were skipped: + *slen += rounds * 16; + *olen -= rounds * 12; +} diff --git a/src/third-party/base64/lib/arch/ssse3/dec_reshuffle.c b/src/third-party/base64/lib/arch/ssse3/dec_reshuffle.c new file mode 100644 index 0000000..fdf587f --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/dec_reshuffle.c @@ -0,0 +1,33 @@ +static inline __m128i +dec_reshuffle (const __m128i in) +{ + // in, bits, upper case are most significant bits, lower case are least significant bits + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA + + const __m128i merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140)); + // 0000kkkk LLllllll 0000JJJJ JJjjKKKK + // 0000hhhh IIiiiiii 0000GGGG GGggHHHH + // 0000eeee FFffffff 0000DDDD DDddEEEE + // 0000bbbb CCcccccc 0000AAAA AAaaBBBB + + const __m128i out = _mm_madd_epi16(merge_ab_and_bc, 
_mm_set1_epi32(0x00011000)); + // 00000000 JJJJJJjj KKKKkkkk LLllllll + // 00000000 GGGGGGgg HHHHhhhh IIiiiiii + // 00000000 DDDDDDdd EEEEeeee FFffffff + // 00000000 AAAAAAaa BBBBbbbb CCcccccc + + // Pack bytes together: + return _mm_shuffle_epi8(out, _mm_setr_epi8( + 2, 1, 0, + 6, 5, 4, + 10, 9, 8, + 14, 13, 12, + -1, -1, -1, -1)); + // 00000000 00000000 00000000 00000000 + // LLllllll KKKKkkkk JJJJJJjj IIiiiiii + // HHHHhhhh GGGGGGgg FFffffff EEEEeeee + // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa +} diff --git a/src/third-party/base64/lib/arch/ssse3/enc_loop.c b/src/third-party/base64/lib/arch/ssse3/enc_loop.c new file mode 100644 index 0000000..6de652e --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/enc_loop.c @@ -0,0 +1,67 @@ +static inline void +enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o) +{ + // Load input: + __m128i str = _mm_loadu_si128((__m128i *) *s); + + // Reshuffle: + str = enc_reshuffle(str); + + // Translate reshuffled bytes to the Base64 alphabet: + str = enc_translate(str); + + // Store: + _mm_storeu_si128((__m128i *) *o, str); + + *s += 12; + *o += 16; +} + +static inline void +enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 16) { + return; + } + + // Process blocks of 12 bytes at a time. Because blocks are loaded 16 + // bytes at a time, ensure that there will be at least 4 remaining + // bytes after the last round, so that the final read will not pass + // beyond the bounds of the input buffer: + size_t rounds = (*slen - 4) / 12; + + *slen -= rounds * 12; // 12 bytes consumed per round + *olen += rounds * 16; // 16 bytes produced per round + + do { + if (rounds >= 8) { + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + rounds -= 2; + continue; + } + enc_loop_ssse3_inner(s, o); + break; + + } while (rounds > 0); +} diff --git a/src/third-party/base64/lib/arch/ssse3/enc_reshuffle.c b/src/third-party/base64/lib/arch/ssse3/enc_reshuffle.c new file mode 100644 index 0000000..b738591 --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/enc_reshuffle.c @@ -0,0 +1,48 @@ +static inline __m128i +enc_reshuffle (__m128i in) +{ + // Input, bytes MSB to LSB: + // 0 0 0 0 l k j i h g f e d c b a + + in = _mm_shuffle_epi8(in, _mm_set_epi8( + 10, 11, 9, 10, + 7, 8, 6, 7, + 4, 5, 3, 4, + 1, 2, 0, 1)); + // in, bytes MSB to LSB: + // k l j k + // h i g h + // e f d e + // b c a b + + const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0FC0FC00)); + // bits, upper case are most significant bits, lower case are least significant bits + // 0000kkkk LL000000 JJJJJJ00 00000000 + // 0000hhhh II000000 GGGGGG00 00000000 + // 0000eeee FF000000 DDDDDD00 00000000 + // 0000bbbb CC000000 AAAAAA00 00000000 + + const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040)); + // 00000000 00kkkkLL 00000000 00JJJJJJ + // 00000000 00hhhhII 00000000 00GGGGGG + // 00000000 00eeeeFF 00000000 00DDDDDD + // 00000000 00bbbbCC 00000000 00AAAAAA + + const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003F03F0)); + // 00000000 00llllll 000000jj KKKK0000 + // 00000000 00iiiiii 
000000gg HHHH0000 + // 00000000 00ffffff 000000dd EEEE0000 + // 00000000 00cccccc 000000aa BBBB0000 + + const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010)); + // 00llllll 00000000 00jjKKKK 00000000 + // 00iiiiii 00000000 00ggHHHH 00000000 + // 00ffffff 00000000 00ddEEEE 00000000 + // 00cccccc 00000000 00aaBBBB 00000000 + + return _mm_or_si128(t1, t3); + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA +} diff --git a/src/third-party/base64/lib/arch/ssse3/enc_translate.c b/src/third-party/base64/lib/arch/ssse3/enc_translate.c new file mode 100644 index 0000000..04f288f --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/enc_translate.c @@ -0,0 +1,33 @@ +static inline __m128i +enc_translate (const __m128i in) +{ + // A lookup table containing the absolute offsets for all ranges: + const __m128i lut = _mm_setr_epi8( + 65, 71, -4, -4, + -4, -4, -4, -4, + -4, -4, -4, -4, + -19, -16, 0, 0 + ); + + // Translate values 0..63 to the Base64 alphabet. There are five sets: + // # From To Abs Index Characters + // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ + // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz + // 2 [52..61] [48..57] -4 [2..11] 0123456789 + // 3 [62] [43] -19 12 + + // 4 [63] [47] -16 13 / + + // Create LUT indices from the input. The index for range #0 is right, + // others are 1 less than expected: + __m128i indices = _mm_subs_epu8(in, _mm_set1_epi8(51)); + + // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0: + __m128i mask = _mm_cmpgt_epi8(in, _mm_set1_epi8(25)); + + // Subtract -1, so add 1 to indices for range #[1..4]. All indices are + // now correct: + indices = _mm_sub_epi8(indices, mask); + + // Add offsets to input values: + return _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices)); +} -- cgit v1.2.3
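
For reference, the per-lane transform that the encoding loops above implement can be written in plain scalar C. The sketch below is illustrative only and is not part of the patch: the helper names encode_value_scalar and encode_block_scalar are hypothetical, but the shifts, masks and range offsets are the ones documented in the reshuffle and enc_translate comments.

	#include <stdint.h>

	/* Map one 6-bit value (0..63) to the Base64 alphabet, using the same
	 * range offsets as the lookup table in ssse3/enc_translate.c: */
	static uint8_t
	encode_value_scalar (uint8_t v)
	{
		if (v < 26) return v + 65;   /* 'A'..'Z', offset +65 */
		if (v < 52) return v + 71;   /* 'a'..'z', offset +71 */
		if (v < 62) return v - 4;    /* '0'..'9', offset -4  */
		return (v == 62) ? 43 : 47;  /* '+' (-19) or '/' (-16) */
	}

	/* Split three input bytes into four 6-bit values, exactly as the
	 * reshuffle comments describe (out[0] = a7..a2, out[1] = a1 a0 b7..b4,
	 * out[2] = b3..b0 c7 c6, out[3] = c5..c0), then translate each value: */
	static void
	encode_block_scalar (const uint8_t src[3], uint8_t dst[4])
	{
		dst[0] = encode_value_scalar(src[0] >> 2);
		dst[1] = encode_value_scalar(((src[0] << 4) | (src[1] >> 4)) & 0x3F);
		dst[2] = encode_value_scalar(((src[1] << 2) | (src[2] >> 6)) & 0x3F);
		dst[3] = encode_value_scalar(src[2] & 0x3F);
	}

The SSSE3 encoder applies this transform to four such 3-byte groups per 16-byte register (12 bytes in, 16 out), and the NEON64 encoder to sixteen groups across three deinterleaved registers (48 bytes in, 64 out); the 8/4/2 unrolling in the outer loops only amortizes loop overhead and does not change the transform.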
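
The decoder's validation and offset scheme, described at length at the top of ssse3/dec_loop.c, also has a compact scalar form. The tables below are transcribed from that file's comment; the function name decode_char_scalar is hypothetical and the code is a sketch of the technique under those assumptions, not a drop-in replacement for the library's bytewise fallback.

	#include <stdint.h>

	static const uint8_t lut_lo[16] = {
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
	};
	static const uint8_t lut_hi[16] = {
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
	};
	static const int8_t lut_roll[16] = {
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0
	};

	/* Decode one Base64 character to its 6-bit value, or -1 if it is not
	 * in the alphabet. This mirrors, per byte, the classification that
	 * dec_loop_ssse3_inner applies to 16 lanes at once. */
	static int
	decode_char_scalar (uint8_t c)
	{
		/* A character is valid if and only if the AND of the two
		 * nibble lookups is zero: */
		if (lut_lo[c & 0x0F] & lut_hi[c >> 4]) {
			return -1;
		}

		/* '/' (0x2F) shares its high nibble with '+', so it is steered
		 * to the slot below it; this is the (src == 0x2F) term of the
		 * perfect hash: */
		return c + lut_roll[(c >> 4) - (c == 0x2F)];
	}

Packing four decoded values v0..v3 back into three bytes is then dst[0] = (v0 << 2) | (v1 >> 4), dst[1] = (v1 << 4) | (v2 >> 2) and dst[2] = (v2 << 6) | v3, each truncated to a byte, which is the scalar counterpart of the two multiply-add steps and the byte shuffle in ssse3/dec_reshuffle.c.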