author		Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-15 20:01:36 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-15 20:01:36 +0000
commit		62e4c68907d8d33709c2c1f92a161dff00b3d5f2 (patch)
tree		adbbaf3acf88ea08f6eeec4b75ee98ad3b07fbdc /src/third-party/base64/lib/arch/avx2
parent		Initial commit. (diff)
Adding upstream version 0.11.2. (tag: upstream/0.11.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/third-party/base64/lib/arch/avx2')
-rw-r--r--	src/third-party/base64/lib/arch/avx2/codec.c		 42
-rw-r--r--	src/third-party/base64/lib/arch/avx2/dec_loop.c		110
-rw-r--r--	src/third-party/base64/lib/arch/avx2/dec_reshuffle.c	 34
-rw-r--r--	src/third-party/base64/lib/arch/avx2/enc_loop.c		 89
-rw-r--r--	src/third-party/base64/lib/arch/avx2/enc_reshuffle.c	 83
-rw-r--r--	src/third-party/base64/lib/arch/avx2/enc_translate.c	 30
6 files changed, 388 insertions(+), 0 deletions(-)
diff --git a/src/third-party/base64/lib/arch/avx2/codec.c b/src/third-party/base64/lib/arch/avx2/codec.c
new file mode 100644
index 0000000..0498548
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_AVX2
+#include <immintrin.h>
+
+#include "dec_reshuffle.c"
+#include "dec_loop.c"
+#include "enc_translate.c"
+#include "enc_reshuffle.c"
+#include "enc_loop.c"
+
+#endif	// HAVE_AVX2
+
+BASE64_ENC_FUNCTION(avx2)
+{
+#if HAVE_AVX2
+	#include "../generic/enc_head.c"
+	enc_loop_avx2(&s, &slen, &o, &olen);
+	#include "../generic/enc_tail.c"
+#else
+	BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(avx2)
+{
+#if HAVE_AVX2
+	#include "../generic/dec_head.c"
+	dec_loop_avx2(&s, &slen, &o, &olen);
+	#include "../generic/dec_tail.c"
+#else
+	BASE64_DEC_STUB
+#endif
+}
diff --git a/src/third-party/base64/lib/arch/avx2/dec_loop.c b/src/third-party/base64/lib/arch/avx2/dec_loop.c
new file mode 100644
index 0000000..f959fc4
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/dec_loop.c
@@ -0,0 +1,110 @@
+static inline int
+dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+{
+	const __m256i lut_lo = _mm256_setr_epi8(
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+	const __m256i lut_hi = _mm256_setr_epi8(
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+	const __m256i lut_roll = _mm256_setr_epi8(
+		  0,  16,  19,   4, -65, -65, -71, -71,
+		  0,   0,   0,   0,   0,   0,   0,   0,
+		  0,  16,  19,   4, -65, -65, -71, -71,
+		  0,   0,   0,   0,   0,   0,   0,   0);
+
+	const __m256i mask_2F = _mm256_set1_epi8(0x2F);
+
+	// Load input:
+	__m256i str = _mm256_loadu_si256((__m256i *) *s);
+
+	// See the SSSE3 decoder for an explanation of the algorithm.
+	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
+	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
+	const __m256i hi         = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+	const __m256i lo         = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
+
+	if (!_mm256_testz_si256(lo, hi)) {
+		return 0;
+	}
+
+	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
+	const __m256i roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
+
+	// Now simply add the delta values to the input:
+	str = _mm256_add_epi8(str, roll);
+
+	// Reshuffle the input to packed 12-byte output format:
+	str = dec_reshuffle(str);
+
+	// Store the output:
+	_mm256_storeu_si256((__m256i *) *o, str);
+
+	*s += 32;
+	*o += 24;
+	*rounds -= 1;
+
+	return 1;
+}
+
+static inline void
+dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 45) {
+		return;
+	}
+
+	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
+	// written after the output, ensure that there will be at least 13
+	// bytes of input data left to cover the gap. (11 data bytes and up to
+	// two end-of-string markers.)
+	size_t rounds = (*slen - 13) / 32;
+
+	*slen -= rounds * 32;	// 32 bytes consumed per round
+	*olen += rounds * 24;	// 24 bytes produced per round
+
+	do {
+		if (rounds >= 8) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 4) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 2) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		dec_loop_avx2_inner(s, o, &rounds);
+		break;
+
+	} while (rounds > 0);
+
+	// Adjust for any rounds that were skipped:
+	*slen += rounds * 32;
+	*olen -= rounds * 24;
+}
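
The decode loop above turns each 32-byte block of base64 text into 24 output bytes: lut_lo and lut_hi validate the characters, lut_roll adds a per-range delta that maps each character to its 6-bit value, and dec_reshuffle (next file) packs four 6-bit values into three bytes. For reference, a minimal scalar sketch of that packing step; the helper name dec_round_scalar is hypothetical and not part of the library or this commit:

    #include <stdint.h>

    // Scalar reference for one decode round: pack four already-translated
    // 6-bit values (0..63) into three output bytes -- the same bit movement
    // that dec_reshuffle performs on 32 values at once.
    static void dec_round_scalar(const uint8_t v[4], uint8_t out[3])
    {
        out[0] = (uint8_t) ((v[0] << 2) | (v[1] >> 4)); // all of v0, top 2 bits of v1
        out[1] = (uint8_t) ((v[1] << 4) | (v[2] >> 2)); // low 4 bits of v1, top 4 of v2
        out[2] = (uint8_t) ((v[2] << 6) |  v[3]);       // low 2 bits of v2, all of v3
    }

For example, the translated values of "TWFu" are {19, 22, 5, 46}, which pack to the bytes 'M', 'a', 'n'.
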
diff --git a/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c b/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c
new file mode 100644
index 0000000..f351809
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c
@@ -0,0 +1,34 @@
+static inline __m256i
+dec_reshuffle (const __m256i in)
+{
+	// in, lower lane, bits, upper case are most significant bits, lower
+	// case are least significant bits:
+	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+	const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
+	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+	// 0000eeee FFffffff 0000DDDD DDddEEEE
+	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+	__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
+	// 00000000 JJJJJJjj KKKKkkkk LLllllll
+	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+	// 00000000 DDDDDDdd EEEEeeee FFffffff
+	// 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+	// Pack bytes together in each lane:
+	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
+		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
+	// 00000000 00000000 00000000 00000000
+	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+
+	// Pack lanes:
+	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
+}
diff --git a/src/third-party/base64/lib/arch/avx2/enc_loop.c b/src/third-party/base64/lib/arch/avx2/enc_loop.c
new file mode 100644
index 0000000..b9e2736
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/enc_loop.c
@@ -0,0 +1,89 @@
+static inline void
+enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
+{
+	// First load is done at s - 0 to not get a segfault:
+	__m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+	// Shift by 4 bytes, as required by enc_reshuffle:
+	src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
+
+	// Reshuffle, translate, store:
+	src = enc_reshuffle(src);
+	src = enc_translate(src);
+	_mm256_storeu_si256((__m256i *) *o, src);
+
+	// Subsequent loads will be done at s - 4, set pointer for next round:
+	*s += 20;
+	*o += 32;
+}
+
+static inline void
+enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
+{
+	// Load input:
+	__m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+	// Reshuffle, translate, store:
+	src = enc_reshuffle(src);
+	src = enc_translate(src);
+	_mm256_storeu_si256((__m256i *) *o, src);
+
+	*s += 24;
+	*o += 32;
+}
+
+static inline void
+enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 32) {
+		return;
+	}
+
+	// Process blocks of 24 bytes at a time. Because blocks are loaded
+	// 32 bytes at a time at an offset of -4, ensure that there will be
+	// at least 4 remaining bytes after the last round, so that the
+	// final read will not pass beyond the bounds of the input buffer:
+	size_t rounds = (*slen - 4) / 24;
+
+	*slen -= rounds * 24;	// 24 bytes consumed per round
+	*olen += rounds * 32;	// 32 bytes produced per round
+
+	// The first loop iteration requires special handling to ensure that
+	// the read, which is done at an offset, does not underflow the
+	// buffer:
+	enc_loop_avx2_inner_first(s, o);
+	rounds--;
+
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_avx2_inner(s, o);
+		break;
+	}
+
+	// Add the offset back:
+	*s += 4;
+}
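
enc_loop_avx2 reads 32 bytes per load but consumes only 24 per round, and every load after the first starts 4 bytes before the current input block; the comment in the code justifies rounds = (*slen - 4) / 24 as the safe round count. A small self-contained check of that bound, assuming the 20-then-24-byte pointer advance described above (the function name check_enc_bounds is hypothetical, not library code):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    // Hypothetical illustration of the encoder's bounds reasoning: the first
    // round loads bytes [0, 32) and consumes 20; each later round loads 32
    // bytes starting 4 bytes before its 24-byte input block.
    static void check_enc_bounds(size_t slen)
    {
        if (slen < 32) {
            return; // the real loop also bails out below 32 bytes
        }
        size_t rounds = (slen - 4) / 24;

        if (rounds >= 2) {
            // Start of the final 32-byte load, relative to the buffer:
            size_t last_load = 20 + (rounds - 2) * 24;
            assert(last_load + 32 <= slen); // never reads past the end
        }
    }

    int main(void)
    {
        for (size_t slen = 0; slen < 4096; slen++) {
            check_enc_bounds(slen);
        }
        puts("all sizes ok");
        return 0;
    }

The bound is tight: for slen = 52, two rounds run and the second loads bytes [20, 52), ending exactly at the end of the buffer.
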
diff --git a/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c b/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c
new file mode 100644
index 0000000..ba16690
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c
@@ -0,0 +1,83 @@
+static inline __m256i
+enc_reshuffle (const __m256i input)
+{
+	// Translation of the SSSE3 reshuffling algorithm to AVX2. This one
+	// works with shifted (4 bytes) input in order to be able to work
+	// efficiently in the two 128-bit lanes.
+
+	// Input, bytes MSB to LSB:
+	// 0 0 0 0 x w v u t s r q p o n m
+	// l k j i h g f e d c b a 0 0 0 0
+
+	const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
+		10, 11,  9, 10,
+		 7,  8,  6,  7,
+		 4,  5,  3,  4,
+		 1,  2,  0,  1,
+
+		14, 15, 13, 14,
+		11, 12, 10, 11,
+		 8,  9,  7,  8,
+		 5,  6,  4,  5));
+	// in, bytes MSB to LSB:
+	// w x v w
+	// t u s t
+	// q r p q
+	// n o m n
+	// k l j k
+	// h i g h
+	// e f d e
+	// b c a b
+
+	const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0FC0FC00));
+	// bits, upper case are most significant bits, lower case are least
+	// significant bits:
+	// 0000wwww XX000000 VVVVVV00 00000000
+	// 0000tttt UU000000 SSSSSS00 00000000
+	// 0000qqqq RR000000 PPPPPP00 00000000
+	// 0000nnnn OO000000 MMMMMM00 00000000
+	// 0000kkkk LL000000 JJJJJJ00 00000000
+	// 0000hhhh II000000 GGGGGG00 00000000
+	// 0000eeee FF000000 DDDDDD00 00000000
+	// 0000bbbb CC000000 AAAAAA00 00000000
+
+	const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+	// 00000000 00wwwwXX 00000000 00VVVVVV
+	// 00000000 00ttttUU 00000000 00SSSSSS
+	// 00000000 00qqqqRR 00000000 00PPPPPP
+	// 00000000 00nnnnOO 00000000 00MMMMMM
+	// 00000000 00kkkkLL 00000000 00JJJJJJ
+	// 00000000 00hhhhII 00000000 00GGGGGG
+	// 00000000 00eeeeFF 00000000 00DDDDDD
+	// 00000000 00bbbbCC 00000000 00AAAAAA
+
+	const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003F03F0));
+	// 00000000 00xxxxxx 000000vv WWWW0000
+	// 00000000 00uuuuuu 000000ss TTTT0000
+	// 00000000 00rrrrrr 000000pp QQQQ0000
+	// 00000000 00oooooo 000000mm NNNN0000
+	// 00000000 00llllll 000000jj KKKK0000
+	// 00000000 00iiiiii 000000gg HHHH0000
+	// 00000000 00ffffff 000000dd EEEE0000
+	// 00000000 00cccccc 000000aa BBBB0000
+
+	const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+	// 00xxxxxx 00000000 00vvWWWW 00000000
+	// 00uuuuuu 00000000 00ssTTTT 00000000
+	// 00rrrrrr 00000000 00ppQQQQ 00000000
+	// 00oooooo 00000000 00mmNNNN 00000000
+	// 00llllll 00000000 00jjKKKK 00000000
+	// 00iiiiii 00000000 00ggHHHH 00000000
+	// 00ffffff 00000000 00ddEEEE 00000000
+	// 00cccccc 00000000 00aaBBBB 00000000
+
+	return _mm256_or_si256(t1, t3);
+	// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
+	// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
+	// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
+	// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
+	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+}
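
enc_reshuffle spreads each group of three input bytes across four byte slots and isolates the four 6-bit indices with the AND/multiply pairs shown above. The scalar equivalent of that split, with a hypothetical helper name for illustration:

    #include <stdint.h>

    // Scalar counterpart of enc_reshuffle: split one 3-byte group into four
    // 6-bit indices, matching the 00xxxxxx output layout in the comments.
    static void enc_round_scalar(const uint8_t in[3], uint8_t idx[4])
    {
        idx[0] = in[0] >> 2;                                        // top 6 bits of byte 0
        idx[1] = (uint8_t) (((in[0] & 0x03) << 4) | (in[1] >> 4));  // 2 + 4 bits
        idx[2] = (uint8_t) (((in[1] & 0x0F) << 2) | (in[2] >> 6));  // 4 + 2 bits
        idx[3] = in[2] & 0x3F;                                      // low 6 bits of byte 2
    }

The two multiplies in the vector code are just these shifts in disguise: _mm256_mulhi_epu16 by 0x0400/0x0040 shifts the upper and lower 16-bit halves right by 6 and 10, and _mm256_mullo_epi16 by 0x0100/0x0010 shifts them left by 8 and 4.
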
diff --git a/src/third-party/base64/lib/arch/avx2/enc_translate.c b/src/third-party/base64/lib/arch/avx2/enc_translate.c
new file mode 100644
index 0000000..46173cd
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/enc_translate.c
@@ -0,0 +1,30 @@
+static inline __m256i
+enc_translate (const __m256i in)
+{
+	// A lookup table containing the absolute offsets for all ranges:
+	const __m256i lut = _mm256_setr_epi8(
+		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
+		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
+
+	// Translate values 0..63 to the Base64 alphabet. There are five sets:
+	// #  From      To         Abs    Index    Characters
+	// 0  [0..25]   [65..90]   +65      0      ABCDEFGHIJKLMNOPQRSTUVWXYZ
+	// 1  [26..51]  [97..122]  +71      1      abcdefghijklmnopqrstuvwxyz
+	// 2  [52..61]  [48..57]    -4   [2..11]   0123456789
+	// 3  [62]      [43]       -19     12      +
+	// 4  [63]      [47]       -16     13      /
+
+	// Create LUT indices from the input. The index for range #0 is
+	// right, others are 1 less than expected:
+	__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
+
+	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+	const __m256i mask = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));
+
+	// Subtract -1, so add 1 to indices for range #[1..4]. All indices
+	// are now correct:
+	indices = _mm256_sub_epi8(indices, mask);
+
+	// Add offsets to input values:
+	return _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
+}
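
enc_translate avoids a full 64-entry alphabet lookup by adding a per-range offset to each 6-bit value; the 13 live entries of lut above hold exactly those offsets. A scalar sketch of the same mapping (the helper name translate_scalar is hypothetical):

    #include <stdint.h>

    // Scalar version of enc_translate's offset trick: instead of indexing a
    // 64-entry alphabet table, add a per-range delta to the 6-bit value.
    static uint8_t translate_scalar(uint8_t v)  // v in [0, 63]
    {
        if (v < 26) return (uint8_t) (v + 65);  // 'A'..'Z'
        if (v < 52) return (uint8_t) (v + 71);  // 'a'..'z'
        if (v < 62) return (uint8_t) (v - 4);   // '0'..'9'
        return (uint8_t) (v == 62 ? v - 19 : v - 16);  // '+' or '/'
    }

Spot checks against the table in the comments: 0 maps to 'A' (+65), 26 to 'a' (+71), 52 to '0' (-4), 62 to '+' (-19), and 63 to '/' (-16).
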