diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 20:01:36 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 20:01:36 +0000 |
commit | 62e4c68907d8d33709c2c1f92a161dff00b3d5f2 (patch) | |
tree | adbbaf3acf88ea08f6eeec4b75ee98ad3b07fbdc /src/third-party/base64/lib/arch/ssse3 | |
parent | Initial commit. (diff) | |
download | lnav-62e4c68907d8d33709c2c1f92a161dff00b3d5f2.tar.xz lnav-62e4c68907d8d33709c2c1f92a161dff00b3d5f2.zip |
Adding upstream version 0.11.2.upstream/0.11.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/third-party/base64/lib/arch/ssse3')
-rw-r--r-- | src/third-party/base64/lib/arch/ssse3/codec.c | 42 | ||||
-rw-r--r-- | src/third-party/base64/lib/arch/ssse3/dec_loop.c | 173 | ||||
-rw-r--r-- | src/third-party/base64/lib/arch/ssse3/dec_reshuffle.c | 33 | ||||
-rw-r--r-- | src/third-party/base64/lib/arch/ssse3/enc_loop.c | 67 | ||||
-rw-r--r-- | src/third-party/base64/lib/arch/ssse3/enc_reshuffle.c | 48 | ||||
-rw-r--r-- | src/third-party/base64/lib/arch/ssse3/enc_translate.c | 33 |
6 files changed, 396 insertions, 0 deletions
diff --git a/src/third-party/base64/lib/arch/ssse3/codec.c b/src/third-party/base64/lib/arch/ssse3/codec.c new file mode 100644 index 0000000..ad14a45 --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/codec.c @@ -0,0 +1,42 @@ +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#if HAVE_SSSE3 +#include <tmmintrin.h> + +#include "dec_reshuffle.c" +#include "dec_loop.c" +#include "enc_reshuffle.c" +#include "enc_translate.c" +#include "enc_loop.c" + +#endif // HAVE_SSSE3 + +BASE64_ENC_FUNCTION(ssse3) +{ +#if HAVE_SSSE3 + #include "../generic/enc_head.c" + enc_loop_ssse3(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(ssse3) +{ +#if HAVE_SSSE3 + #include "../generic/dec_head.c" + dec_loop_ssse3(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/src/third-party/base64/lib/arch/ssse3/dec_loop.c b/src/third-party/base64/lib/arch/ssse3/dec_loop.c new file mode 100644 index 0000000..9da71ab --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/dec_loop.c @@ -0,0 +1,173 @@ +// The input consists of six character sets in the Base64 alphabet, which we +// need to map back to the 6-bit values they represent. There are three ranges, +// two singles, and then there's the rest. +// +// # From To Add Characters +// 1 [43] [62] +19 + +// 2 [47] [63] +16 / +// 3 [48..57] [52..61] +4 0..9 +// 4 [65..90] [0..25] -65 A..Z +// 5 [97..122] [26..51] -71 a..z +// (6) Everything else => invalid input +// +// We will use lookup tables for character validation and offset computation. +// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this +// allows to mask with 0x2F instead of 0x0F and thus save one constant +// declaration (register and/or memory access). +// +// For offsets: +// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00) +// 0000 = garbage +// 0001 = / +// 0010 = + +// 0011 = 0-9 +// 0100 = A-Z +// 0101 = A-Z +// 0110 = a-z +// 0111 = a-z +// 1000 >= garbage +// +// For validation, here's the table. +// A character is valid if and only if the AND of the 2 lookups equals 0: +// +// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 +// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A +// +// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI +// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// +// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US +// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// +// 0010 0x01 char ! " # $ % & ' ( ) * + , - . / +// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00 +// +// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ? +// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02 +// +// 0100 0x04 char @ A B C D E F G H I J K L M N O +// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +// +// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _ +// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08 +// +// 0110 0x04 char ` a b c d e f g h i j k l m n o +// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +// 0111 0x08 char p q r s t u v w x y z { | } ~ +// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08 +// +// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 +// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 + +static inline int +dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds) +{ + const __m128i lut_lo = _mm_setr_epi8( + 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); + + const __m128i lut_hi = _mm_setr_epi8( + 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); + + const __m128i lut_roll = _mm_setr_epi8( + 0, 16, 19, 4, -65, -65, -71, -71, + 0, 0, 0, 0, 0, 0, 0, 0); + + const __m128i mask_2F = _mm_set1_epi8(0x2F); + + // Load input: + __m128i str = _mm_loadu_si128((__m128i *) *s); + + // Table lookups: + const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F); + const __m128i lo_nibbles = _mm_and_si128(str, mask_2F); + const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles); + const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles); + + // Check for invalid input: if any "and" values from lo and hi are not + // zero, fall back on bytewise code to do error checking and reporting: + if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) { + return 0; + } + + const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F); + const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles)); + + // Now simply add the delta values to the input: + str = _mm_add_epi8(str, roll); + + // Reshuffle the input to packed 12-byte output format: + str = dec_reshuffle(str); + + // Store the output: + _mm_storeu_si128((__m128i *) *o, str); + + *s += 16; + *o += 12; + *rounds -= 1; + + return 1; +} + +static inline void +dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 24) { + return; + } + + // Process blocks of 16 bytes per round. Because 4 extra zero bytes are + // written after the output, ensure that there will be at least 8 bytes + // of input data left to cover the gap. (6 data bytes and up to two + // end-of-string markers.) + size_t rounds = (*slen - 8) / 16; + + *slen -= rounds * 16; // 16 bytes consumed per round + *olen += rounds * 12; // 12 bytes produced per round + + do { + if (rounds >= 8) { + if (dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 4) { + if (dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds)) { + continue; + } + break; + } + if (rounds >= 2) { + if (dec_loop_ssse3_inner(s, o, &rounds) && + dec_loop_ssse3_inner(s, o, &rounds)) { + continue; + } + break; + } + dec_loop_ssse3_inner(s, o, &rounds); + break; + + } while (rounds > 0); + + // Adjust for any rounds that were skipped: + *slen += rounds * 16; + *olen -= rounds * 12; +} diff --git a/src/third-party/base64/lib/arch/ssse3/dec_reshuffle.c b/src/third-party/base64/lib/arch/ssse3/dec_reshuffle.c new file mode 100644 index 0000000..fdf587f --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/dec_reshuffle.c @@ -0,0 +1,33 @@ +static inline __m128i +dec_reshuffle (const __m128i in) +{ + // in, bits, upper case are most significant bits, lower case are least significant bits + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA + + const __m128i merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140)); + // 0000kkkk LLllllll 0000JJJJ JJjjKKKK + // 0000hhhh IIiiiiii 0000GGGG GGggHHHH + // 0000eeee FFffffff 0000DDDD DDddEEEE + // 0000bbbb CCcccccc 0000AAAA AAaaBBBB + + const __m128i out = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000)); + // 00000000 JJJJJJjj KKKKkkkk LLllllll + // 00000000 GGGGGGgg HHHHhhhh IIiiiiii + // 00000000 DDDDDDdd EEEEeeee FFffffff + // 00000000 AAAAAAaa BBBBbbbb CCcccccc + + // Pack bytes together: + return _mm_shuffle_epi8(out, _mm_setr_epi8( + 2, 1, 0, + 6, 5, 4, + 10, 9, 8, + 14, 13, 12, + -1, -1, -1, -1)); + // 00000000 00000000 00000000 00000000 + // LLllllll KKKKkkkk JJJJJJjj IIiiiiii + // HHHHhhhh GGGGGGgg FFffffff EEEEeeee + // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa +} diff --git a/src/third-party/base64/lib/arch/ssse3/enc_loop.c b/src/third-party/base64/lib/arch/ssse3/enc_loop.c new file mode 100644 index 0000000..6de652e --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/enc_loop.c @@ -0,0 +1,67 @@ +static inline void +enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o) +{ + // Load input: + __m128i str = _mm_loadu_si128((__m128i *) *s); + + // Reshuffle: + str = enc_reshuffle(str); + + // Translate reshuffled bytes to the Base64 alphabet: + str = enc_translate(str); + + // Store: + _mm_storeu_si128((__m128i *) *o, str); + + *s += 12; + *o += 16; +} + +static inline void +enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 16) { + return; + } + + // Process blocks of 12 bytes at a time. Because blocks are loaded 16 + // bytes at a time, ensure that there will be at least 4 remaining + // bytes after the last round, so that the final read will not pass + // beyond the bounds of the input buffer: + size_t rounds = (*slen - 4) / 12; + + *slen -= rounds * 12; // 12 bytes consumed per round + *olen += rounds * 16; // 16 bytes produced per round + + do { + if (rounds >= 8) { + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_ssse3_inner(s, o); + enc_loop_ssse3_inner(s, o); + rounds -= 2; + continue; + } + enc_loop_ssse3_inner(s, o); + break; + + } while (rounds > 0); +} diff --git a/src/third-party/base64/lib/arch/ssse3/enc_reshuffle.c b/src/third-party/base64/lib/arch/ssse3/enc_reshuffle.c new file mode 100644 index 0000000..b738591 --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/enc_reshuffle.c @@ -0,0 +1,48 @@ +static inline __m128i +enc_reshuffle (__m128i in) +{ + // Input, bytes MSB to LSB: + // 0 0 0 0 l k j i h g f e d c b a + + in = _mm_shuffle_epi8(in, _mm_set_epi8( + 10, 11, 9, 10, + 7, 8, 6, 7, + 4, 5, 3, 4, + 1, 2, 0, 1)); + // in, bytes MSB to LSB: + // k l j k + // h i g h + // e f d e + // b c a b + + const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0FC0FC00)); + // bits, upper case are most significant bits, lower case are least significant bits + // 0000kkkk LL000000 JJJJJJ00 00000000 + // 0000hhhh II000000 GGGGGG00 00000000 + // 0000eeee FF000000 DDDDDD00 00000000 + // 0000bbbb CC000000 AAAAAA00 00000000 + + const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040)); + // 00000000 00kkkkLL 00000000 00JJJJJJ + // 00000000 00hhhhII 00000000 00GGGGGG + // 00000000 00eeeeFF 00000000 00DDDDDD + // 00000000 00bbbbCC 00000000 00AAAAAA + + const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003F03F0)); + // 00000000 00llllll 000000jj KKKK0000 + // 00000000 00iiiiii 000000gg HHHH0000 + // 00000000 00ffffff 000000dd EEEE0000 + // 00000000 00cccccc 000000aa BBBB0000 + + const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010)); + // 00llllll 00000000 00jjKKKK 00000000 + // 00iiiiii 00000000 00ggHHHH 00000000 + // 00ffffff 00000000 00ddEEEE 00000000 + // 00cccccc 00000000 00aaBBBB 00000000 + + return _mm_or_si128(t1, t3); + // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ + // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG + // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD + // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA +} diff --git a/src/third-party/base64/lib/arch/ssse3/enc_translate.c b/src/third-party/base64/lib/arch/ssse3/enc_translate.c new file mode 100644 index 0000000..04f288f --- /dev/null +++ b/src/third-party/base64/lib/arch/ssse3/enc_translate.c @@ -0,0 +1,33 @@ +static inline __m128i +enc_translate (const __m128i in) +{ + // A lookup table containing the absolute offsets for all ranges: + const __m128i lut = _mm_setr_epi8( + 65, 71, -4, -4, + -4, -4, -4, -4, + -4, -4, -4, -4, + -19, -16, 0, 0 + ); + + // Translate values 0..63 to the Base64 alphabet. There are five sets: + // # From To Abs Index Characters + // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ + // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz + // 2 [52..61] [48..57] -4 [2..11] 0123456789 + // 3 [62] [43] -19 12 + + // 4 [63] [47] -16 13 / + + // Create LUT indices from the input. The index for range #0 is right, + // others are 1 less than expected: + __m128i indices = _mm_subs_epu8(in, _mm_set1_epi8(51)); + + // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0: + __m128i mask = _mm_cmpgt_epi8(in, _mm_set1_epi8(25)); + + // Subtract -1, so add 1 to indices for range #[1..4]. All indices are + // now correct: + indices = _mm_sub_epi8(indices, mask); + + // Add offsets to input values: + return _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices)); +} |