Diffstat (limited to 'src/third-party/base64/lib/arch/avx2')
-rw-r--r--  src/third-party/base64/lib/arch/avx2/codec.c           42
-rw-r--r--  src/third-party/base64/lib/arch/avx2/dec_loop.c       110
-rw-r--r--  src/third-party/base64/lib/arch/avx2/dec_reshuffle.c   34
-rw-r--r--  src/third-party/base64/lib/arch/avx2/enc_loop.c        89
-rw-r--r--  src/third-party/base64/lib/arch/avx2/enc_reshuffle.c   83
-rw-r--r--  src/third-party/base64/lib/arch/avx2/enc_translate.c   30
6 files changed, 388 insertions, 0 deletions
diff --git a/src/third-party/base64/lib/arch/avx2/codec.c b/src/third-party/base64/lib/arch/avx2/codec.c
new file mode 100644
index 0000000..0498548
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_AVX2
+#include <immintrin.h>
+
+#include "dec_reshuffle.c"
+#include "dec_loop.c"
+#include "enc_translate.c"
+#include "enc_reshuffle.c"
+#include "enc_loop.c"
+
+#endif // HAVE_AVX2
+
+BASE64_ENC_FUNCTION(avx2)
+{
+#if HAVE_AVX2
+ #include "../generic/enc_head.c"
+ enc_loop_avx2(&s, &slen, &o, &olen);
+ #include "../generic/enc_tail.c"
+#else
+ BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(avx2)
+{
+#if HAVE_AVX2
+ #include "../generic/dec_head.c"
+ dec_loop_avx2(&s, &slen, &o, &olen);
+ #include "../generic/dec_tail.c"
+#else
+ BASE64_DEC_STUB
+#endif
+}
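
Note: codec.c above only wires the AVX2 loops into the library's codec table; applications reach them through the public entry points. A minimal usage sketch, assuming the base64_encode() prototype declared in include/libbase64.h (flags = 0 lets the runtime feature check in codecs.c select the AVX2 codec on capable CPUs):

#include <stdio.h>
#include <string.h>
#include "libbase64.h"

int main(void)
{
	char out[64];
	size_t outlen;
	const char *src = "hello world";

	// flags = 0: let runtime CPU-feature detection pick a codec:
	base64_encode(src, strlen(src), out, &outlen, 0);
	printf("%.*s\n", (int) outlen, out);
	return 0;
}
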
diff --git a/src/third-party/base64/lib/arch/avx2/dec_loop.c b/src/third-party/base64/lib/arch/avx2/dec_loop.c
new file mode 100644
index 0000000..f959fc4
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/dec_loop.c
@@ -0,0 +1,110 @@
+static inline int
+dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+{
+ const __m256i lut_lo = _mm256_setr_epi8(
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+ const __m256i lut_hi = _mm256_setr_epi8(
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+ const __m256i lut_roll = _mm256_setr_epi8(
+ 0, 16, 19, 4, -65, -65, -71, -71,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 16, 19, 4, -65, -65, -71, -71,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+
+ const __m256i mask_2F = _mm256_set1_epi8(0x2F);
+
+ // Load input:
+ __m256i str = _mm256_loadu_si256((__m256i *) *s);
+
+ // See the SSSE3 decoder for an explanation of the algorithm.
+ const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
+ const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
+ const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+ const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
+
+ if (!_mm256_testz_si256(lo, hi)) {
+ return 0;
+ }
+
+ const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
+ const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
+
+ // Now simply add the delta values to the input:
+ str = _mm256_add_epi8(str, roll);
+
+	// Reshuffle the input to the packed output format (12 data bytes
+	// per 128-bit lane, 24 in total):
+ str = dec_reshuffle(str);
+
+	// Store the output (a full 32-byte store; the last 8 bytes are zeros):
+ _mm256_storeu_si256((__m256i *) *o, str);
+
+ *s += 32;
+ *o += 24;
+ *rounds -= 1;
+
+ return 1;
+}
+
+static inline void
+dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+ if (*slen < 45) {
+ return;
+ }
+
+ // Process blocks of 32 bytes per round. Because 8 extra zero bytes are
+ // written after the output, ensure that there will be at least 13
+ // bytes of input data left to cover the gap. (11 data bytes and up to
+ // two end-of-string markers.)
+ size_t rounds = (*slen - 13) / 32;
+
+ *slen -= rounds * 32; // 32 bytes consumed per round
+ *olen += rounds * 24; // 24 bytes produced per round
+
+ do {
+ if (rounds >= 8) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ if (rounds >= 4) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ if (rounds >= 2) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ dec_loop_avx2_inner(s, o, &rounds);
+ break;
+
+ } while (rounds > 0);
+
+ // Adjust for any rounds that were skipped:
+ *slen += rounds * 32;
+ *olen -= rounds * 24;
+}
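
Note: the nibble-based validation and delta lookup in dec_loop_avx2_inner are easier to follow one byte at a time. Below is a scalar model (a reading aid, not part of this change; dec_byte_scalar is a hypothetical name, table values copied from above). Each lut_lo/lut_hi entry is a bitmask of invalid character classes, so a byte belongs to the Base64 alphabet only if the two masks share no set bits; the decoded value is then the byte plus a delta selected by its high nibble, with '/' (0x2F) nudged into its own slot:

#include <stdint.h>

// Returns the 6-bit value of Base64 character c, or -1 if invalid:
static int dec_byte_scalar(uint8_t c)
{
	static const uint8_t lut_lo[16] = {
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
	};
	static const uint8_t lut_hi[16] = {
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	};
	static const int8_t lut_roll[16] = {
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0,
	};

	const uint8_t hi = c >> 4;
	const uint8_t lo = c & 0x0F;

	// Mirrors the _mm256_testz_si256(lo, hi) early-out:
	if (lut_lo[lo] & lut_hi[hi]) {
		return -1;
	}

	// Mirrors _mm256_add_epi8(eq_2F, hi_nibbles) as an index, where
	// eq_2F is -1 exactly when c == '/':
	return c + lut_roll[hi - (c == 0x2F)];
}

For example, 'a' (0x61) passes the mask test and gets roll -71, decoding to 26; '=' (0x3D) hits overlapping mask bits and is rejected, which is why padding stops the vector loop.
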
diff --git a/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c b/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c
new file mode 100644
index 0000000..f351809
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/dec_reshuffle.c
@@ -0,0 +1,34 @@
+static inline __m256i
+dec_reshuffle (const __m256i in)
+{
+	// in, lower 128-bit lane (the upper lane is laid out identically);
+	// upper case are most significant bits, lower case are least
+	// significant bits:
+ // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+ // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+ // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+ // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+ const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
+ // 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+ // 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+ // 0000eeee FFffffff 0000DDDD DDddEEEE
+ // 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+ __m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
+ // 00000000 JJJJJJjj KKKKkkkk LLllllll
+ // 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+ // 00000000 DDDDDDdd EEEEeeee FFffffff
+ // 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+ // Pack bytes together in each lane:
+ out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
+ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
+ // 00000000 00000000 00000000 00000000
+ // LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+ // HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+ // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+
+ // Pack lanes:
+ return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
+}
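
Note: the two multiply-add steps in dec_reshuffle are, per 32-bit group, nothing more than shifts and ORs applied to all lanes at once. A scalar model of one 4-sextet group (illustrative only; reshuffle_group is a hypothetical name, and v[0] is the sextet from the lowest-addressed input byte):

#include <stdint.h>

static void reshuffle_group(const uint8_t v[4], uint8_t out[3])
{
	// maddubs with 0x01400140: per byte pair, first * 64 + second:
	const uint16_t ab = (uint16_t) ((v[0] << 6) | v[1]); // 12 bits
	const uint16_t cd = (uint16_t) ((v[2] << 6) | v[3]); // 12 bits

	// madd with 0x00011000: per word pair, first * 4096 + second:
	const uint32_t abcd = ((uint32_t) ab << 12) | cd;    // 24 bits

	// The byte shuffle (2, 1, 0, ...) then emits the three payload
	// bytes most significant first:
	out[0] = (uint8_t) (abcd >> 16);
	out[1] = (uint8_t) (abcd >> 8);
	out[2] = (uint8_t) abcd;
}
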
diff --git a/src/third-party/base64/lib/arch/avx2/enc_loop.c b/src/third-party/base64/lib/arch/avx2/enc_loop.c
new file mode 100644
index 0000000..b9e2736
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/enc_loop.c
@@ -0,0 +1,89 @@
+static inline void
+enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
+{
+	// The first load is done at s - 0 (not s - 4) so that the read
+	// cannot start before the input buffer:
+ __m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+ // Shift by 4 bytes, as required by enc_reshuffle:
+ src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
+
+ // Reshuffle, translate, store:
+ src = enc_reshuffle(src);
+ src = enc_translate(src);
+ _mm256_storeu_si256((__m256i *) *o, src);
+
+	// Subsequent loads will be done at s - 4; set the pointer for the next round:
+ *s += 20;
+ *o += 32;
+}
+
+static inline void
+enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
+{
+ // Load input:
+ __m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+ // Reshuffle, translate, store:
+ src = enc_reshuffle(src);
+ src = enc_translate(src);
+ _mm256_storeu_si256((__m256i *) *o, src);
+
+ *s += 24;
+ *o += 32;
+}
+
+static inline void
+enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+ if (*slen < 32) {
+ return;
+ }
+
+	// Process blocks of 24 bytes at a time. Because blocks are loaded
+	// 32 bytes at a time at an offset of -4, ensure that at least 4
+	// bytes remain after the last round, so that the final read does
+	// not pass beyond the bounds of the input buffer:
+ size_t rounds = (*slen - 4) / 24;
+
+ *slen -= rounds * 24; // 24 bytes consumed per round
+ *olen += rounds * 32; // 32 bytes produced per round
+
+ // The first loop iteration requires special handling to ensure that
+ // the read, which is done at an offset, does not underflow the buffer:
+ enc_loop_avx2_inner_first(s, o);
+ rounds--;
+
+ while (rounds > 0) {
+ if (rounds >= 8) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 8;
+ continue;
+ }
+ if (rounds >= 4) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 4;
+ continue;
+ }
+ if (rounds >= 2) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 2;
+ continue;
+ }
+ enc_loop_avx2_inner(s, o);
+ break;
+ }
+
+ // Add the offset back:
+ *s += 4;
+}
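
Note: the rounds formula above encodes the bounds argument directly: the special first round reads bytes [0, 32), each later round i reads [24*i - 4, 24*i + 28), so the final read ends at 24*rounds + 4, and rounds = (slen - 4) / 24 keeps that inside the buffer. A small self-check of the arithmetic (hypothetical test code, not part of this change):

#include <assert.h>
#include <stddef.h>

static void check_enc_loop_bounds(size_t slen)
{
	if (slen < 32) {
		return; // enc_loop_avx2 takes no vector rounds below 32 bytes
	}
	const size_t rounds = (slen - 4) / 24;

	// End of the last 32-byte load; the first round reads [0, 32),
	// inner round i reads [24 * i - 4, 24 * i + 28):
	const size_t last_load_end = (rounds == 1) ? 32 : 24 * rounds + 4;
	assert(last_load_end <= slen);
}
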
diff --git a/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c b/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c
new file mode 100644
index 0000000..ba16690
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/enc_reshuffle.c
@@ -0,0 +1,83 @@
+static inline __m256i
+enc_reshuffle (const __m256i input)
+{
+	// Translation of the SSSE3 reshuffling algorithm to AVX2. This
+	// version takes its input shifted by 4 bytes so that it can operate
+	// efficiently within the two 128-bit lanes.
+
+ // Input, bytes MSB to LSB:
+ // 0 0 0 0 x w v u t s r q p o n m
+ // l k j i h g f e d c b a 0 0 0 0
+
+ const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
+ 10, 11, 9, 10,
+ 7, 8, 6, 7,
+ 4, 5, 3, 4,
+ 1, 2, 0, 1,
+
+ 14, 15, 13, 14,
+ 11, 12, 10, 11,
+ 8, 9, 7, 8,
+ 5, 6, 4, 5));
+ // in, bytes MSB to LSB:
+ // w x v w
+ // t u s t
+ // q r p q
+ // n o m n
+ // k l j k
+ // h i g h
+ // e f d e
+ // b c a b
+
+ const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0FC0FC00));
+ // bits, upper case are most significant bits, lower case are least
+ // significant bits.
+ // 0000wwww XX000000 VVVVVV00 00000000
+ // 0000tttt UU000000 SSSSSS00 00000000
+ // 0000qqqq RR000000 PPPPPP00 00000000
+ // 0000nnnn OO000000 MMMMMM00 00000000
+ // 0000kkkk LL000000 JJJJJJ00 00000000
+ // 0000hhhh II000000 GGGGGG00 00000000
+ // 0000eeee FF000000 DDDDDD00 00000000
+ // 0000bbbb CC000000 AAAAAA00 00000000
+
+ const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+ // 00000000 00wwwwXX 00000000 00VVVVVV
+ // 00000000 00ttttUU 00000000 00SSSSSS
+ // 00000000 00qqqqRR 00000000 00PPPPPP
+ // 00000000 00nnnnOO 00000000 00MMMMMM
+ // 00000000 00kkkkLL 00000000 00JJJJJJ
+ // 00000000 00hhhhII 00000000 00GGGGGG
+ // 00000000 00eeeeFF 00000000 00DDDDDD
+ // 00000000 00bbbbCC 00000000 00AAAAAA
+
+ const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003F03F0));
+ // 00000000 00xxxxxx 000000vv WWWW0000
+ // 00000000 00uuuuuu 000000ss TTTT0000
+ // 00000000 00rrrrrr 000000pp QQQQ0000
+ // 00000000 00oooooo 000000mm NNNN0000
+ // 00000000 00llllll 000000jj KKKK0000
+ // 00000000 00iiiiii 000000gg HHHH0000
+ // 00000000 00ffffff 000000dd EEEE0000
+ // 00000000 00cccccc 000000aa BBBB0000
+
+ const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+ // 00xxxxxx 00000000 00vvWWWW 00000000
+ // 00uuuuuu 00000000 00ssTTTT 00000000
+ // 00rrrrrr 00000000 00ppQQQQ 00000000
+ // 00oooooo 00000000 00mmNNNN 00000000
+ // 00llllll 00000000 00jjKKKK 00000000
+ // 00iiiiii 00000000 00ggHHHH 00000000
+ // 00ffffff 00000000 00ddEEEE 00000000
+ // 00cccccc 00000000 00aaBBBB 00000000
+
+ return _mm256_or_si256(t1, t3);
+ // 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
+ // 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
+ // 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
+ // 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
+ // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+ // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+ // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+ // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+}
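
Note: multiplying by a power of two and keeping either the high half (mulhi) or the low half (mullo) of each 16-bit word is just a per-word shift, so the masked-multiply steps above reduce to plain shifting in scalar code. A model of one 32-bit lane (illustrative only; enc_split_sextets is a hypothetical name, and the input dword uses the post-shuffle byte layout from the first diagram):

#include <stdint.h>

static uint32_t enc_split_sextets(uint32_t in)
{
	// t1: fields moving right. mulhi by 0x0400 (high word) and
	// 0x0040 (low word) shifts them right by 6 and 10 respectively:
	const uint32_t t0 = in & 0x0FC0FC00u;
	const uint32_t t1 = (((t0 >> 16) >> 6) << 16) | ((t0 & 0xFFFFu) >> 10);

	// t3: fields moving left. mullo by 0x0100 (high word) and
	// 0x0010 (low word) shifts them left by 8 and 4 respectively:
	const uint32_t t2 = in & 0x003F03F0u;
	const uint32_t t3 = (((t2 >> 16) << 8) << 16) | ((t2 & 0xFFFFu) << 4);

	// Each output byte now holds one 6-bit index (00xxxxxx):
	return t1 | t3;
}
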
diff --git a/src/third-party/base64/lib/arch/avx2/enc_translate.c b/src/third-party/base64/lib/arch/avx2/enc_translate.c
new file mode 100644
index 0000000..46173cd
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/enc_translate.c
@@ -0,0 +1,30 @@
+static inline __m256i
+enc_translate (const __m256i in)
+{
+ // A lookup table containing the absolute offsets for all ranges:
+ const __m256i lut = _mm256_setr_epi8(
+ 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
+ 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
+
+ // Translate values 0..63 to the Base64 alphabet. There are five sets:
+ // # From To Abs Index Characters
+ // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ
+ // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz
+ // 2 [52..61] [48..57] -4 [2..11] 0123456789
+ // 3 [62] [43] -19 12 +
+ // 4 [63] [47] -16 13 /
+
+	// Create LUT indices from the input. The index for range #0 is
+	// already correct; the others are one less than expected:
+ __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
+
+ // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+ const __m256i mask = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));
+
+	// Subtract the mask (-1), i.e. add 1 to the indices for ranges
+	// #[1..4]. All indices are now correct:
+ indices = _mm256_sub_epi8(indices, mask);
+
+ // Add offsets to input values:
+ return _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
+}
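
Note: the same translation, one value at a time (a scalar reading aid; enc_translate_scalar is a hypothetical name, lut entries copied from above). The saturating subtract and the signed compare together map each of the five ranges onto its own LUT slot:

#include <stdint.h>

// v is a 6-bit value in [0..63]; returns the Base64 character:
static char enc_translate_scalar(uint8_t v)
{
	static const int8_t lut[16] = {
		65, 71, -4, -4, -4, -4, -4, -4,
		-4, -4, -4, -4, -19, -16, 0, 0,
	};

	// _mm256_subs_epu8(in, 51): 0 for v <= 51, else v - 51:
	uint8_t index = (v <= 51) ? 0 : (uint8_t) (v - 51);

	// The mask from _mm256_cmpgt_epi8(in, 25), subtracted as -1,
	// adds 1 for every v > 25:
	index += (v > 25);

	return (char) (v + lut[index]);
}

For example, v = 62 gives index 12 and offset -19, producing '+' (43); v = 0 stays at index 0 and offset +65, producing 'A'.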