Diffstat (limited to 'src/third-party/base64/lib/arch/avx2/dec_loop.c')
-rw-r--r--  src/third-party/base64/lib/arch/avx2/dec_loop.c  110
1 file changed, 110 insertions, 0 deletions
diff --git a/src/third-party/base64/lib/arch/avx2/dec_loop.c b/src/third-party/base64/lib/arch/avx2/dec_loop.c
new file mode 100644
index 0000000..f959fc4
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/dec_loop.c
@@ -0,0 +1,110 @@
+static inline int
+dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+{
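+ // The two nibble LUTs classify each input byte: lut_lo is indexed by
+ // the low nibble and lut_hi by the high nibble, and the two lookups
+ // share a set bit only for bytes outside the base64 alphabet. This
+ // lets a single AND/zero-test reject invalid input 32 bytes at a time.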
+ const __m256i lut_lo = _mm256_setr_epi8(
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+ const __m256i lut_hi = _mm256_setr_epi8(
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+ const __m256i lut_roll = _mm256_setr_epi8(
+ 0, 16, 19, 4, -65, -65, -71, -71,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 16, 19, 4, -65, -65, -71, -71,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+
+ const __m256i mask_2F = _mm256_set1_epi8(0x2F);
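+ // 0x2F does double duty: it is the nibble mask for the two shuffles
+ // (the byte shuffle ignores bits 4..6 of each index byte) and the
+ // comparison constant for '/' (ASCII 0x2F) further down.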
+
+ // Load input:
+ __m256i str = _mm256_loadu_si256((__m256i *) *s);
+
+ // See the SSSE3 decoder for an explanation of the algorithm.
+ const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
+ const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
+ const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+ const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
+
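+ // A byte is invalid when its lut_lo and lut_hi entries share a set
+ // bit; the zero test below catches any such byte in the vector: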
+ if (!_mm256_testz_si256(lo, hi)) {
+ return 0;
+ }
+
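+ // Look up a per-class delta in lut_roll, indexed by the high nibble.
+ // Adding eq_2F (0xFF for '/', 0x00 otherwise) shifts '/' down to its
+ // own slot, so '/' gets +16 while '+' (same high nibble) gets +19.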
+ const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
+ const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
+
+ // Now simply add the delta values to the input:
+ str = _mm256_add_epi8(str, roll);
+
+ // Reshuffle the input to the packed output format (12 decoded bytes
+ // per 128-bit lane, 24 bytes in total):
+ str = dec_reshuffle(str);
+
+ // Store the output:
+ _mm256_storeu_si256((__m256i *) *o, str);
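+ // The full 32-byte store writes 8 zero bytes past the 24 decoded
+ // bytes; the outer loop reserves enough input slack to make this safe.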
+
+ *s += 32;
+ *o += 24;
+ *rounds -= 1;
+
+ return 1;
+}
+
+static inline void
+dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
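+ // 45 = one 32-byte block plus the 13 bytes of slack required below.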
+ if (*slen < 45) {
+ return;
+ }
+
+ // Process blocks of 32 bytes per round. Because 8 extra zero bytes are
+ // written after the output, ensure that there will be at least 13
+ // bytes of input data left to cover the gap. (11 data bytes and up to
+ // two end-of-string markers.)
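+ // For example, *slen == 109 gives rounds == (109 - 13) / 32 == 3:
+ // 96 input bytes consumed, 72 output bytes produced, 13 left over.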
+ size_t rounds = (*slen - 13) / 32;
+
+ *slen -= rounds * 32; // 32 bytes consumed per round
+ *olen += rounds * 24; // 24 bytes produced per round
+
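+ // The 8/4/2/1 ladder below unrolls the hot path. On the first block
+ // that fails validation, dec_loop_avx2_inner() returns 0 without
+ // advancing the pointers, and the leftover value of `rounds` is
+ // folded back into *slen and *olen at the end of the function.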
+ do {
+ if (rounds >= 8) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ if (rounds >= 4) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ if (rounds >= 2) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ dec_loop_avx2_inner(s, o, &rounds);
+ break;
+
+ } while (rounds > 0);
+
+ // Adjust for any rounds that were skipped:
+ *slen += rounds * 32;
+ *olen -= rounds * 24;
+}