Diffstat (limited to 'src/third-party/base64/lib/arch/avx2/enc_loop.c')
-rw-r--r--  src/third-party/base64/lib/arch/avx2/enc_loop.c  89
1 file changed, 89 insertions, 0 deletions
diff --git a/src/third-party/base64/lib/arch/avx2/enc_loop.c b/src/third-party/base64/lib/arch/avx2/enc_loop.c
new file mode 100644
index 0000000..b9e2736
--- /dev/null
+++ b/src/third-party/base64/lib/arch/avx2/enc_loop.c
@@ -0,0 +1,89 @@
+static inline void
+enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
+{
+ // The first load is done at s - 0 rather than s - 4, so that it cannot
+ // read before the start of the input buffer:
+ __m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+ // Shift the data up by 4 bytes within the register, as if it had been
+ // loaded at an offset of -4; enc_reshuffle expects its 24 input bytes
+ // to start at byte 4 of the register:
+ src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
+
+ // Reshuffle, translate, store:
+ src = enc_reshuffle(src);
+ src = enc_translate(src);
+ _mm256_storeu_si256((__m256i *) *o, src);
+
+ // Subsequent loads will be done at s - 4; advance the source pointer by
+ // only 20 bytes (24 bytes consumed, minus the 4-byte offset) for the
+ // next round:
+ *s += 20;
+ *o += 32;
+}
+
+static inline void
+enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
+{
+ // Load input. The source pointer trails the data by 4 bytes, so the 24
+ // data bytes land at byte 4 of the register, as enc_reshuffle expects:
+ __m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+ // Reshuffle, translate, store:
+ src = enc_reshuffle(src);
+ src = enc_translate(src);
+ _mm256_storeu_si256((__m256i *) *o, src);
+
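+ // 24 bytes of input consumed, 32 bytes of output produced: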
+ *s += 24;
+ *o += 32;
+}
+
+static inline void
+enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
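+ // Nothing to do if fewer than 32 bytes of input are available: every
+ // round loads a full 32 bytes and must stay within the buffer: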
+ if (*slen < 32) {
+ return;
+ }
+
+ // Process blocks of 24 bytes at a time. Because blocks are loaded 32
+ // bytes at a time at an offset of -4, ensure that at least 4 bytes
+ // remain after the last round, so that the final read does not pass
+ // beyond the bounds of the input buffer:
+ size_t rounds = (*slen - 4) / 24;
+
+ *slen -= rounds * 24; // 24 bytes consumed per round
+ *olen += rounds * 32; // 32 bytes produced per round
+
+ // The first iteration requires special handling: unlike later rounds,
+ // it cannot read at an offset of -4, because that would underflow the
+ // input buffer. The guard at the top of the function guarantees that at
+ // least one round is available here:
+ enc_loop_avx2_inner_first(s, o);
+ rounds--;
+
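+ // Unroll the loop by hand: run batches of 8, 4, 2 or 1 rounds per
+ // iteration, stepping down as fewer rounds remain: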
+ while (rounds > 0) {
+ if (rounds >= 8) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 8;
+ continue;
+ }
+ if (rounds >= 4) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 4;
+ continue;
+ }
+ if (rounds >= 2) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 2;
+ continue;
+ }
+ enc_loop_avx2_inner(s, o);
+ break;
+ }
+
+ // Add the 4-byte offset back, so that *s points at the first input byte
+ // that was not consumed:
+ *s += 4;
+}
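
The loop's contract is that it consumes input in multiples of 24 bytes through s/slen, appends 32-byte output blocks through o/olen, and leaves the remaining tail (fewer than 32 bytes) for other code to finish. A minimal caller might look like the sketch below, which is not part of this patch; enc_all and the scalar fallback enc_loop_generic are hypothetical names used only for illustration, with the integer types coming from <stdint.h>/<stddef.h> as in the file above.

// Hypothetical driver sketch: enc_loop_generic stands in for a scalar
// encoder with the same pointer-and-length contract.
static void
enc_all (const uint8_t *src, size_t srclen, uint8_t *out, size_t *outlen)
{
        const uint8_t *s = src;
        uint8_t *o = out;
        size_t slen = srclen;
        size_t olen = 0;

        // Bulk-encode whole 24-byte blocks; all four values are updated in place:
        enc_loop_avx2(&s, &slen, &o, &olen);

        // Encode the remaining tail (fewer than 32 bytes) with scalar code:
        enc_loop_generic(&s, &slen, &o, &olen);

        *outlen = olen;
}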