path: root/src/third-party/base64/lib/arch/neon32
Diffstat (limited to 'src/third-party/base64/lib/arch/neon32')
-rw-r--r--  src/third-party/base64/lib/arch/neon32/codec.c           77
-rw-r--r--  src/third-party/base64/lib/arch/neon32/dec_loop.c       106
-rw-r--r--  src/third-party/base64/lib/arch/neon32/enc_loop.c       169
-rw-r--r--  src/third-party/base64/lib/arch/neon32/enc_reshuffle.c   31
-rw-r--r--  src/third-party/base64/lib/arch/neon32/enc_translate.c   57
5 files changed, 440 insertions, 0 deletions
diff --git a/src/third-party/base64/lib/arch/neon32/codec.c b/src/third-party/base64/lib/arch/neon32/codec.c
new file mode 100644
index 0000000..a0b27f9
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon32/codec.c
@@ -0,0 +1,77 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#ifdef __arm__
+# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
+# define BASE64_USE_NEON32
+# endif
+#endif
+
+#ifdef BASE64_USE_NEON32
+#include <arm_neon.h>
+
+// Only enable inline assembly on supported compilers.
+#if defined(__GNUC__) || defined(__clang__)
+#define BASE64_NEON32_USE_ASM
+#endif
+
+static inline uint8x16_t
+vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
+{
+ // NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
+ // the NEON64 `vqtbl1q_u8` intrinsic to do 128-bit wide lookups.
+ uint8x8x2_t lut2;
+ uint8x8x2_t result;
+
+ lut2.val[0] = vget_low_u8(lut);
+ lut2.val[1] = vget_high_u8(lut);
+
+ result.val[0] = vtbl2_u8(lut2, vget_low_u8(indices));
+ result.val[1] = vtbl2_u8(lut2, vget_high_u8(indices));
+
+ return vcombine_u8(result.val[0], result.val[1]);
+}
+
+#include "../generic/32/dec_loop.c"
+#include "../generic/32/enc_loop.c"
+#include "dec_loop.c"
+#include "enc_reshuffle.c"
+#include "enc_translate.c"
+#include "enc_loop.c"
+
+#endif // BASE64_USE_NEON32
+
+// Stride size is so large on these NEON 32-bit functions
+// (48 bytes encode, 64 bytes decode) that we inline the
+// uint32 codec to stay performant on smaller inputs.
+
+BASE64_ENC_FUNCTION(neon32)
+{
+#ifdef BASE64_USE_NEON32
+ #include "../generic/enc_head.c"
+ enc_loop_neon32(&s, &slen, &o, &olen);
+ enc_loop_generic_32(&s, &slen, &o, &olen);
+ #include "../generic/enc_tail.c"
+#else
+ BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(neon32)
+{
+#ifdef BASE64_USE_NEON32
+ #include "../generic/dec_head.c"
+ dec_loop_neon32(&s, &slen, &o, &olen);
+ dec_loop_generic_32(&s, &slen, &o, &olen);
+ #include "../generic/dec_tail.c"
+#else
+ BASE64_DEC_STUB
+#endif
+}
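The vqtbl1q_u8() emulation above splits the 128-bit table into two 64-bit halves and performs two vtbl2_u8 lookups per 16-byte index vector. As a minimal portable sketch of the per-byte semantics it reproduces (scalar_tbl16 is a hypothetical illustration, not part of the library): indices 0..15 select a table byte, and out-of-range indices yield zero, matching both the AArch32 VTBL instruction and the AArch64 vqtbl1q_u8 intrinsic.

#include <stdint.h>

// Hypothetical scalar illustration of the emulated 128-bit table lookup:
// each index selects a byte from the 16-byte table, out-of-range indices
// produce zero.
static void scalar_tbl16(uint8_t out[16], const uint8_t lut[16], const uint8_t idx[16])
{
	for (int i = 0; i < 16; i++) {
		out[i] = (idx[i] < 16) ? lut[idx[i]] : 0;
	}
}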
diff --git a/src/third-party/base64/lib/arch/neon32/dec_loop.c b/src/third-party/base64/lib/arch/neon32/dec_loop.c
new file mode 100644
index 0000000..2216b39
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon32/dec_loop.c
@@ -0,0 +1,106 @@
+static inline int
+is_nonzero (const uint8x16_t v)
+{
+ uint64_t u64;
+ const uint64x2_t v64 = vreinterpretq_u64_u8(v);
+ const uint32x2_t v32 = vqmovn_u64(v64);
+
+ vst1_u64(&u64, vreinterpret_u64_u32(v32));
+ return u64 != 0;
+}
+
+static inline uint8x16_t
+delta_lookup (const uint8x16_t v)
+{
+ const uint8x8_t lut = {
+ 0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
+ };
+
+ return vcombine_u8(
+ vtbl1_u8(lut, vget_low_u8(v)),
+ vtbl1_u8(lut, vget_high_u8(v)));
+}
+
+static inline uint8x16_t
+dec_loop_neon32_lane (uint8x16_t *lane)
+{
+ // See the SSSE3 decoder for an explanation of the algorithm.
+ const uint8x16_t lut_lo = {
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
+ };
+
+ const uint8x16_t lut_hi = {
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
+ };
+
+ const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
+ const uint8x16_t mask_2F = vdupq_n_u8(0x2F);
+
+ const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
+ const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
+ const uint8x16_t eq_2F = vceqq_u8(*lane, mask_2F);
+
+ const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
+ const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
+
+ // Now simply add the delta values to the input:
+ *lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));
+
+ // Return a mask that is nonzero for invalid input bytes:
+ return vandq_u8(lo, hi);
+}
+
+static inline void
+dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+ if (*slen < 64) {
+ return;
+ }
+
+ // Process blocks of 64 bytes per round. Unlike the SSE codecs, no
+ // extra trailing zero bytes are written, so it is not necessary to
+ // reserve extra input bytes:
+ size_t rounds = *slen / 64;
+
+ *slen -= rounds * 64; // 64 bytes consumed per round
+ *olen += rounds * 48; // 48 bytes produced per round
+
+ do {
+ uint8x16x3_t dec;
+
+ // Load 64 bytes and deinterleave:
+ uint8x16x4_t str = vld4q_u8(*s);
+
+ // Decode each lane, collect a mask of invalid inputs:
+ const uint8x16_t classified
+ = dec_loop_neon32_lane(&str.val[0])
+ | dec_loop_neon32_lane(&str.val[1])
+ | dec_loop_neon32_lane(&str.val[2])
+ | dec_loop_neon32_lane(&str.val[3]);
+
+ // Check for invalid input: if the classification mask is nonzero,
+ // at least one byte was invalid; fall back on bytewise code to do
+ // error checking and reporting:
+ if (is_nonzero(classified)) {
+ break;
+ }
+
+ // Compress four bytes into three:
+ dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
+ dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
+ dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
+
+ // Interleave and store decoded result:
+ vst3q_u8(*o, dec);
+
+ *s += 64;
+ *o += 48;
+
+ } while (--rounds > 0);
+
+ // Adjust for any rounds that were skipped:
+ *slen += rounds * 64;
+ *olen -= rounds * 48;
+}
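The lane decoder above classifies each byte by its high and low nibbles and then maps it to its 6-bit value by adding a per-range delta from delta_lookup(). A scalar sketch of that delta arithmetic, for bytes already known to be valid (the lut_lo/lut_hi validity check is omitted; scalar_decode_byte is a hypothetical illustration, not library code):

#include <stdint.h>

// Delta selected by the high nibble, shifted down by one slot for '/',
// exactly as delta_lookup() does via the eq_2F adjustment. Validity
// checking is omitted; the guard only prevents out-of-bounds indexing.
static uint8_t scalar_decode_byte(uint8_t c)
{
	static const int8_t delta[8] = { 0, 16, 19, 4, -65, -65, -71, -71 };
	int idx = (c >> 4) - (c == 0x2F ? 1 : 0);

	if (idx > 7) {
		return 0xFF; // high nibble outside any base64 range
	}
	return (uint8_t) (c + delta[idx]);
}

With this mapping, 'A' (0x41) decodes to 0, 'a' (0x61) to 26, '0' (0x30) to 52, '+' to 62 and '/' to 63, matching the delta table { 0, 16, 19, 4, -65, -65, -71, -71 }.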
diff --git a/src/third-party/base64/lib/arch/neon32/enc_loop.c b/src/third-party/base64/lib/arch/neon32/enc_loop.c
new file mode 100644
index 0000000..e9e8e28
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon32/enc_loop.c
@@ -0,0 +1,169 @@
+#ifdef BASE64_NEON32_USE_ASM
+static inline void
+enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
+{
+ // This function duplicates the functionality of enc_loop_neon32_inner,
+ // but entirely with inline assembly. This gives a significant speedup
+ // over using NEON intrinsics, which do not always generate very good
+ // code. The logic of the assembly is directly lifted from the
+ // intrinsics version, so it can be used as a guide to this code.
+
+ // Temporary registers, used as scratch space.
+ uint8x16_t tmp0, tmp1, tmp2, tmp3;
+ uint8x16_t mask0, mask1, mask2, mask3;
+
+ // A lookup table containing the absolute offsets for all ranges.
+ const uint8x16_t lut = {
+ 65U, 71U, 252U, 252U,
+ 252U, 252U, 252U, 252U,
+ 252U, 252U, 252U, 252U,
+ 237U, 240U, 0U, 0U
+ };
+
+ // Numeric constants.
+ const uint8x16_t n51 = vdupq_n_u8(51);
+ const uint8x16_t n25 = vdupq_n_u8(25);
+ const uint8x16_t n63 = vdupq_n_u8(63);
+
+ __asm__ (
+
+ // Load 48 bytes and deinterleave. The bytes are loaded to
+ // hard-coded registers q12, q13 and q14, to ensure that they
+ // are contiguous. Increment the source pointer.
+ "vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
+ "vld3.8 {d25, d27, d29}, [%[src]]! \n\t"
+
+ // Reshuffle the bytes using temporaries.
+ "vshr.u8 %q[t0], q12, #2 \n\t"
+ "vshr.u8 %q[t1], q13, #4 \n\t"
+ "vshr.u8 %q[t2], q14, #6 \n\t"
+ "vsli.8 %q[t1], q12, #4 \n\t"
+ "vsli.8 %q[t2], q13, #2 \n\t"
+ "vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
+ "vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
+ "vand.u8 %q[t3], q14, %q[n63] \n\t"
+
+ // t0..t3 are the reshuffled inputs. Create LUT indices.
+ "vqsub.u8 q12, %q[t0], %q[n51] \n\t"
+ "vqsub.u8 q13, %q[t1], %q[n51] \n\t"
+ "vqsub.u8 q14, %q[t2], %q[n51] \n\t"
+ "vqsub.u8 q15, %q[t3], %q[n51] \n\t"
+
+ // Create the masks: 0xFF (-1) for ranges #1..4 (values > 25), 0x00 for range #0.
+ "vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
+ "vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
+ "vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
+ "vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"
+
+ // Subtract -1 (i.e. add 1) to the indices for ranges #1..4; all indices are now correct.
+ "vsub.u8 q12, %q[m0] \n\t"
+ "vsub.u8 q13, %q[m1] \n\t"
+ "vsub.u8 q14, %q[m2] \n\t"
+ "vsub.u8 q15, %q[m3] \n\t"
+
+ // Lookup the delta values.
+ "vtbl.u8 d24, {%q[lut]}, d24 \n\t"
+ "vtbl.u8 d25, {%q[lut]}, d25 \n\t"
+ "vtbl.u8 d26, {%q[lut]}, d26 \n\t"
+ "vtbl.u8 d27, {%q[lut]}, d27 \n\t"
+ "vtbl.u8 d28, {%q[lut]}, d28 \n\t"
+ "vtbl.u8 d29, {%q[lut]}, d29 \n\t"
+ "vtbl.u8 d30, {%q[lut]}, d30 \n\t"
+ "vtbl.u8 d31, {%q[lut]}, d31 \n\t"
+
+ // Add the delta values.
+ "vadd.u8 q12, %q[t0] \n\t"
+ "vadd.u8 q13, %q[t1] \n\t"
+ "vadd.u8 q14, %q[t2] \n\t"
+ "vadd.u8 q15, %q[t3] \n\t"
+
+ // Store 64 bytes and interleave. Increment the dest pointer.
+ "vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
+ "vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"
+
+ // Outputs (modified).
+ : [src] "+r" (*s),
+ [dst] "+r" (*o),
+ [t0] "=&w" (tmp0),
+ [t1] "=&w" (tmp1),
+ [t2] "=&w" (tmp2),
+ [t3] "=&w" (tmp3),
+ [m0] "=&w" (mask0),
+ [m1] "=&w" (mask1),
+ [m2] "=&w" (mask2),
+ [m3] "=&w" (mask3)
+
+ // Inputs (not modified).
+ : [lut] "w" (lut),
+ [n25] "w" (n25),
+ [n51] "w" (n51),
+ [n63] "w" (n63)
+
+ // Clobbers.
+ : "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+ );
+}
+#endif
+
+static inline void
+enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
+{
+#ifdef BASE64_NEON32_USE_ASM
+ enc_loop_neon32_inner_asm(s, o);
+#else
+ // Load 48 bytes and deinterleave:
+ uint8x16x3_t src = vld3q_u8(*s);
+
+ // Reshuffle:
+ uint8x16x4_t out = enc_reshuffle(src);
+
+ // Translate reshuffled bytes to the Base64 alphabet:
+ out = enc_translate(out);
+
+ // Interleave and store output:
+ vst4q_u8(*o, out);
+
+ *s += 48;
+ *o += 64;
+#endif
+}
+
+static inline void
+enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+ size_t rounds = *slen / 48;
+
+ *slen -= rounds * 48; // 48 bytes consumed per round
+ *olen += rounds * 64; // 64 bytes produced per round
+
+ while (rounds > 0) {
+ if (rounds >= 8) {
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ rounds -= 8;
+ continue;
+ }
+ if (rounds >= 4) {
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ rounds -= 4;
+ continue;
+ }
+ if (rounds >= 2) {
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ rounds -= 2;
+ continue;
+ }
+ enc_loop_neon32_inner(s, o);
+ break;
+ }
+}
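Each inner-loop round above consumes 48 input bytes and emits 64 base64 characters; the manual 8/4/2/1 unrolling only changes how many rounds are issued per branch. As a point of reference, a plain scalar equivalent of what one round computes might look like the sketch below (scalar_encode_48 is a hypothetical reference, not the library's code path):

#include <stdint.h>

// Scalar reference for one 48-byte round: every 3 input bytes become 4
// characters of the standard base64 alphabet, so 48 bytes in -> 64 bytes out.
static void scalar_encode_48(const uint8_t *src, uint8_t *dst)
{
	static const char alphabet[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

	for (int i = 0; i < 48; i += 3, dst += 4) {
		const uint32_t v = ((uint32_t) src[i] << 16)
		                 | ((uint32_t) src[i + 1] << 8)
		                 |  (uint32_t) src[i + 2];

		dst[0] = (uint8_t) alphabet[(v >> 18) & 0x3F];
		dst[1] = (uint8_t) alphabet[(v >> 12) & 0x3F];
		dst[2] = (uint8_t) alphabet[(v >>  6) & 0x3F];
		dst[3] = (uint8_t) alphabet[ v        & 0x3F];
	}
}

The NEON path does the same work 16 byte-triplets at a time, which is why enc_loop_neon32() adjusts *slen by 48 and *olen by 64 per round before the generic 32-bit loop and the tail code handle whatever remains.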
diff --git a/src/third-party/base64/lib/arch/neon32/enc_reshuffle.c b/src/third-party/base64/lib/arch/neon32/enc_reshuffle.c
new file mode 100644
index 0000000..d6e97cb
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon32/enc_reshuffle.c
@@ -0,0 +1,31 @@
+static inline uint8x16x4_t
+enc_reshuffle (uint8x16x3_t in)
+{
+ uint8x16x4_t out;
+
+ // Input:
+ // in[0] = a7 a6 a5 a4 a3 a2 a1 a0
+ // in[1] = b7 b6 b5 b4 b3 b2 b1 b0
+ // in[2] = c7 c6 c5 c4 c3 c2 c1 c0
+
+ // Output:
+ // out[0] = 00 00 a7 a6 a5 a4 a3 a2
+ // out[1] = 00 00 a1 a0 b7 b6 b5 b4
+ // out[2] = 00 00 b3 b2 b1 b0 c7 c6
+ // out[3] = 00 00 c5 c4 c3 c2 c1 c0
+
+ // Move the input bits to where they need to be in the outputs. Except
+ // for the first output, the high two bits are not cleared.
+ out.val[0] = vshrq_n_u8(in.val[0], 2);
+ out.val[1] = vshrq_n_u8(in.val[1], 4);
+ out.val[2] = vshrq_n_u8(in.val[2], 6);
+ out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 4);
+ out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 2);
+
+ // Clear the high two bits in the second, third and fourth output.
+ out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
+ out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
+ out.val[3] = vandq_u8(in.val[2], vdupq_n_u8(0x3F));
+
+ return out;
+}
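The vshr/vsli sequence above is a vectorized form of the usual 3-to-4 bit split. A scalar sketch of the same bit movement on one byte triplet (scalar_reshuffle is a hypothetical illustration, not library code):

#include <stdint.h>

// Split three 8-bit bytes into four 6-bit groups, mirroring the shift,
// shift-left-insert and masking steps of enc_reshuffle() above.
static void scalar_reshuffle(uint8_t a, uint8_t b, uint8_t c, uint8_t out[4])
{
	out[0] = a >> 2;                        // a7 a6 a5 a4 a3 a2
	out[1] = ((a << 4) | (b >> 4)) & 0x3F;  // a1 a0 b7 b6 b5 b4
	out[2] = ((b << 2) | (c >> 6)) & 0x3F;  // b3 b2 b1 b0 c7 c6
	out[3] = c & 0x3F;                      // c5 c4 c3 c2 c1 c0
}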
diff --git a/src/third-party/base64/lib/arch/neon32/enc_translate.c b/src/third-party/base64/lib/arch/neon32/enc_translate.c
new file mode 100644
index 0000000..e616d54
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon32/enc_translate.c
@@ -0,0 +1,57 @@
+static inline uint8x16x4_t
+enc_translate (const uint8x16x4_t in)
+{
+ // A lookup table containing the absolute offsets for all ranges:
+ const uint8x16_t lut = {
+ 65U, 71U, 252U, 252U,
+ 252U, 252U, 252U, 252U,
+ 252U, 252U, 252U, 252U,
+ 237U, 240U, 0U, 0U
+ };
+
+ const uint8x16_t offset = vdupq_n_u8(51);
+
+ uint8x16x4_t indices, mask, delta, out;
+
+ // Translate values 0..63 to the Base64 alphabet. There are five sets:
+ // #  From      To         Abs  Index    Characters
+ // 0  [0..25]   [65..90]   +65  0        ABCDEFGHIJKLMNOPQRSTUVWXYZ
+ // 1  [26..51]  [97..122]  +71  1        abcdefghijklmnopqrstuvwxyz
+ // 2  [52..61]  [48..57]    -4  [2..11]  0123456789
+ // 3  [62]      [43]       -19  12       +
+ // 4  [63]      [47]       -16  13       /
+
+ // Create LUT indices from input:
+ // the index for range #0 is right, others are 1 less than expected:
+ indices.val[0] = vqsubq_u8(in.val[0], offset);
+ indices.val[1] = vqsubq_u8(in.val[1], offset);
+ indices.val[2] = vqsubq_u8(in.val[2], offset);
+ indices.val[3] = vqsubq_u8(in.val[3], offset);
+
+ // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+ mask.val[0] = vcgtq_u8(in.val[0], vdupq_n_u8(25));
+ mask.val[1] = vcgtq_u8(in.val[1], vdupq_n_u8(25));
+ mask.val[2] = vcgtq_u8(in.val[2], vdupq_n_u8(25));
+ mask.val[3] = vcgtq_u8(in.val[3], vdupq_n_u8(25));
+
+ // Subtract -1, so add 1 to indices for range #[1..4]. All indices are
+ // now correct:
+ indices.val[0] = vsubq_u8(indices.val[0], mask.val[0]);
+ indices.val[1] = vsubq_u8(indices.val[1], mask.val[1]);
+ indices.val[2] = vsubq_u8(indices.val[2], mask.val[2]);
+ indices.val[3] = vsubq_u8(indices.val[3], mask.val[3]);
+
+ // Lookup delta values:
+ delta.val[0] = vqtbl1q_u8(lut, indices.val[0]);
+ delta.val[1] = vqtbl1q_u8(lut, indices.val[1]);
+ delta.val[2] = vqtbl1q_u8(lut, indices.val[2]);
+ delta.val[3] = vqtbl1q_u8(lut, indices.val[3]);
+
+ // Add delta values:
+ out.val[0] = vaddq_u8(in.val[0], delta.val[0]);
+ out.val[1] = vaddq_u8(in.val[1], delta.val[1]);
+ out.val[2] = vaddq_u8(in.val[2], delta.val[2]);
+ out.val[3] = vaddq_u8(in.val[3], delta.val[3]);
+
+ return out;
+}
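The saturating subtract and mask correction above boil down to a small index computation per 6-bit value. A scalar sketch of the same arithmetic, using the identical 16-entry delta table (scalar_translate is a hypothetical illustration, not library code):

#include <stdint.h>

// Map a 6-bit value (0..63) to its base64 character by adding a per-range
// delta, mirroring the vqsubq/vcgtq/vqtbl1q steps of enc_translate() above.
static uint8_t scalar_translate(uint8_t v)
{
	static const uint8_t lut[16] = {
		65, 71, 252, 252, 252, 252, 252, 252,
		252, 252, 252, 252, 237, 240, 0, 0
	};

	uint8_t idx = (v > 51) ? (uint8_t) (v - 51) : 0;  // saturating subtract of 51
	if (v > 25) {
		idx++;  // ranges #1..4 need one extra step
	}
	return (uint8_t) (v + lut[idx]);  // addition wraps modulo 256, as on NEON
}

For example, 0 maps to 'A' (+65), 26 to 'a' (+71), 52 to '0' (252 = -4 mod 256), 62 to '+' and 63 to '/'.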