Diffstat (limited to 'src/third-party/base64/lib/arch/neon32')
-rw-r--r--  src/third-party/base64/lib/arch/neon32/codec.c          |  77
-rw-r--r--  src/third-party/base64/lib/arch/neon32/dec_loop.c       | 106
-rw-r--r--  src/third-party/base64/lib/arch/neon32/enc_loop.c       | 169
-rw-r--r--  src/third-party/base64/lib/arch/neon32/enc_reshuffle.c  |  31
-rw-r--r--  src/third-party/base64/lib/arch/neon32/enc_translate.c  |  57
5 files changed, 440 insertions, 0 deletions
diff --git a/src/third-party/base64/lib/arch/neon32/codec.c b/src/third-party/base64/lib/arch/neon32/codec.c
new file mode 100644
index 0000000..a0b27f9
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon32/codec.c
@@ -0,0 +1,77 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#ifdef __arm__
+#  if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
+#    define BASE64_USE_NEON32
+#  endif
+#endif
+
+#ifdef BASE64_USE_NEON32
+#include <arm_neon.h>
+
+// Only enable inline assembly on supported compilers.
+#if defined(__GNUC__) || defined(__clang__)
+#define BASE64_NEON32_USE_ASM
+#endif
+
+static inline uint8x16_t
+vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
+{
+    // NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
+    // the NEON64 `vqtbl1q_u8` intrinsic to do 128-bit wide lookups.
+    uint8x8x2_t lut2;
+    uint8x8x2_t result;
+
+    lut2.val[0] = vget_low_u8(lut);
+    lut2.val[1] = vget_high_u8(lut);
+
+    result.val[0] = vtbl2_u8(lut2, vget_low_u8(indices));
+    result.val[1] = vtbl2_u8(lut2, vget_high_u8(indices));
+
+    return vcombine_u8(result.val[0], result.val[1]);
+}
+
+#include "../generic/32/dec_loop.c"
+#include "../generic/32/enc_loop.c"
+#include "dec_loop.c"
+#include "enc_reshuffle.c"
+#include "enc_translate.c"
+#include "enc_loop.c"
+
+#endif // BASE64_USE_NEON32
+
+// Stride size is so large on these NEON 32-bit functions
+// (48 bytes encode, 32 bytes decode) that we inline the
+// uint32 codec to stay performant on smaller inputs.
+
+BASE64_ENC_FUNCTION(neon32)
+{
+#ifdef BASE64_USE_NEON32
+    #include "../generic/enc_head.c"
+    enc_loop_neon32(&s, &slen, &o, &olen);
+    enc_loop_generic_32(&s, &slen, &o, &olen);
+    #include "../generic/enc_tail.c"
+#else
+    BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(neon32)
+{
+#ifdef BASE64_USE_NEON32
+    #include "../generic/dec_head.c"
+    dec_loop_neon32(&s, &slen, &o, &olen);
+    dec_loop_generic_32(&s, &slen, &o, &olen);
+    #include "../generic/dec_tail.c"
+#else
+    BASE64_DEC_STUB
+#endif
+}
diff --git a/src/third-party/base64/lib/arch/neon32/dec_loop.c b/src/third-party/base64/lib/arch/neon32/dec_loop.c
new file mode 100644
index 0000000..2216b39
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon32/dec_loop.c
@@ -0,0 +1,106 @@
+static inline int
+is_nonzero (const uint8x16_t v)
+{
+    uint64_t u64;
+    const uint64x2_t v64 = vreinterpretq_u64_u8(v);
+    const uint32x2_t v32 = vqmovn_u64(v64);
+
+    vst1_u64(&u64, vreinterpret_u64_u32(v32));
+    return u64 != 0;
+}
+
+static inline uint8x16_t
+delta_lookup (const uint8x16_t v)
+{
+    const uint8x8_t lut = {
+        0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
+    };
+
+    return vcombine_u8(
+        vtbl1_u8(lut, vget_low_u8(v)),
+        vtbl1_u8(lut, vget_high_u8(v)));
+}
+
+static inline uint8x16_t
+dec_loop_neon32_lane (uint8x16_t *lane)
+{
+    // See the SSSE3 decoder for an explanation of the algorithm.
+    const uint8x16_t lut_lo = {
+        0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+        0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
+    };
+
+    const uint8x16_t lut_hi = {
+        0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+        0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
+    };
+
+    const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
+    const uint8x16_t mask_2F = vdupq_n_u8(0x2F);
+
+    const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
+    const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
+    const uint8x16_t eq_2F      = vceqq_u8(*lane, mask_2F);
+
+    const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
+    const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
+
+    // Now simply add the delta values to the input:
+    *lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));
+
+    // Return the validity mask:
+    return vandq_u8(lo, hi);
+}
+
+static inline void
+dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+    if (*slen < 64) {
+        return;
+    }
+
+    // Process blocks of 64 bytes per round. Unlike the SSE codecs, no
+    // extra trailing zero bytes are written, so it is not necessary to
+    // reserve extra input bytes:
+    size_t rounds = *slen / 64;
+
+    *slen -= rounds * 64;   // 64 bytes consumed per round
+    *olen += rounds * 48;   // 48 bytes produced per round
+
+    do {
+        uint8x16x3_t dec;
+
+        // Load 64 bytes and deinterleave:
+        uint8x16x4_t str = vld4q_u8(*s);
+
+        // Decode each lane, collect a mask of invalid inputs:
+        const uint8x16_t classified
+            = dec_loop_neon32_lane(&str.val[0])
+            | dec_loop_neon32_lane(&str.val[1])
+            | dec_loop_neon32_lane(&str.val[2])
+            | dec_loop_neon32_lane(&str.val[3]);
+
+        // Check for invalid input: if any of the delta values are
+        // zero, fall back on bytewise code to do error checking and
+        // reporting:
+        if (is_nonzero(classified)) {
+            break;
+        }
+
+        // Compress four bytes into three:
+        dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
+        dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
+        dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
+
+        // Interleave and store decoded result:
+        vst3q_u8(*o, dec);
+
+        *s += 64;
+        *o += 48;
+
+    } while (--rounds > 0);
+
+    // Adjust for any rounds that were skipped:
+    *slen += rounds * 64;
+    *olen -= rounds * 48;
+}
diff --git a/src/third-party/base64/lib/arch/neon32/enc_loop.c b/src/third-party/base64/lib/arch/neon32/enc_loop.c
new file mode 100644
index 0000000..e9e8e28
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon32/enc_loop.c
@@ -0,0 +1,169 @@
+#ifdef BASE64_NEON32_USE_ASM
+static inline void
+enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
+{
+    // This function duplicates the functionality of enc_loop_neon32_inner,
+    // but entirely with inline assembly. This gives a significant speedup
+    // over using NEON intrinsics, which do not always generate very good
+    // code. The logic of the assembly is directly lifted from the
+    // intrinsics version, so it can be used as a guide to this code.
+
+    // Temporary registers, used as scratch space.
+    uint8x16_t tmp0, tmp1, tmp2, tmp3;
+    uint8x16_t mask0, mask1, mask2, mask3;
+
+    // A lookup table containing the absolute offsets for all ranges.
+    const uint8x16_t lut = {
+         65U,  71U, 252U, 252U,
+        252U, 252U, 252U, 252U,
+        252U, 252U, 252U, 252U,
+        237U, 240U,   0U,   0U
+    };
+
+    // Numeric constants.
+    const uint8x16_t n51 = vdupq_n_u8(51);
+    const uint8x16_t n25 = vdupq_n_u8(25);
+    const uint8x16_t n63 = vdupq_n_u8(63);
+
+    __asm__ (
+
+        // Load 48 bytes and deinterleave. The bytes are loaded to
+        // hard-coded registers q12, q13 and q14, to ensure that they
+        // are contiguous. Increment the source pointer.
+        "vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
+        "vld3.8 {d25, d27, d29}, [%[src]]! \n\t"
+
+        // Reshuffle the bytes using temporaries.
+        "vshr.u8 %q[t0], q12, #2 \n\t"
+        "vshr.u8 %q[t1], q13, #4 \n\t"
+        "vshr.u8 %q[t2], q14, #6 \n\t"
+        "vsli.8  %q[t1], q12, #4 \n\t"
+        "vsli.8  %q[t2], q13, #2 \n\t"
+        "vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
+        "vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
+        "vand.u8 %q[t3], q14,    %q[n63] \n\t"
+
+        // t0..t3 are the reshuffled inputs. Create LUT indices.
+        "vqsub.u8 q12, %q[t0], %q[n51] \n\t"
+        "vqsub.u8 q13, %q[t1], %q[n51] \n\t"
+        "vqsub.u8 q14, %q[t2], %q[n51] \n\t"
+        "vqsub.u8 q15, %q[t3], %q[n51] \n\t"
+
+        // Create the mask for range #0.
+        "vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
+        "vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
+        "vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
+        "vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"
+
+        // Subtract -1 to correct the LUT indices.
+        "vsub.u8 q12, %q[m0] \n\t"
+        "vsub.u8 q13, %q[m1] \n\t"
+        "vsub.u8 q14, %q[m2] \n\t"
+        "vsub.u8 q15, %q[m3] \n\t"
+
+        // Lookup the delta values.
+        "vtbl.u8 d24, {%q[lut]}, d24 \n\t"
+        "vtbl.u8 d25, {%q[lut]}, d25 \n\t"
+        "vtbl.u8 d26, {%q[lut]}, d26 \n\t"
+        "vtbl.u8 d27, {%q[lut]}, d27 \n\t"
+        "vtbl.u8 d28, {%q[lut]}, d28 \n\t"
+        "vtbl.u8 d29, {%q[lut]}, d29 \n\t"
+        "vtbl.u8 d30, {%q[lut]}, d30 \n\t"
+        "vtbl.u8 d31, {%q[lut]}, d31 \n\t"
+
+        // Add the delta values.
+        "vadd.u8 q12, %q[t0] \n\t"
+        "vadd.u8 q13, %q[t1] \n\t"
+        "vadd.u8 q14, %q[t2] \n\t"
+        "vadd.u8 q15, %q[t3] \n\t"
+
+        // Store 64 bytes and interleave. Increment the dest pointer.
+        "vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
+        "vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"
+
+        // Outputs (modified).
+        : [src] "+r"  (*s),
+          [dst] "+r"  (*o),
+          [t0]  "=&w" (tmp0),
+          [t1]  "=&w" (tmp1),
+          [t2]  "=&w" (tmp2),
+          [t3]  "=&w" (tmp3),
+          [m0]  "=&w" (mask0),
+          [m1]  "=&w" (mask1),
+          [m2]  "=&w" (mask2),
+          [m3]  "=&w" (mask3)
+
+        // Inputs (not modified).
+        : [lut] "w" (lut),
+          [n25] "w" (n25),
+          [n51] "w" (n51),
+          [n63] "w" (n63)
+
+        // Clobbers.
+        : "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+    );
+}
+#endif
+
+static inline void
+enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
+{
+#ifdef BASE64_NEON32_USE_ASM
+    enc_loop_neon32_inner_asm(s, o);
+#else
+    // Load 48 bytes and deinterleave:
+    uint8x16x3_t src = vld3q_u8(*s);
+
+    // Reshuffle:
+    uint8x16x4_t out = enc_reshuffle(src);
+
+    // Translate reshuffled bytes to the Base64 alphabet:
+    out = enc_translate(out);
+
+    // Interleave and store output:
+    vst4q_u8(*o, out);
+
+    *s += 48;
+    *o += 64;
+#endif
+}
+
+static inline void
+enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+    size_t rounds = *slen / 48;
+
+    *slen -= rounds * 48;   // 48 bytes consumed per round
+    *olen += rounds * 64;   // 64 bytes produced per round
+
+    while (rounds > 0) {
+        if (rounds >= 8) {
+            enc_loop_neon32_inner(s, o);
+            enc_loop_neon32_inner(s, o);
+            enc_loop_neon32_inner(s, o);
+            enc_loop_neon32_inner(s, o);
+            enc_loop_neon32_inner(s, o);
+            enc_loop_neon32_inner(s, o);
+            enc_loop_neon32_inner(s, o);
+            enc_loop_neon32_inner(s, o);
+            rounds -= 8;
+            continue;
+        }
+        if (rounds >= 4) {
+            enc_loop_neon32_inner(s, o);
+            enc_loop_neon32_inner(s, o);
+            enc_loop_neon32_inner(s, o);
+            enc_loop_neon32_inner(s, o);
+            rounds -= 4;
+            continue;
+        }
+        if (rounds >= 2) {
+            enc_loop_neon32_inner(s, o);
+            enc_loop_neon32_inner(s, o);
+            rounds -= 2;
+            continue;
+        }
+        enc_loop_neon32_inner(s, o);
+        break;
+    }
+}
diff --git a/src/third-party/base64/lib/arch/neon32/enc_reshuffle.c b/src/third-party/base64/lib/arch/neon32/enc_reshuffle.c
new file mode 100644
index 0000000..d6e97cb
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon32/enc_reshuffle.c
@@ -0,0 +1,31 @@
+static inline uint8x16x4_t
+enc_reshuffle (uint8x16x3_t in)
+{
+    uint8x16x4_t out;
+
+    // Input:
+    // in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
+    // in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
+    // in[2]  = c7 c6 c5 c4 c3 c2 c1 c0
+
+    // Output:
+    // out[0] = 00 00 a7 a6 a5 a4 a3 a2
+    // out[1] = 00 00 a1 a0 b7 b6 b5 b4
+    // out[2] = 00 00 b3 b2 b1 b0 c7 c6
+    // out[3] = 00 00 c5 c4 c3 c2 c1 c0
+
+    // Move the input bits to where they need to be in the outputs. Except
+    // for the first output, the high two bits are not cleared.
+    out.val[0] = vshrq_n_u8(in.val[0], 2);
+    out.val[1] = vshrq_n_u8(in.val[1], 4);
+    out.val[2] = vshrq_n_u8(in.val[2], 6);
+    out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 4);
+    out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 2);
+
+    // Clear the high two bits in the second, third and fourth output.
+    out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
+    out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
+    out.val[3] = vandq_u8(in.val[2],  vdupq_n_u8(0x3F));
+
+    return out;
+}
diff --git a/src/third-party/base64/lib/arch/neon32/enc_translate.c b/src/third-party/base64/lib/arch/neon32/enc_translate.c
new file mode 100644
index 0000000..e616d54
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon32/enc_translate.c
@@ -0,0 +1,57 @@
+static inline uint8x16x4_t
+enc_translate (const uint8x16x4_t in)
+{
+    // A lookup table containing the absolute offsets for all ranges:
+    const uint8x16_t lut = {
+         65U,  71U, 252U, 252U,
+        252U, 252U, 252U, 252U,
+        252U, 252U, 252U, 252U,
+        237U, 240U,   0U,   0U
+    };
+
+    const uint8x16_t offset = vdupq_n_u8(51);
+
+    uint8x16x4_t indices, mask, delta, out;
+
+    // Translate values 0..63 to the Base64 alphabet. There are five sets:
+    // #  From      To         Abs  Index    Characters
+    // 0  [0..25]   [65..90]   +65  0        ABCDEFGHIJKLMNOPQRSTUVWXYZ
+    // 1  [26..51]  [97..122]  +71  1        abcdefghijklmnopqrstuvwxyz
+    // 2  [52..61]  [48..57]    -4  [2..11]  0123456789
+    // 3  [62]      [43]       -19  12       +
+    // 4  [63]      [47]       -16  13       /
+
+    // Create LUT indices from input:
+    // the index for range #0 is right, others are 1 less than expected:
+    indices.val[0] = vqsubq_u8(in.val[0], offset);
+    indices.val[1] = vqsubq_u8(in.val[1], offset);
+    indices.val[2] = vqsubq_u8(in.val[2], offset);
+    indices.val[3] = vqsubq_u8(in.val[3], offset);
+
+    // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+    mask.val[0] = vcgtq_u8(in.val[0], vdupq_n_u8(25));
+    mask.val[1] = vcgtq_u8(in.val[1], vdupq_n_u8(25));
+    mask.val[2] = vcgtq_u8(in.val[2], vdupq_n_u8(25));
+    mask.val[3] = vcgtq_u8(in.val[3], vdupq_n_u8(25));
+
+    // Subtract -1, so add 1 to indices for range #[1..4]. All indices are
+    // now correct:
+    indices.val[0] = vsubq_u8(indices.val[0], mask.val[0]);
+    indices.val[1] = vsubq_u8(indices.val[1], mask.val[1]);
+    indices.val[2] = vsubq_u8(indices.val[2], mask.val[2]);
+    indices.val[3] = vsubq_u8(indices.val[3], mask.val[3]);
+
+    // Lookup delta values:
+    delta.val[0] = vqtbl1q_u8(lut, indices.val[0]);
+    delta.val[1] = vqtbl1q_u8(lut, indices.val[1]);
+    delta.val[2] = vqtbl1q_u8(lut, indices.val[2]);
+    delta.val[3] = vqtbl1q_u8(lut, indices.val[3]);
+
+    // Add delta values:
+    out.val[0] = vaddq_u8(in.val[0], delta.val[0]);
+    out.val[1] = vaddq_u8(in.val[1], delta.val[1]);
+    out.val[2] = vaddq_u8(in.val[2], delta.val[2]);
+    out.val[3] = vaddq_u8(in.val[3], delta.val[3]);
+
+    return out;
+}
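For reference, the five ranges in the enc_translate.c table correspond to the ordinary scalar Base64 translation sketched below. This is an illustrative re-derivation of what enc_reshuffle and enc_translate compute per 3-byte group, written for this note rather than taken from the diff; the helper names translate6 and encode3 are hypothetical.

#include <stdint.h>

// Map one 6-bit value (0..63) to its Base64 character, using the same
// range offsets as the table above: +65, +71, -4, -19, -16.
static uint8_t translate6(uint8_t v)
{
    if (v <= 25) return v + 65;   // range 0: 'A'..'Z'
    if (v <= 51) return v + 71;   // range 1: 'a'..'z'
    if (v <= 61) return v - 4;    // range 2: '0'..'9'
    return (v == 62) ? 43 : 47;   // ranges 3 and 4: '+' and '/'
}

// Regroup one 3-byte input into four 6-bit values (the scalar analogue of
// enc_reshuffle) and translate each of them.
static void encode3(const uint8_t in[3], uint8_t out[4])
{
    out[0] = translate6(in[0] >> 2);
    out[1] = translate6(((in[0] & 0x03) << 4) | (in[1] >> 4));
    out[2] = translate6(((in[1] & 0x0F) << 2) | (in[2] >> 6));
    out[3] = translate6(in[2] & 0x3F);
}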
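The BASE64_ENC_FUNCTION/BASE64_DEC_FUNCTION definitions in codec.c sit behind the public entry points declared in libbase64.h, which codec.c includes. A minimal round-trip sketch under that assumption follows; the base64_encode/base64_decode signatures and the BASE64_FORCE_NEON32 flag come from the upstream libbase64 public header, while the buffer sizes, the bare "libbase64.h" include path, and the test string are illustrative assumptions.

#include <stdio.h>
#include <string.h>
#include "libbase64.h"   // adjust the include path to the vendored header

int main(void)
{
    const char msg[] = "hello, neon32";
    char enc[64];        // at least 4/3 of the input length, rounded up to a multiple of 4
    char dec[64];        // at least 3/4 of the encoded length
    size_t enclen, declen;

    // Passing 0 as flags lets the library pick the fastest codec it detects
    // at runtime; BASE64_FORCE_NEON32 would force the code path added in
    // this diff, provided the library was built with NEON32 support.
    base64_encode(msg, strlen(msg), enc, &enclen, 0);
    printf("encoded: %.*s\n", (int) enclen, enc);

    // base64_decode returns 1 on success and 0 on invalid input; the SIMD
    // loop above bails out to bytewise code for error reporting.
    if (base64_decode(enc, enclen, dec, &declen, 0) != 1) {
        fprintf(stderr, "decode failed\n");
        return 1;
    }
    printf("decoded: %.*s\n", (int) declen, dec);
    return 0;
}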