diff options
Diffstat (limited to 'src/third-party/base64/lib/arch/neon64')
-rw-r--r-- | src/third-party/base64/lib/arch/neon64/codec.c | 92 | ||||
-rw-r--r-- | src/third-party/base64/lib/arch/neon64/dec_loop.c | 129 | ||||
-rw-r--r-- | src/third-party/base64/lib/arch/neon64/enc_loop.c | 133 | ||||
-rw-r--r-- | src/third-party/base64/lib/arch/neon64/enc_reshuffle.c | 31 |
4 files changed, 385 insertions, 0 deletions
diff --git a/src/third-party/base64/lib/arch/neon64/codec.c b/src/third-party/base64/lib/arch/neon64/codec.c new file mode 100644 index 0000000..fc953b2 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon64/codec.c @@ -0,0 +1,92 @@ +#include <stdint.h> +#include <stddef.h> +#include <string.h> + +#include "../../../include/libbase64.h" +#include "../../tables/tables.h" +#include "../../codecs.h" +#include "config.h" +#include "../../env.h" + +#ifdef __aarch64__ +# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON64 +# define BASE64_USE_NEON64 +# endif +#endif + +#ifdef BASE64_USE_NEON64 +#include <arm_neon.h> + +// Only enable inline assembly on supported compilers. +#if defined(__GNUC__) || defined(__clang__) +#define BASE64_NEON64_USE_ASM +#endif + +static inline uint8x16x4_t +load_64byte_table (const uint8_t *p) +{ +#ifdef BASE64_NEON64_USE_ASM + + // Force the table to be loaded into contiguous registers. GCC will not + // normally allocate contiguous registers for a `uint8x16x4_t'. These + // registers are chosen to not conflict with the ones in the enc loop. + register uint8x16_t t0 __asm__ ("v8"); + register uint8x16_t t1 __asm__ ("v9"); + register uint8x16_t t2 __asm__ ("v10"); + register uint8x16_t t3 __asm__ ("v11"); + + __asm__ ( + "ld1 {%[t0].16b, %[t1].16b, %[t2].16b, %[t3].16b}, [%[src]], #64 \n\t" + : [src] "+r" (p), + [t0] "=w" (t0), + [t1] "=w" (t1), + [t2] "=w" (t2), + [t3] "=w" (t3) + ); + + return (uint8x16x4_t) { + .val[0] = t0, + .val[1] = t1, + .val[2] = t2, + .val[3] = t3, + }; +#else + return vld1q_u8_x4(p); +#endif +} + +#include "../generic/32/dec_loop.c" +#include "../generic/64/enc_loop.c" +#include "dec_loop.c" +#include "enc_reshuffle.c" +#include "enc_loop.c" + +#endif // BASE64_USE_NEON64 + +// Stride size is so large on these NEON 64-bit functions +// (48 bytes encode, 64 bytes decode) that we inline the +// uint64 codec to stay performant on smaller inputs. + +BASE64_ENC_FUNCTION(neon64) +{ +#ifdef BASE64_USE_NEON64 + #include "../generic/enc_head.c" + enc_loop_neon64(&s, &slen, &o, &olen); + enc_loop_generic_64(&s, &slen, &o, &olen); + #include "../generic/enc_tail.c" +#else + BASE64_ENC_STUB +#endif +} + +BASE64_DEC_FUNCTION(neon64) +{ +#ifdef BASE64_USE_NEON64 + #include "../generic/dec_head.c" + dec_loop_neon64(&s, &slen, &o, &olen); + dec_loop_generic_32(&s, &slen, &o, &olen); + #include "../generic/dec_tail.c" +#else + BASE64_DEC_STUB +#endif +} diff --git a/src/third-party/base64/lib/arch/neon64/dec_loop.c b/src/third-party/base64/lib/arch/neon64/dec_loop.c new file mode 100644 index 0000000..48232f2 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon64/dec_loop.c @@ -0,0 +1,129 @@ +// The input consists of five valid character sets in the Base64 alphabet, +// which we need to map back to the 6-bit values they represent. +// There are three ranges, two singles, and then there's the rest. +// +// # From To LUT Characters +// 1 [0..42] [255] #1 invalid input +// 2 [43] [62] #1 + +// 3 [44..46] [255] #1 invalid input +// 4 [47] [63] #1 / +// 5 [48..57] [52..61] #1 0..9 +// 6 [58..63] [255] #1 invalid input +// 7 [64] [255] #2 invalid input +// 8 [65..90] [0..25] #2 A..Z +// 9 [91..96] [255] #2 invalid input +// 10 [97..122] [26..51] #2 a..z +// 11 [123..126] [255] #2 invalid input +// (12) Everything else => invalid input + +// The first LUT will use the VTBL instruction (out of range indices are set to +// 0 in destination). +static const uint8_t dec_lut1[] = { + 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, + 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, + 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 62U, 255U, 255U, 255U, 63U, + 52U, 53U, 54U, 55U, 56U, 57U, 58U, 59U, 60U, 61U, 255U, 255U, 255U, 255U, 255U, 255U, +}; + +// The second LUT will use the VTBX instruction (out of range indices will be +// unchanged in destination). Input [64..126] will be mapped to index [1..63] +// in this LUT. Index 0 means that value comes from LUT #1. +static const uint8_t dec_lut2[] = { + 0U, 255U, 0U, 1U, 2U, 3U, 4U, 5U, 6U, 7U, 8U, 9U, 10U, 11U, 12U, 13U, + 14U, 15U, 16U, 17U, 18U, 19U, 20U, 21U, 22U, 23U, 24U, 25U, 255U, 255U, 255U, 255U, + 255U, 255U, 26U, 27U, 28U, 29U, 30U, 31U, 32U, 33U, 34U, 35U, 36U, 37U, 38U, 39U, + 40U, 41U, 42U, 43U, 44U, 45U, 46U, 47U, 48U, 49U, 50U, 51U, 255U, 255U, 255U, 255U, +}; + +// All input values in range for the first look-up will be 0U in the second +// look-up result. All input values out of range for the first look-up will be +// 0U in the first look-up result. Thus, the two results can be ORed without +// conflicts. +// +// Invalid characters that are in the valid range for either look-up will be +// set to 255U in the combined result. Other invalid characters will just be +// passed through with the second look-up result (using the VTBX instruction). +// Since the second LUT is 64 bytes, those passed-through values are guaranteed +// to have a value greater than 63U. Therefore, valid characters will be mapped +// to the valid [0..63] range and all invalid characters will be mapped to +// values greater than 63. + +static inline void +dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + if (*slen < 64) { + return; + } + + // Process blocks of 64 bytes per round. Unlike the SSE codecs, no + // extra trailing zero bytes are written, so it is not necessary to + // reserve extra input bytes: + size_t rounds = *slen / 64; + + *slen -= rounds * 64; // 64 bytes consumed per round + *olen += rounds * 48; // 48 bytes produced per round + + const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1); + const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2); + + do { + const uint8x16_t offset = vdupq_n_u8(63U); + uint8x16x4_t dec1, dec2; + uint8x16x3_t dec; + + // Load 64 bytes and deinterleave: + uint8x16x4_t str = vld4q_u8((uint8_t *) *s); + + // Get indices for second LUT: + dec2.val[0] = vqsubq_u8(str.val[0], offset); + dec2.val[1] = vqsubq_u8(str.val[1], offset); + dec2.val[2] = vqsubq_u8(str.val[2], offset); + dec2.val[3] = vqsubq_u8(str.val[3], offset); + + // Get values from first LUT: + dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]); + dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]); + dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]); + dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]); + + // Get values from second LUT: + dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]); + dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]); + dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]); + dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]); + + // Get final values: + str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]); + str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]); + str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]); + str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]); + + // Check for invalid input, any value larger than 63: + const uint8x16_t classified + = vcgtq_u8(str.val[0], vdupq_n_u8(63)) + | vcgtq_u8(str.val[1], vdupq_n_u8(63)) + | vcgtq_u8(str.val[2], vdupq_n_u8(63)) + | vcgtq_u8(str.val[3], vdupq_n_u8(63)); + + // Check that all bits are zero: + if (vmaxvq_u8(classified) != 0U) { + break; + } + + // Compress four bytes into three: + dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4); + dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2); + dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3]; + + // Interleave and store decoded result: + vst3q_u8((uint8_t *) *o, dec); + + *s += 64; + *o += 48; + + } while (--rounds > 0); + + // Adjust for any rounds that were skipped: + *slen += rounds * 64; + *olen -= rounds * 48; +} diff --git a/src/third-party/base64/lib/arch/neon64/enc_loop.c b/src/third-party/base64/lib/arch/neon64/enc_loop.c new file mode 100644 index 0000000..d1862f7 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon64/enc_loop.c @@ -0,0 +1,133 @@ +#ifdef BASE64_NEON64_USE_ASM +static inline void +enc_loop_neon64_inner_asm (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc) +{ + // This function duplicates the functionality of enc_loop_neon64_inner, + // but entirely with inline assembly. This gives a significant speedup + // over using NEON intrinsics, which do not always generate very good + // code. The logic of the assembly is directly lifted from the + // intrinsics version, so it can be used as a guide to this code. + + // Temporary registers, used as scratch space. + uint8x16_t tmp0, tmp1, tmp2, tmp3; + + // Numeric constant. + const uint8x16_t n63 = vdupq_n_u8(63); + + __asm__ ( + + // Load 48 bytes and deinterleave. The bytes are loaded to + // hard-coded registers v12, v13 and v14, to ensure that they + // are contiguous. Increment the source pointer. + "ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t" + + // Reshuffle the bytes using temporaries. + "ushr %[t0].16b, v12.16b, #2 \n\t" + "ushr %[t1].16b, v13.16b, #4 \n\t" + "ushr %[t2].16b, v14.16b, #6 \n\t" + "sli %[t1].16b, v12.16b, #4 \n\t" + "sli %[t2].16b, v13.16b, #2 \n\t" + "and %[t1].16b, %[t1].16b, %[n63].16b \n\t" + "and %[t2].16b, %[t2].16b, %[n63].16b \n\t" + "and %[t3].16b, v14.16b, %[n63].16b \n\t" + + // Translate the values to the Base64 alphabet. + "tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t" + "tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t" + "tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t" + "tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t" + + // Store 64 bytes and interleave. Increment the dest pointer. + "st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t" + + // Outputs (modified). + : [src] "+r" (*s), + [dst] "+r" (*o), + [t0] "=&w" (tmp0), + [t1] "=&w" (tmp1), + [t2] "=&w" (tmp2), + [t3] "=&w" (tmp3) + + // Inputs (not modified). + : [n63] "w" (n63), + [l0] "w" (tbl_enc.val[0]), + [l1] "w" (tbl_enc.val[1]), + [l2] "w" (tbl_enc.val[2]), + [l3] "w" (tbl_enc.val[3]) + + // Clobbers. + : "v12", "v13", "v14", "v15" + ); +} +#endif + +static inline void +enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc) +{ +#ifdef BASE64_NEON64_USE_ASM + enc_loop_neon64_inner_asm(s, o, tbl_enc); +#else + // Load 48 bytes and deinterleave: + uint8x16x3_t src = vld3q_u8(*s); + + // Divide bits of three input bytes over four output bytes: + uint8x16x4_t out = enc_reshuffle(src); + + // The bits have now been shifted to the right locations; + // translate their values 0..63 to the Base64 alphabet. + // Use a 64-byte table lookup: + out.val[0] = vqtbl4q_u8(tbl_enc, out.val[0]); + out.val[1] = vqtbl4q_u8(tbl_enc, out.val[1]); + out.val[2] = vqtbl4q_u8(tbl_enc, out.val[2]); + out.val[3] = vqtbl4q_u8(tbl_enc, out.val[3]); + + // Interleave and store output: + vst4q_u8(*o, out); + + *s += 48; + *o += 64; +#endif +} + +static inline void +enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + size_t rounds = *slen / 48; + + *slen -= rounds * 48; // 48 bytes consumed per round + *olen += rounds * 64; // 64 bytes produced per round + + // Load the encoding table: + const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit); + + while (rounds > 0) { + if (rounds >= 8) { + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + rounds -= 8; + continue; + } + if (rounds >= 4) { + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + rounds -= 4; + continue; + } + if (rounds >= 2) { + enc_loop_neon64_inner(s, o, tbl_enc); + enc_loop_neon64_inner(s, o, tbl_enc); + rounds -= 2; + continue; + } + enc_loop_neon64_inner(s, o, tbl_enc); + break; + } +} diff --git a/src/third-party/base64/lib/arch/neon64/enc_reshuffle.c b/src/third-party/base64/lib/arch/neon64/enc_reshuffle.c new file mode 100644 index 0000000..ea543e0 --- /dev/null +++ b/src/third-party/base64/lib/arch/neon64/enc_reshuffle.c @@ -0,0 +1,31 @@ +static inline uint8x16x4_t +enc_reshuffle (const uint8x16x3_t in) +{ + uint8x16x4_t out; + + // Input: + // in[0] = a7 a6 a5 a4 a3 a2 a1 a0 + // in[1] = b7 b6 b5 b4 b3 b2 b1 b0 + // in[2] = c7 c6 c5 c4 c3 c2 c1 c0 + + // Output: + // out[0] = 00 00 a7 a6 a5 a4 a3 a2 + // out[1] = 00 00 a1 a0 b7 b6 b5 b4 + // out[2] = 00 00 b3 b2 b1 b0 c7 c6 + // out[3] = 00 00 c5 c4 c3 c2 c1 c0 + + // Move the input bits to where they need to be in the outputs. Except + // for the first output, the high two bits are not cleared. + out.val[0] = vshrq_n_u8(in.val[0], 2); + out.val[1] = vshrq_n_u8(in.val[1], 4); + out.val[2] = vshrq_n_u8(in.val[2], 6); + out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 4); + out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 2); + + // Clear the high two bits in the second, third and fourth output. + out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F)); + out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F)); + out.val[3] = vandq_u8(in.val[2], vdupq_n_u8(0x3F)); + + return out; +} |