summaryrefslogtreecommitdiffstats
path: root/src/third-party/base64/lib/arch/neon64/codec.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third-party/base64/lib/arch/neon64/codec.c')
-rw-r--r--src/third-party/base64/lib/arch/neon64/codec.c92
1 files changed, 92 insertions, 0 deletions
diff --git a/src/third-party/base64/lib/arch/neon64/codec.c b/src/third-party/base64/lib/arch/neon64/codec.c
new file mode 100644
index 0000000..fc953b2
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon64/codec.c
@@ -0,0 +1,92 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#ifdef __aarch64__
+# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON64
+# define BASE64_USE_NEON64
+# endif
+#endif
+
+#ifdef BASE64_USE_NEON64
+#include <arm_neon.h>
+
+// Only enable inline assembly on supported compilers.
+#if defined(__GNUC__) || defined(__clang__)
+#define BASE64_NEON64_USE_ASM
+#endif
+
+static inline uint8x16x4_t
+load_64byte_table (const uint8_t *p)
+{
+#ifdef BASE64_NEON64_USE_ASM
+
+ // Force the table to be loaded into contiguous registers. GCC will not
+ // normally allocate contiguous registers for a `uint8x16x4_t'. These
+ // registers are chosen to not conflict with the ones in the enc loop.
+ register uint8x16_t t0 __asm__ ("v8");
+ register uint8x16_t t1 __asm__ ("v9");
+ register uint8x16_t t2 __asm__ ("v10");
+ register uint8x16_t t3 __asm__ ("v11");
+
+ __asm__ (
+ "ld1 {%[t0].16b, %[t1].16b, %[t2].16b, %[t3].16b}, [%[src]], #64 \n\t"
+ : [src] "+r" (p),
+ [t0] "=w" (t0),
+ [t1] "=w" (t1),
+ [t2] "=w" (t2),
+ [t3] "=w" (t3)
+ );
+
+ return (uint8x16x4_t) {
+ .val[0] = t0,
+ .val[1] = t1,
+ .val[2] = t2,
+ .val[3] = t3,
+ };
+#else
+ return vld1q_u8_x4(p);
+#endif
+}
+
+#include "../generic/32/dec_loop.c"
+#include "../generic/64/enc_loop.c"
+#include "dec_loop.c"
+#include "enc_reshuffle.c"
+#include "enc_loop.c"
+
+#endif // BASE64_USE_NEON64
+
+// Stride size is so large on these NEON 64-bit functions
+// (48 bytes encode, 64 bytes decode) that we inline the
+// uint64 codec to stay performant on smaller inputs.
+
+BASE64_ENC_FUNCTION(neon64)
+{
+#ifdef BASE64_USE_NEON64
+ #include "../generic/enc_head.c"
+ enc_loop_neon64(&s, &slen, &o, &olen);
+ enc_loop_generic_64(&s, &slen, &o, &olen);
+ #include "../generic/enc_tail.c"
+#else
+ BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(neon64)
+{
+#ifdef BASE64_USE_NEON64
+ #include "../generic/dec_head.c"
+ dec_loop_neon64(&s, &slen, &o, &olen);
+ dec_loop_generic_32(&s, &slen, &o, &olen);
+ #include "../generic/dec_tail.c"
+#else
+ BASE64_DEC_STUB
+#endif
+}