Diffstat (limited to 'src/third-party/base64/lib/arch/neon32/enc_loop.c')
-rw-r--r--   src/third-party/base64/lib/arch/neon32/enc_loop.c   169
1 file changed, 169 insertions, 0 deletions
diff --git a/src/third-party/base64/lib/arch/neon32/enc_loop.c b/src/third-party/base64/lib/arch/neon32/enc_loop.c
new file mode 100644
index 0000000..e9e8e28
--- /dev/null
+++ b/src/third-party/base64/lib/arch/neon32/enc_loop.c
@@ -0,0 +1,169 @@
+#ifdef BASE64_NEON32_USE_ASM
+static inline void
+enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
+{
+ // This function duplicates the functionality of enc_loop_neon32_inner,
+ // but entirely with inline assembly. This gives a significant speedup
+ // over using NEON intrinsics, which do not always generate very good
+ // code. The logic of the assembly is lifted directly from the
+ // intrinsics version, which can therefore be used as a guide to
+ // this code.
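+ //
+ // As a concrete example of the transformation performed here: the
+ // 3-byte group "Man" (0x4D 0x61 0x6E) is split into the 6-bit
+ // values 19, 22, 5 and 46, which encode to the characters "TWFu".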
+
+ // Temporary registers, used as scratch space.
+ uint8x16_t tmp0, tmp1, tmp2, tmp3;
+ uint8x16_t mask0, mask1, mask2, mask3;
+
+ // A lookup table containing the absolute offsets for all ranges.
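+ // Entry 0 is used for values 0..25 ('A'..'Z', offset +65), entry 1
+ // for values 26..51 ('a'..'z', offset 'a' - 26 = 71), entries 2..11
+ // for 52..61 ('0'..'9', offset -4 = 252), entry 12 for 62 ('+',
+ // offset -19 = 237) and entry 13 for 63 ('/', offset -16 = 240).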
+ const uint8x16_t lut = {
+ 65U, 71U, 252U, 252U,
+ 252U, 252U, 252U, 252U,
+ 252U, 252U, 252U, 252U,
+ 237U, 240U, 0U, 0U
+ };
+
+ // Numeric constants.
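+ // n63 masks each byte down to its 6-bit value, n25 is the largest
+ // value in range #0 ('A'..'Z') and n51 the largest value in range
+ // #1 ('a'..'z'); both are used below to select the right LUT entry.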
+ const uint8x16_t n51 = vdupq_n_u8(51);
+ const uint8x16_t n25 = vdupq_n_u8(25);
+ const uint8x16_t n63 = vdupq_n_u8(63);
+
+ __asm__ (
+
+ // Load 48 bytes and deinterleave. The bytes are loaded to
+ // hard-coded registers q12, q13 and q14, to ensure that they
+ // are contiguous. Increment the source pointer.
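+ // After this three-way deinterleave, q12 holds the first byte of
+ // every 3-byte input group, q13 the second and q14 the third.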
+ "vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
+ "vld3.8 {d25, d27, d29}, [%[src]]! \n\t"
+
+ // Reshuffle the bytes using temporaries.
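+ // Each 3-byte group aaaaaaaa bbbbbbbb cccccccc is split into the
+ // four 6-bit values aaaaaa aabbbb bbbbcc cccccc; vsli (shift left
+ // and insert) combines the low bits of one byte with the high bits
+ // of the next.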
+ "vshr.u8 %q[t0], q12, #2 \n\t"
+ "vshr.u8 %q[t1], q13, #4 \n\t"
+ "vshr.u8 %q[t2], q14, #6 \n\t"
+ "vsli.8 %q[t1], q12, #4 \n\t"
+ "vsli.8 %q[t2], q13, #2 \n\t"
+ "vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
+ "vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
+ "vand.u8 %q[t3], q14, %q[n63] \n\t"
+
+ // t0..t3 are the reshuffled inputs. Create LUT indices.
+ "vqsub.u8 q12, %q[t0], %q[n51] \n\t"
+ "vqsub.u8 q13, %q[t1], %q[n51] \n\t"
+ "vqsub.u8 q14, %q[t2], %q[n51] \n\t"
+ "vqsub.u8 q15, %q[t3], %q[n51] \n\t"
+
+ // Create the mask for range #0: zero for values 0..25, 0xFF (-1)
+ // for all larger values.
+ "vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
+ "vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
+ "vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
+ "vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"
+
+ // Subtract the mask (-1 where set) to correct the LUT indices:
+ // values outside range #0 move one entry further into the table.
+ "vsub.u8 q12, %q[m0] \n\t"
+ "vsub.u8 q13, %q[m1] \n\t"
+ "vsub.u8 q14, %q[m2] \n\t"
+ "vsub.u8 q15, %q[m3] \n\t"
+
+ // Look up the delta values.
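+ // vtbl.u8 uses the 16 bytes of the lut register as a table and
+ // replaces each index byte with the entry it selects; the indices
+ // computed above never exceed 13, so every lookup stays in range.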
+ "vtbl.u8 d24, {%q[lut]}, d24 \n\t"
+ "vtbl.u8 d25, {%q[lut]}, d25 \n\t"
+ "vtbl.u8 d26, {%q[lut]}, d26 \n\t"
+ "vtbl.u8 d27, {%q[lut]}, d27 \n\t"
+ "vtbl.u8 d28, {%q[lut]}, d28 \n\t"
+ "vtbl.u8 d29, {%q[lut]}, d29 \n\t"
+ "vtbl.u8 d30, {%q[lut]}, d30 \n\t"
+ "vtbl.u8 d31, {%q[lut]}, d31 \n\t"
+
+ // Add the delta values.
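+ // Adding the per-range offset to each 6-bit value produces the
+ // ASCII code of its Base64 character.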
+ "vadd.u8 q12, %q[t0] \n\t"
+ "vadd.u8 q13, %q[t1] \n\t"
+ "vadd.u8 q14, %q[t2] \n\t"
+ "vadd.u8 q15, %q[t3] \n\t"
+
+ // Store 64 bytes and interleave. Increment the dest pointer.
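+ // vst4.8 writes one byte from each of q12..q15 in turn, so the
+ // four output characters of every group come out in sequence.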
+ "vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
+ "vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"
+
+ // Outputs (modified).
+ : [src] "+r" (*s),
+ [dst] "+r" (*o),
+ [t0] "=&w" (tmp0),
+ [t1] "=&w" (tmp1),
+ [t2] "=&w" (tmp2),
+ [t3] "=&w" (tmp3),
+ [m0] "=&w" (mask0),
+ [m1] "=&w" (mask1),
+ [m2] "=&w" (mask2),
+ [m3] "=&w" (mask3)
+
+ // Inputs (not modified).
+ : [lut] "w" (lut),
+ [n25] "w" (n25),
+ [n51] "w" (n51),
+ [n63] "w" (n63)
+
+ // Clobbers.
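+ // q12..q15 (d24..d31) are used above as hard-coded scratch
+ // registers by the loads, table lookups and stores.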
+ : "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
+ );
+}
+#endif
+
+static inline void
+enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
+{
+#ifdef BASE64_NEON32_USE_ASM
+ enc_loop_neon32_inner_asm(s, o);
+#else
+ // Load 48 bytes and deinterleave:
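+ // (src.val[0], src.val[1] and src.val[2] will hold the first,
+ // second and third byte of every 3-byte group, respectively.)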
+ uint8x16x3_t src = vld3q_u8(*s);
+
+ // Reshuffle:
+ uint8x16x4_t out = enc_reshuffle(src);
+
+ // Translate reshuffled bytes to the Base64 alphabet:
+ out = enc_translate(out);
+
+ // Interleave and store output:
+ vst4q_u8(*o, out);
+
+ *s += 48;
+ *o += 64;
+#endif
+}
+
+static inline void
+enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+ size_t rounds = *slen / 48;
+
+ *slen -= rounds * 48; // 48 bytes consumed per round
+ *olen += rounds * 64; // 64 bytes produced per round
+
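+ // Process the rounds in progressively smaller unrolled batches of
+ // 8, 4, 2 and finally 1 to keep per-round loop overhead low.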
+ while (rounds > 0) {
+ if (rounds >= 8) {
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ rounds -= 8;
+ continue;
+ }
+ if (rounds >= 4) {
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ rounds -= 4;
+ continue;
+ }
+ if (rounds >= 2) {
+ enc_loop_neon32_inner(s, o);
+ enc_loop_neon32_inner(s, o);
+ rounds -= 2;
+ continue;
+ }
+ enc_loop_neon32_inner(s, o);
+ break;
+ }
+}