#ifdef BASE64_NEON32_USE_ASM
static inline void
enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
{
	// This function duplicates the functionality of enc_loop_neon32_inner,
	// but entirely with inline assembly. This gives a significant speedup
	// over using NEON intrinsics, which do not always generate very good
	// code. The logic of the assembly is directly lifted from the
	// intrinsics version, so it can be used as a guide to this code.

	// Temporary registers, used as scratch space.
	uint8x16_t tmp0, tmp1, tmp2, tmp3;
	uint8x16_t mask0, mask1, mask2, mask3;

	// A lookup table containing the absolute offsets for all ranges.
	const uint8x16_t lut = {
		 65U,  71U, 252U, 252U,
		252U, 252U, 252U, 252U,
		252U, 252U, 252U, 252U,
		237U, 240U,   0U,   0U
	};

	// Numeric constants.
	const uint8x16_t n51 = vdupq_n_u8(51);
	const uint8x16_t n25 = vdupq_n_u8(25);
	const uint8x16_t n63 = vdupq_n_u8(63);

	__asm__ (

		// Load 48 bytes and deinterleave. The bytes are loaded to
		// hard-coded registers q12, q13 and q14, to ensure that they
		// are contiguous. Increment the source pointer.
		"vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
		"vld3.8 {d25, d27, d29}, [%[src]]! \n\t"

		// Reshuffle the bytes using temporaries.
		"vshr.u8 %q[t0], q12, #2         \n\t"
		"vshr.u8 %q[t1], q13, #4         \n\t"
		"vshr.u8 %q[t2], q14, #6         \n\t"
		"vsli.8  %q[t1], q12, #4         \n\t"
		"vsli.8  %q[t2], q13, #2         \n\t"
		"vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
		"vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
		"vand.u8 %q[t3], q14,    %q[n63] \n\t"

		// t0..t3 are the reshuffled inputs. Create LUT indices.
		"vqsub.u8 q12, %q[t0], %q[n51] \n\t"
		"vqsub.u8 q13, %q[t1], %q[n51] \n\t"
		"vqsub.u8 q14, %q[t2], %q[n51] \n\t"
		"vqsub.u8 q15, %q[t3], %q[n51] \n\t"

		// Create the mask for range #0.
		"vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
		"vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
		"vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
		"vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"

		// Subtract -1 to correct the LUT indices.
		"vsub.u8 q12, %q[m0] \n\t"
		"vsub.u8 q13, %q[m1] \n\t"
		"vsub.u8 q14, %q[m2] \n\t"
		"vsub.u8 q15, %q[m3] \n\t"

		// Lookup the delta values.
		"vtbl.u8 d24, {%q[lut]}, d24 \n\t"
		"vtbl.u8 d25, {%q[lut]}, d25 \n\t"
		"vtbl.u8 d26, {%q[lut]}, d26 \n\t"
		"vtbl.u8 d27, {%q[lut]}, d27 \n\t"
		"vtbl.u8 d28, {%q[lut]}, d28 \n\t"
		"vtbl.u8 d29, {%q[lut]}, d29 \n\t"
		"vtbl.u8 d30, {%q[lut]}, d30 \n\t"
		"vtbl.u8 d31, {%q[lut]}, d31 \n\t"

		// Add the delta values.
		"vadd.u8 q12, %q[t0] \n\t"
		"vadd.u8 q13, %q[t1] \n\t"
		"vadd.u8 q14, %q[t2] \n\t"
		"vadd.u8 q15, %q[t3] \n\t"

		// Store 64 bytes and interleave. Increment the dest pointer.
		"vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
		"vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"

		// Outputs (modified).
		: [src] "+r"  (*s),
		  [dst] "+r"  (*o),
		  [t0]  "=&w" (tmp0),
		  [t1]  "=&w" (tmp1),
		  [t2]  "=&w" (tmp2),
		  [t3]  "=&w" (tmp3),
		  [m0]  "=&w" (mask0),
		  [m1]  "=&w" (mask1),
		  [m2]  "=&w" (mask2),
		  [m3]  "=&w" (mask3)

		// Inputs (not modified).
		: [lut] "w" (lut),
		  [n25] "w" (n25),
		  [n51] "w" (n51),
		  [n63] "w" (n63)

		// Clobbers.
		: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
	);
}
#endif
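
// The following helper is editorial illustration, not part of the original
// file: a plain-C sketch of the reshuffle step that the vshr/vsli/vand
// instructions above (and enc_reshuffle() in the intrinsics path) perform on
// 16 lanes at once. The function name is hypothetical. Each 3-byte group is
// split into four 6-bit indices:
static inline void
enc_reshuffle_scalar_sketch (const uint8_t in[3], uint8_t out[4])
{
	out[0] = in[0] >> 2;                          // vshr.u8 t0, src0, #2
	out[1] = ((in[0] << 4) | (in[1] >> 4)) & 63;  // vshr, vsli, vand on t1
	out[2] = ((in[1] << 2) | (in[2] >> 6)) & 63;  // vshr, vsli, vand on t2
	out[3] = in[2] & 63;                          // vand.u8 t3, src2, n63
}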
: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" ); } #endif static inline void enc_loop_neon32_inner (const uint8_t **s, uint8_t **o) { #ifdef BASE64_NEON32_USE_ASM enc_loop_neon32_inner_asm(s, o); #else // Load 48 bytes and deinterleave: uint8x16x3_t src = vld3q_u8(*s); // Reshuffle: uint8x16x4_t out = enc_reshuffle(src); // Translate reshuffled bytes to the Base64 alphabet: out = enc_translate(out); // Interleave and store output: vst4q_u8(*o, out); *s += 48; *o += 64; #endif } static inline void enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) { size_t rounds = *slen / 48; *slen -= rounds * 48; // 48 bytes consumed per round *olen += rounds * 64; // 64 bytes produced per round while (rounds > 0) { if (rounds >= 8) { enc_loop_neon32_inner(s, o); enc_loop_neon32_inner(s, o); enc_loop_neon32_inner(s, o); enc_loop_neon32_inner(s, o); enc_loop_neon32_inner(s, o); enc_loop_neon32_inner(s, o); enc_loop_neon32_inner(s, o); enc_loop_neon32_inner(s, o); rounds -= 8; continue; } if (rounds >= 4) { enc_loop_neon32_inner(s, o); enc_loop_neon32_inner(s, o); enc_loop_neon32_inner(s, o); enc_loop_neon32_inner(s, o); rounds -= 4; continue; } if (rounds >= 2) { enc_loop_neon32_inner(s, o); enc_loop_neon32_inner(s, o); rounds -= 2; continue; } enc_loop_neon32_inner(s, o); break; } }