static inline int dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds) { const __m256i lut_lo = _mm256_setr_epi8( 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A, 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A); const __m256i lut_hi = _mm256_setr_epi8( 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10); const __m256i lut_roll = _mm256_setr_epi8( 0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0); const __m256i mask_2F = _mm256_set1_epi8(0x2F); // Load input: __m256i str = _mm256_loadu_si256((__m256i *) *s); // See the SSSE3 decoder for an explanation of the algorithm. const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F); const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F); const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles); const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles); if (!_mm256_testz_si256(lo, hi)) { return 0; } const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F); const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles)); // Now simply add the delta values to the input: str = _mm256_add_epi8(str, roll); // Reshuffle the input to packed 12-byte output format: str = dec_reshuffle(str); // Store the output: _mm256_storeu_si256((__m256i *) *o, str); *s += 32; *o += 24; *rounds -= 1; return 1; } static inline void dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) { if (*slen < 45) { return; } // Process blocks of 32 bytes per round. Because 8 extra zero bytes are // written after the output, ensure that there will be at least 13 // bytes of input data left to cover the gap. (11 data bytes and up to // two end-of-string markers.) size_t rounds = (*slen - 13) / 32; *slen -= rounds * 32; // 32 bytes consumed per round *olen += rounds * 24; // 24 bytes produced per round do { if (rounds >= 8) { if (dec_loop_avx2_inner(s, o, &rounds) && dec_loop_avx2_inner(s, o, &rounds) && dec_loop_avx2_inner(s, o, &rounds) && dec_loop_avx2_inner(s, o, &rounds) && dec_loop_avx2_inner(s, o, &rounds) && dec_loop_avx2_inner(s, o, &rounds) && dec_loop_avx2_inner(s, o, &rounds) && dec_loop_avx2_inner(s, o, &rounds)) { continue; } break; } if (rounds >= 4) { if (dec_loop_avx2_inner(s, o, &rounds) && dec_loop_avx2_inner(s, o, &rounds) && dec_loop_avx2_inner(s, o, &rounds) && dec_loop_avx2_inner(s, o, &rounds)) { continue; } break; } if (rounds >= 2) { if (dec_loop_avx2_inner(s, o, &rounds) && dec_loop_avx2_inner(s, o, &rounds)) { continue; } break; } dec_loop_avx2_inner(s, o, &rounds); break; } while (rounds > 0); // Adjust for any rounds that were skipped: *slen += rounds * 32; *olen -= rounds * 24; }