diff options
Diffstat (limited to 'src/libFLAC/deduplication')
4 files changed, 308 insertions, 0 deletions
/* NOTE(review): unified diff adding four new fragment files under src/libFLAC/deduplication. The text has been collapsed so that several diff lines share one physical line; a leading + marks each added source line. As laid out here the double-slash line comments of the NEON fragment run to the end of their physical line, so this text is not compilable as is. */
diff --git a/src/libFLAC/deduplication/bitreader_read_rice_signed_block.c b/src/libFLAC/deduplication/bitreader_read_rice_signed_block.c new file mode 100644 index 0000000..75ed47f --- /dev/null +++ b/src/libFLAC/deduplication/bitreader_read_rice_signed_block.c @@ -0,0 +1,143 @@ +{ /* Fragment: a bare function body; its signature is not visible here. Going by the file name it reads a block of Rice-coded signed values: on success it fills vals[0..nvals-1] from br using parameter and returns true; it returns false as soon as a helper read fails or a unary part exceeds limit. */ + /* try and get br->consumed_words and br->consumed_bits into register; + * must remember to flush them back to *br before calling other + * bitreader functions that use them, and before returning */ + uint32_t cwords, words, lsbs, msbs, x, y, limit; + uint32_t ucbits; /* keep track of the number of unconsumed bits in word */ + brword b; + int *val, *end; + + FLAC__ASSERT(0 != br); + FLAC__ASSERT(0 != br->buffer); + /* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */ + FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32); + FLAC__ASSERT(parameter < 32); + /* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */ + + limit = UINT32_MAX >> parameter; /* Maximal msbs that can occur with residual bounded to int32_t */ + + val = vals; + end = vals + nvals; + + if(parameter == 0) { + while(val < end) { + /* read the unary MSBs and end bit */ + if(!FLAC__bitreader_read_unary_unsigned(br, &msbs)) + return false; + /* Checking limit here would be overzealous: coding UINT32_MAX + * with parameter == 0 would take 4GiB */ + *val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1); /* zigzag unfold: even msbs gives msbs/2, odd msbs gives -(msbs/2)-1 */ + } + + return true; + } + + FLAC__ASSERT(parameter > 0); + + cwords = br->consumed_words; + words = br->words; /* cached copies; written back to *br before helper calls and before the final return true */ + + /* if we've not consumed up to a partial tail word... 
*/ + if(cwords >= words) { + x = 0; + goto process_tail; + } + + ucbits = FLAC__BITS_PER_WORD - br->consumed_bits; + b = br->buffer[cwords] << br->consumed_bits; /* keep unconsumed bits aligned to left */ + + while(val < end) { + /* read the unary MSBs and end bit */ + x = y = COUNT_ZERO_MSBS2(b); /* NOTE(review): COUNT_ZERO_MSBS2 is defined outside this fragment; the test below presumably means no stop bit was found in b - confirm */ + if(x == FLAC__BITS_PER_WORD) { + x = ucbits; + do { + /* didn't find stop bit yet, have to keep going... */ + cwords++; + if (cwords >= words) + goto incomplete_msbs; + b = br->buffer[cwords]; + y = COUNT_ZERO_MSBS2(b); + x += y; + } while(y == FLAC__BITS_PER_WORD); + } + b <<= y; + b <<= 1; /* account for stop bit */ + ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD; /* ucbits and x are uint32_t, so the subtraction may wrap before the modulo is taken */ + msbs = x; + + if(x > limit) + return false; /* unary part larger than limit computed above; the cached position is not written back to *br on this path */ + + /* read the binary LSBs */ + x = (FLAC__uint32)(b >> (FLAC__BITS_PER_WORD - parameter)); /* parameter < 32, so we can cast to 32-bit uint32_t */ + if(parameter <= ucbits) { + ucbits -= parameter; + b <<= parameter; + } else { + /* there are still bits left to read, they will all be in the next word */ + cwords++; + if (cwords >= words) + goto incomplete_lsbs; + b = br->buffer[cwords]; + ucbits += FLAC__BITS_PER_WORD - parameter; + x |= (FLAC__uint32)(b >> ucbits); + b <<= FLAC__BITS_PER_WORD - ucbits; + } + lsbs = x; + + /* compose the value */ + x = (msbs << parameter) | lsbs; + *val++ = (int)(x >> 1) ^ -(int)(x & 1); + + continue; /* everything from here to the end of the while body is reached only through the three labels below */ + + /* at this point we've eaten up all the whole words */ +process_tail: + do { + if(0) { /* never entered from above; only by goto incomplete_msbs, to write the cached word index back before the helper call */ +incomplete_msbs: + br->consumed_bits = 0; + br->consumed_words = cwords; + } + + /* read the unary MSBs and end bit */ + if(!FLAC__bitreader_read_unary_unsigned(br, &msbs)) + return false; + msbs += x; /* x carries the zero bits already counted before the jump here (0 when arriving via process_tail or on a repeat of this loop) */ + x = ucbits = 0; + + if(0) { /* likewise entered only by goto incomplete_lsbs */ +incomplete_lsbs: + br->consumed_bits = 0; + br->consumed_words = cwords; + } + + /* read the binary LSBs */ + if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits)) + return false; + lsbs = x | lsbs; /* x can be nonzero only when arriving via incomplete_lsbs, where it holds the LSB bits already taken from the previous word */ + + /* compose the value */ + x = (msbs << parameter) | lsbs; + *val++ = (int)(x >> 1) ^ -(int)(x 
& 1); + x = 0; + + cwords = br->consumed_words; + words = br->words; + ucbits = FLAC__BITS_PER_WORD - br->consumed_bits; + b = cwords < br->capacity ? br->buffer[cwords] << br->consumed_bits : 0; /* br->buffer[cwords] is read only while cwords is below br->capacity */ + } while(cwords >= words && val < end); /* repeat the helper-based path while cwords has not dropped below words and values remain */ + } + + if(ucbits == 0 && cwords < words) { + /* don't leave the head word with no unconsumed bits */ + cwords++; + ucbits = FLAC__BITS_PER_WORD; + } + + /* write the cached position back to *br */ br->consumed_bits = FLAC__BITS_PER_WORD - ucbits; + br->consumed_words = cwords; + + return true; +} diff --git a/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin.c b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin.c new file mode 100644 index 0000000..76419db --- /dev/null +++ b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin.c @@ -0,0 +1,14 @@ + int i, j; /* Fragment: zeroes autoc[0..MAX_LAG-1], then adds data[i]*data[i-j] into autoc[j], first for i in [0, MAX_LAG) with j <= i, then for i in [MAX_LAG, data_len) with every j below MAX_LAG. lag is only asserted against MAX_LAG and otherwise unused. NOTE(review): the first accumulation loop reads data[0..MAX_LAG-1] without consulting data_len; presumably the including code guarantees data_len >= MAX_LAG - confirm. */ + (void) lag; + FLAC__ASSERT(lag <= MAX_LAG); + + for(i = 0; i < MAX_LAG; i++) + autoc[i] = 0.0; + + for(i = 0; i < MAX_LAG; i++) + for(j = 0; j <= i; j++) + autoc[j] += (double)data[i] * (double)data[i-j]; + + for(i = MAX_LAG; i < (int)data_len; i++) + for(j = 0; j < MAX_LAG; j++) + autoc[j] += (double)data[i] * (double)data[i-j]; diff --git a/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c new file mode 100644 index 0000000..4df3aee --- /dev/null +++ b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c @@ -0,0 +1,70 @@ + int i; /* Fragment: NEON variant. Each float64x2_t holds two adjacent lags: sumK is stored to autoc[2K] and autoc[2K+1] at the end, and the d vectors hold the most recently visited samples. Vectors 4 to 6 exist only for the larger MAX_LAG values selected by the preprocessor tests. */ + float64x2_t sum0 = vdupq_n_f64(0.0f); + float64x2_t sum1 = vdupq_n_f64(0.0f); + float64x2_t sum2 = vdupq_n_f64(0.0f); + float64x2_t sum3 = vdupq_n_f64(0.0f); + float64x2_t d0 = vdupq_n_f64(0.0f); + float64x2_t d1 = vdupq_n_f64(0.0f); + float64x2_t d2 = vdupq_n_f64(0.0f); + float64x2_t d3 = vdupq_n_f64(0.0f); +#if MAX_LAG > 8 + float64x2_t sum4 = vdupq_n_f64(0.0f); + float64x2_t d4 = vdupq_n_f64(0.0f); +#endif +#if MAX_LAG > 10 + float64x2_t sum5 = vdupq_n_f64(0.0f); + float64x2_t sum6 = vdupq_n_f64(0.0f); + float64x2_t d5 = vdupq_n_f64(0.0f); + float64x2_t d6 
= vdupq_n_f64(0.0f); +#endif + float64x2_t d; + + (void)lag; + FLAC__ASSERT(lag <= MAX_LAG); + + // Loop backwards through samples from data_len - 1 down to 0 + for (i = data_len - 1; i >= 0; i--) + { + d = vdupq_n_f64(data[i]); // Create vector with 2 entries data[i] + + // The next 6 lines of code right-shift the elements through the 7 vectors d0..d6 (counts are for MAX_LAG > 10). + // The 7th line adds the newly loaded element to d0. This works like a stack, where + // data[i] is pushed onto the stack every time and the oldest element falls off +#if MAX_LAG > 10 + d6 = vextq_f64(d5,d6,1); + d5 = vextq_f64(d4,d5,1); +#endif +#if MAX_LAG > 8 + d4 = vextq_f64(d3,d4,1); +#endif + d3 = vextq_f64(d2,d3,1); + d2 = vextq_f64(d1,d2,1); + d1 = vextq_f64(d0,d1,1); + d0 = vextq_f64(d,d0,1); + + // Fused multiply-add sum += d * d0..d6 + sum0 = vfmaq_f64(sum0, d, d0); + sum1 = vfmaq_f64(sum1, d, d1); + sum2 = vfmaq_f64(sum2, d, d2); + sum3 = vfmaq_f64(sum3, d, d3); +#if MAX_LAG > 8 + sum4 = vfmaq_f64(sum4, d, d4); +#endif +#if MAX_LAG > 10 + sum5 = vfmaq_f64(sum5, d, d5); + sum6 = vfmaq_f64(sum6, d, d6); +#endif + } + + // Store sum0..sum6 in autoc[0..13] + vst1q_f64(autoc, sum0); + vst1q_f64(autoc + 2, sum1); + vst1q_f64(autoc + 4, sum2); + vst1q_f64(autoc + 6, sum3); +#if MAX_LAG > 8 + vst1q_f64(autoc + 8, sum4); +#endif +#if MAX_LAG > 10 + vst1q_f64(autoc + 10, sum5); + vst1q_f64(autoc + 12, sum6); +#endif diff --git a/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_sse2.c b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_sse2.c new file mode 100644 index 0000000..607b42f --- /dev/null +++ b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_sse2.c @@ -0,0 +1,81 @@ +/* This code is imported several times in lpc_intrin_sse2.c with different + * values for MAX_LAG. 
Comments are for MAX_LAG == 14 */ + int i; /* signed index so the descending loop below can stop once i drops below 0 */ + __m128d sum0, sum1, sum2, sum3; + __m128d d0, d1, d2, d3; +#if MAX_LAG > 8 + __m128d d4; + __m128d sum4; +#endif +#if MAX_LAG > 10 + __m128d d5, d6; + __m128d sum5, sum6; +#endif + + (void) lag; + FLAC__ASSERT(lag <= MAX_LAG); + + /* Initialize all sum and d vectors with zero */ + sum0 = _mm_setzero_pd(); + sum1 = _mm_setzero_pd(); + sum2 = _mm_setzero_pd(); + sum3 = _mm_setzero_pd(); + d0 = _mm_setzero_pd(); + d1 = _mm_setzero_pd(); + d2 = _mm_setzero_pd(); + d3 = _mm_setzero_pd(); +#if MAX_LAG > 8 + sum4 = _mm_setzero_pd(); + d4 = _mm_setzero_pd(); +#endif +#if MAX_LAG > 10 + sum5 = _mm_setzero_pd(); + sum6 = _mm_setzero_pd(); + d5 = _mm_setzero_pd(); + d6 = _mm_setzero_pd(); +#endif + + /* Loop backwards through samples from data_len-1 down to 0 */ + for(i = data_len-1; i >= 0; i--) { + __m128d d = _mm_set1_pd(data[i]); + + /* The next lines of code work like a queue. For more + * information see the lag8 version of this function */ +#if MAX_LAG > 10 + d6 = _mm_shuffle_pd(d5, d6, _MM_SHUFFLE(0,0,0,1)); + d5 = _mm_shuffle_pd(d4, d5, _MM_SHUFFLE(0,0,0,1)); +#endif +#if MAX_LAG > 8 + d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1)); +#endif + d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1)); + d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1)); + d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1)); + d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1)); + + /* sumn += d*dn */ + sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0)); + sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1)); + sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2)); + sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3)); +#if MAX_LAG > 8 + sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4)); +#endif +#if MAX_LAG > 10 + sum5 = _mm_add_pd(sum5, _mm_mul_pd(d, d5)); + sum6 = _mm_add_pd(sum6, _mm_mul_pd(d, d6)); +#endif + } + + /* Store sum0..sum6 in autoc[0..13] */ + _mm_storeu_pd(autoc, sum0); + _mm_storeu_pd(autoc+2, sum1); + _mm_storeu_pd(autoc+4, sum2); + _mm_storeu_pd(autoc+6 ,sum3); +#if MAX_LAG > 8 + 
_mm_storeu_pd(autoc+8, sum4); +#endif +#if MAX_LAG > 10 + _mm_storeu_pd(autoc+10,sum5); + _mm_storeu_pd(autoc+12,sum6); /* last pair lands in autoc[12] and autoc[13] */ +#endif |