4 files changed, 308 insertions, 0 deletions
diff --git a/src/libFLAC/deduplication/bitreader_read_rice_signed_block.c b/src/libFLAC/deduplication/bitreader_read_rice_signed_block.c
new file mode 100644
index 0000000..75ed47f
--- /dev/null
+++ b/src/libFLAC/deduplication/bitreader_read_rice_signed_block.c
@@ -0,0 +1,143 @@
+{
+	/* Decodes nvals Rice-coded signed values (unary MSBs, stop bit, then `parameter` binary LSBs) from br into vals.
+	 * br->consumed_words and br->consumed_bits are tracked in locals (cwords, and ucbits = bits per word - consumed bits);
+	 * must remember to flush them back to *br before calling other bitreader functions that use them, and before returning */
+	uint32_t cwords, words, lsbs, msbs, x, y, limit;
+	uint32_t ucbits; /* keep track of the number of unconsumed bits in word */
+	brword b;
+	int *val, *end;
+
+	FLAC__ASSERT(0 != br);
+	FLAC__ASSERT(0 != br->buffer);
+	/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
+	FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
+	FLAC__ASSERT(parameter < 32);
+	/* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
+
+	limit = UINT32_MAX >> parameter; /* Maximal msbs that can occur with residual bounded to int32_t */
+
+	val = vals;
+	end = vals + nvals;
+
+	if(parameter == 0) { /* no binary part: every value is just a unary code */
+		while(val < end) {
+			/* read the unary MSBs and end bit */
+			if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
+				return false;
+			/* Checking limit here would be overzealous: coding UINT32_MAX
+			 * with parameter == 0 would take 4GiB */
+			*val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1); /* zigzag decode: even u -> u/2, odd u -> -(u/2)-1 */
+		}
+
+		return true;
+	}
+
+	FLAC__ASSERT(parameter > 0);
+
+	cwords = br->consumed_words;
+	words = br->words;
+
+	/* if every whole word is already consumed, skip the fast loop and go straight to the tail handling */
+	if(cwords >= words) {
+		x = 0; /* no unary bits gathered yet; process_tail adds x to msbs */
+		goto process_tail;
+	}
+
+	ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+	b = br->buffer[cwords] << br->consumed_bits;  /* keep unconsumed bits aligned to left */
+
+	while(val < end) {
+		/* read the unary MSBs and end bit */
+		x = y = COUNT_ZERO_MSBS2(b);
+		if(x == FLAC__BITS_PER_WORD) {
+			x = ucbits; /* no stop bit in what is left of this word: only its ucbits unconsumed bits count as zeros */
+			do {
+				/* didn't find stop bit yet, have to keep going... */
+				cwords++;
+				if (cwords >= words)
+					goto incomplete_msbs; /* ran out of whole words; x carries the zeros counted so far */
+				b = br->buffer[cwords];
+				y = COUNT_ZERO_MSBS2(b);
+				x += y;
+			} while(y == FLAC__BITS_PER_WORD);
+		}
+		b <<= y;
+		b <<= 1; /* account for stop bit */
+		ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD; /* bits left in the current word; unsigned wrap-around plus the modulo copes with x spanning several words */
+		msbs = x;
+
+		if(x > limit) /* unary part too large for the value to fit the bound described at `limit` */
+			return false;
+
+		/* read the binary LSBs */
+		x = (FLAC__uint32)(b >> (FLAC__BITS_PER_WORD - parameter)); /* parameter < 32, so we can cast to 32-bit uint32_t */
+		if(parameter <= ucbits) {
+			ucbits -= parameter;
+			b <<= parameter;
+		} else {
+			/* there are still bits left to read, they will all be in the next word */
+			cwords++;
+			if (cwords >= words)
+				goto incomplete_lsbs; /* x holds the ucbits high bits of the LSBs already read */
+			b = br->buffer[cwords];
+			ucbits += FLAC__BITS_PER_WORD - parameter;
+			x |= (FLAC__uint32)(b >> ucbits);
+			b <<= FLAC__BITS_PER_WORD - ucbits;
+		}
+		lsbs = x;
+
+		/* compose the value */
+		x = (msbs << parameter) | lsbs;
+		*val++ = (int)(x >> 1) ^ -(int)(x & 1); /* zigzag decode, as in the parameter == 0 branch */
+
+		continue; /* the tail-handling code below is only ever entered through a goto */
+
+		/* at this point we've eaten up all the whole words */
+process_tail:
+		do {
+			if(0) { /* never entered by fall-through; only reachable via goto incomplete_msbs */
+incomplete_msbs:
+				br->consumed_bits = 0; /* flush cached position to *br before calling bitreader functions */
+				br->consumed_words = cwords;
+			}
+
+			/* read the unary MSBs and end bit */
+			if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
+				return false;
+			msbs += x; /* add zeros already counted in the fast loop (0 when entered at process_tail) */
+			x = ucbits = 0;
+
+			if(0) { /* never entered by fall-through; only reachable via goto incomplete_lsbs */
+incomplete_lsbs:
+				br->consumed_bits = 0; /* flush cached position to *br before calling bitreader functions */
+				br->consumed_words = cwords;
+			}
+
+			/* read the binary LSBs */
+			if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits)) /* only the bits not already held in x remain to be read */
+				return false;
+			lsbs = x | lsbs;
+
+			/* compose the value */
+			x = (msbs << parameter) | lsbs;
+			*val++ = (int)(x >> 1) ^ -(int)(x & 1);
+			x = 0;
+
+			cwords = br->consumed_words; /* reload the cached state from *br after the calls above */
+			words = br->words;
+			ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+			b = cwords < br->capacity ? br->buffer[cwords] << br->consumed_bits : 0; /* do not index the buffer once cwords has reached br->capacity */
+		} while(cwords >= words && val < end); /* stay in the tail path until whole words are available again or all values are decoded */
+	}
+
+	if(ucbits == 0 && cwords < words) {
+		/* don't leave the head word with no unconsumed bits */
+		cwords++;
+		ucbits = FLAC__BITS_PER_WORD;
+	}
+
+	br->consumed_bits = FLAC__BITS_PER_WORD - ucbits;
+	br->consumed_words = cwords;
+
+	return true;
+}
diff --git a/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin.c b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin.c
new file mode 100644
index 0000000..76419db
--- /dev/null
+++ b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin.c
@@ -0,0 +1,14 @@
+	int i, j; /* i indexes samples of data[], j is the lag being accumulated into autoc[j] */
+	(void) lag; /* lag is otherwise only referenced by the assertion below */
+	FLAC__ASSERT(lag <= MAX_LAG); /* the loops always compute MAX_LAG lags, so the requested lag must not exceed it */
+
+        for(i = 0; i < MAX_LAG; i++) /* clear all MAX_LAG accumulators */
+                autoc[i] = 0.0;
+
+        for(i = 0; i < MAX_LAG; i++) /* first MAX_LAG samples: only lags j <= i have a partner sample data[i-j] */
+                for(j = 0; j <= i; j++) /* NOTE(review): reads data[0..MAX_LAG-1] regardless of data_len - assumes data_len >= MAX_LAG, confirm against the including function */
+                        autoc[j] += (double)data[i] * (double)data[i-j];
+
+        for(i = MAX_LAG; i < (int)data_len; i++) /* remaining samples: each one contributes to every lag 0..MAX_LAG-1 */
+		for(j = 0; j < MAX_LAG; j++)
+	                autoc[j] += (double)data[i] * (double)data[i-j];
diff --git a/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c
new file mode 100644
index 0000000..4df3aee
--- /dev/null
+++ b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c
@@ -0,0 +1,70 @@
+	int i;
+	float64x2_t sum0 = vdupq_n_f64(0.0f);
+	float64x2_t sum1 = vdupq_n_f64(0.0f);
+	float64x2_t sum2 = vdupq_n_f64(0.0f);
+	float64x2_t sum3 = vdupq_n_f64(0.0f);
+	float64x2_t d0 = vdupq_n_f64(0.0f);
+	float64x2_t d1 = vdupq_n_f64(0.0f);
+	float64x2_t d2 = vdupq_n_f64(0.0f);
+	float64x2_t d3 = vdupq_n_f64(0.0f);
+#if MAX_LAG > 8
+	float64x2_t sum4 = vdupq_n_f64(0.0f);
+	float64x2_t d4 = vdupq_n_f64(0.0f);
+#endif
+#if MAX_LAG > 10
+	float64x2_t sum5 = vdupq_n_f64(0.0f);
+	float64x2_t sum6 = vdupq_n_f64(0.0f);
+	float64x2_t d5 = vdupq_n_f64(0.0f);
+	float64x2_t d6 = vdupq_n_f64(0.0f);
+#endif
+	float64x2_t d;
+
+	(void)lag; // lag is otherwise only referenced by the assertion below
+	FLAC__ASSERT(lag <= MAX_LAG);
+
+	// Loop backwards through the samples, from data[data_len - 1] down to data[0]
+	for (i = data_len - 1; i >= 0; i--)
+	{
+		d = vdupq_n_f64(data[i]); // Broadcast data[i] into both lanes of d
+
+		// The vextq_f64 lines below shift every element one lane to the right through the vectors
+		// d0..d6 (d0..d4 or d0..d3 for smaller MAX_LAG); the last one puts data[i] into lane 0 of d0.
+		// This works like a queue: data[i] is pushed every iteration and the oldest element falls off the end
+#if MAX_LAG > 10
+		d6 = vextq_f64(d5,d6,1);
+		d5 = vextq_f64(d4,d5,1);
+#endif
+#if MAX_LAG > 8
+		d4 = vextq_f64(d3,d4,1);
+#endif
+		d3 = vextq_f64(d2,d3,1);
+		d2 = vextq_f64(d1,d2,1);
+		d1 = vextq_f64(d0,d1,1);
+		d0 = vextq_f64(d,d0,1);
+
+		// Fused multiply-add sum += d * d0..d6, i.e. data[i] times itself and the samples pushed before it (zero before any were pushed)
+		sum0 = vfmaq_f64(sum0, d, d0);
+		sum1 = vfmaq_f64(sum1, d, d1);
+		sum2 = vfmaq_f64(sum2, d, d2);
+		sum3 = vfmaq_f64(sum3, d, d3);
+#if MAX_LAG > 8
+		sum4 = vfmaq_f64(sum4, d, d4);
+#endif
+#if MAX_LAG > 10
+		sum5 = vfmaq_f64(sum5, d, d5);
+		sum6 = vfmaq_f64(sum6, d, d6);
+#endif
+	}
+
+    // Store sum0..sum6 in autoc[0..13] (only sum0..sum4 or sum0..sum3 exist for smaller MAX_LAG); all are written regardless of lag
+    vst1q_f64(autoc, sum0);
+    vst1q_f64(autoc + 2, sum1);
+    vst1q_f64(autoc + 4, sum2);
+    vst1q_f64(autoc + 6, sum3);
+#if MAX_LAG > 8
+    vst1q_f64(autoc + 8, sum4);
+#endif
+#if MAX_LAG > 10
+    vst1q_f64(autoc + 10, sum5);
+    vst1q_f64(autoc + 12, sum6);
+#endif
diff --git a/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_sse2.c b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_sse2.c
new file mode 100644
index 0000000..607b42f
--- /dev/null
+++ b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_sse2.c
@@ -0,0 +1,81 @@
+/* This code is imported several times in lpc_intrin_sse2.c with different
+ * values for MAX_LAG. Comments are for MAX_LAG == 14 */
+	int i;
+	__m128d sum0, sum1, sum2, sum3;
+	__m128d d0, d1, d2, d3;
+#if MAX_LAG > 8
+	__m128d d4;
+	__m128d sum4;
+#endif
+#if MAX_LAG > 10
+	__m128d d5, d6;
+	__m128d sum5, sum6;
+#endif
+
+	(void) lag; /* lag is otherwise only referenced by the assertion below */
+	FLAC__ASSERT(lag <= MAX_LAG);
+
+	/* Initialize all sum vectors and all queue vectors (d0..d6) with zero */
+	sum0 = _mm_setzero_pd();
+	sum1 = _mm_setzero_pd();
+	sum2 = _mm_setzero_pd();
+	sum3 = _mm_setzero_pd();
+	d0 = _mm_setzero_pd();
+	d1 = _mm_setzero_pd();
+	d2 = _mm_setzero_pd();
+	d3 = _mm_setzero_pd();
+#if MAX_LAG > 8
+	sum4 = _mm_setzero_pd();
+	d4 = _mm_setzero_pd();
+#endif
+#if MAX_LAG > 10
+	sum5 = _mm_setzero_pd();
+	sum6 = _mm_setzero_pd();
+	d5 = _mm_setzero_pd();
+	d6 = _mm_setzero_pd();
+#endif
+
+	/* Loop backwards through the samples, from data[data_len-1] down to data[0] */
+	for(i = data_len-1; i >= 0; i--) {
+		__m128d d = _mm_set1_pd(data[i]); /* data[i] in both lanes */
+
+		/* The next lines of code work like a queue: each shuffle moves every element one lane
+		 * further and data[i] enters d0. For more information see the lag8 version of this function */
+#if MAX_LAG > 10
+		d6 = _mm_shuffle_pd(d5, d6, _MM_SHUFFLE(0,0,0,1));
+		d5 = _mm_shuffle_pd(d4, d5, _MM_SHUFFLE(0,0,0,1));
+#endif
+#if MAX_LAG > 8
+		d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1));
+#endif
+		d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1));
+		d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1));
+		d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1));
+		d0 = _mm_shuffle_pd(d,  d0, _MM_SHUFFLE(0,0,0,1));
+
+		/* sumn += d*dn, i.e. data[i] times itself and the samples queued before it (zero before any were queued) */
+		sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0));
+		sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1));
+		sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2));
+		sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3));
+#if MAX_LAG > 8
+		sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4));
+#endif
+#if MAX_LAG > 10
+		sum5 = _mm_add_pd(sum5, _mm_mul_pd(d, d5));
+		sum6 = _mm_add_pd(sum6, _mm_mul_pd(d, d6));
+#endif
+	}
+
+	/* Store sum0..sum6 in autoc[0..13]; all of them are written regardless of lag */
+	_mm_storeu_pd(autoc,   sum0);
+	_mm_storeu_pd(autoc+2, sum1);
+	_mm_storeu_pd(autoc+4, sum2);
+	_mm_storeu_pd(autoc+6 ,sum3);
+#if MAX_LAG > 8
+	_mm_storeu_pd(autoc+8, sum4);
+#endif
+#if MAX_LAG > 10
+	_mm_storeu_pd(autoc+10,sum5);
+	_mm_storeu_pd(autoc+12,sum6);
+#endif