1 files changed, 363 insertions, 0 deletions
diff --git a/dom/media/webaudio/AudioNodeEngineSSE2.cpp b/dom/media/webaudio/AudioNodeEngineSSE2.cpp
new file mode 100644
index 0000000000..18bc1484c3
--- /dev/null
+++ b/dom/media/webaudio/AudioNodeEngineSSE2.cpp
@@ -0,0 +1,363 @@
+/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* this source code form is subject to the terms of the mozilla public
+ * license, v. 2.0. if a copy of the mpl was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "AudioNodeEngineSSE2.h"
+#include "AlignmentUtils.h"
+#include <emmintrin.h>
+
+namespace mozilla {
+void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
+                                 float* aOutput, uint32_t aSize) {
+  __m128 vin0, vin1, vin2, vin3, vscaled0, vscaled1, vscaled2, vscaled3, vout0,
+      vout1, vout2, vout3, vgain;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aOutput);
+  ASSERT_MULTIPLE16(aSize);
+
+  vgain = _mm_load1_ps(&aScale);
+
+  for (unsigned i = 0; i < aSize; i += 16) {
+    vin0 = _mm_load_ps(&aInput[i]);
+    vin1 = _mm_load_ps(&aInput[i + 4]);
+    vin2 = _mm_load_ps(&aInput[i + 8]);
+    vin3 = _mm_load_ps(&aInput[i + 12]);
+
+    vscaled0 = _mm_mul_ps(vin0, vgain);
+    vscaled1 = _mm_mul_ps(vin1, vgain);
+    vscaled2 = _mm_mul_ps(vin2, vgain);
+    vscaled3 = _mm_mul_ps(vin3, vgain);
+
+    vin0 = _mm_load_ps(&aOutput[i]);
+    vin1 = _mm_load_ps(&aOutput[i + 4]);
+    vin2 = _mm_load_ps(&aOutput[i + 8]);
+    vin3 = _mm_load_ps(&aOutput[i + 12]);
+
+    vout0 = _mm_add_ps(vin0, vscaled0);
+    vout1 = _mm_add_ps(vin1, vscaled1);
+    vout2 = _mm_add_ps(vin2, vscaled2);
+    vout3 = _mm_add_ps(vin3, vscaled3);
+
+    _mm_store_ps(&aOutput[i], vout0);
+    _mm_store_ps(&aOutput[i + 4], vout1);
+    _mm_store_ps(&aOutput[i + 8], vout2);
+    _mm_store_ps(&aOutput[i + 12], vout3);
+  }
+}
+
+void AudioBlockCopyChannelWithScale_SSE(const float* aInput, float aScale,
+                                        float* aOutput) {
+  __m128 vin0, vin1, vin2, vin3, vout0, vout1, vout2, vout3;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aOutput);
+
+  __m128 vgain = _mm_load1_ps(&aScale);
+
+  for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
+    vin0 = _mm_load_ps(&aInput[i]);
+    vin1 = _mm_load_ps(&aInput[i + 4]);
+    vin2 = _mm_load_ps(&aInput[i + 8]);
+    vin3 = _mm_load_ps(&aInput[i + 12]);
+    vout0 = _mm_mul_ps(vin0, vgain);
+    vout1 = _mm_mul_ps(vin1, vgain);
+    vout2 = _mm_mul_ps(vin2, vgain);
+    vout3 = _mm_mul_ps(vin3, vgain);
+    _mm_store_ps(&aOutput[i], vout0);
+    _mm_store_ps(&aOutput[i + 4], vout1);
+    _mm_store_ps(&aOutput[i + 8], vout2);
+    _mm_store_ps(&aOutput[i + 12], vout3);
+  }
+}
+
+void AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
+                                        const float aScale[WEBAUDIO_BLOCK_SIZE],
+                                        float aOutput[WEBAUDIO_BLOCK_SIZE]) {
+  __m128 vin0, vin1, vin2, vin3, vscaled0, vscaled1, vscaled2, vscaled3, vout0,
+      vout1, vout2, vout3;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aScale);
+  ASSERT_ALIGNED16(aOutput);
+
+  for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
+    vscaled0 = _mm_load_ps(&aScale[i]);
+    vscaled1 = _mm_load_ps(&aScale[i + 4]);
+    vscaled2 = _mm_load_ps(&aScale[i + 8]);
+    vscaled3 = _mm_load_ps(&aScale[i + 12]);
+
+    vin0 = _mm_load_ps(&aInput[i]);
+    vin1 = _mm_load_ps(&aInput[i + 4]);
+    vin2 = _mm_load_ps(&aInput[i + 8]);
+    vin3 = _mm_load_ps(&aInput[i + 12]);
+
+    vout0 = _mm_mul_ps(vin0, vscaled0);
+    vout1 = _mm_mul_ps(vin1, vscaled1);
+    vout2 = _mm_mul_ps(vin2, vscaled2);
+    vout3 = _mm_mul_ps(vin3, vscaled3);
+
+    _mm_store_ps(&aOutput[i], vout0);
+    _mm_store_ps(&aOutput[i + 4], vout1);
+    _mm_store_ps(&aOutput[i + 8], vout2);
+    _mm_store_ps(&aOutput[i + 12], vout3);
+  }
+}
+
+void AudioBufferInPlaceScale_SSE(float* aBlock, float aScale, uint32_t aSize) {
+  __m128 vout0, vout1, vout2, vout3, vin0, vin1, vin2, vin3;
+
+  ASSERT_ALIGNED16(aBlock);
+  ASSERT_MULTIPLE16(aSize);
+
+  __m128 vgain = _mm_load1_ps(&aScale);
+
+  for (unsigned i = 0; i < aSize; i += 16) {
+    vin0 = _mm_load_ps(&aBlock[i]);
+    vin1 = _mm_load_ps(&aBlock[i + 4]);
+    vin2 = _mm_load_ps(&aBlock[i + 8]);
+    vin3 = _mm_load_ps(&aBlock[i + 12]);
+    vout0 = _mm_mul_ps(vin0, vgain);
+    vout1 = _mm_mul_ps(vin1, vgain);
+    vout2 = _mm_mul_ps(vin2, vgain);
+    vout3 = _mm_mul_ps(vin3, vgain);
+    _mm_store_ps(&aBlock[i], vout0);
+    _mm_store_ps(&aBlock[i + 4], vout1);
+    _mm_store_ps(&aBlock[i + 8], vout2);
+    _mm_store_ps(&aBlock[i + 12], vout3);
+  }
+}
+
+void AudioBufferInPlaceScale_SSE(float* aBlock, float* aScale, uint32_t aSize) {
+  __m128 vout0, vout1, vout2, vout3, vgain0, vgain1, vgain2, vgain3, vin0, vin1,
+      vin2, vin3;
+
+  ASSERT_ALIGNED16(aBlock);
+  ASSERT_MULTIPLE16(aSize);
+
+  for (unsigned i = 0; i < aSize; i += 16) {
+    vin0 = _mm_load_ps(&aBlock[i]);
+    vin1 = _mm_load_ps(&aBlock[i + 4]);
+    vin2 = _mm_load_ps(&aBlock[i + 8]);
+    vin3 = _mm_load_ps(&aBlock[i + 12]);
+    vgain0 = _mm_load_ps(&aScale[i]);
+    vgain1 = _mm_load_ps(&aScale[i + 4]);
+    vgain2 = _mm_load_ps(&aScale[i + 8]);
+    vgain3 = _mm_load_ps(&aScale[i + 12]);
+    vout0 = _mm_mul_ps(vin0, vgain0);
+    vout1 = _mm_mul_ps(vin1, vgain1);
+    vout2 = _mm_mul_ps(vin2, vgain2);
+    vout3 = _mm_mul_ps(vin3, vgain3);
+    _mm_store_ps(&aBlock[i], vout0);
+    _mm_store_ps(&aBlock[i + 4], vout1);
+    _mm_store_ps(&aBlock[i + 8], vout2);
+    _mm_store_ps(&aBlock[i + 12], vout3);
+  }
+}
+
+void AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
+                                     const float aInputR[WEBAUDIO_BLOCK_SIZE],
+                                     float aGainL, float aGainR,
+                                     bool aIsOnTheLeft,
+                                     float aOutputL[WEBAUDIO_BLOCK_SIZE],
+                                     float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
+  __m128 vinl0, vinr0, vinl1, vinr1, vout0, vout1, vscaled0, vscaled1, vgainl,
+      vgainr;
+
+  ASSERT_ALIGNED16(aInputL);
+  ASSERT_ALIGNED16(aInputR);
+  ASSERT_ALIGNED16(aOutputL);
+  ASSERT_ALIGNED16(aOutputR);
+
+  vgainl = _mm_load1_ps(&aGainL);
+  vgainr = _mm_load1_ps(&aGainR);
+
+  if (aIsOnTheLeft) {
+    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+      vinl0 = _mm_load_ps(&aInputL[i]);
+      vinr0 = _mm_load_ps(&aInputR[i]);
+      vinl1 = _mm_load_ps(&aInputL[i + 4]);
+      vinr1 = _mm_load_ps(&aInputR[i + 4]);
+
+      /* left channel : aOutputL  = aInputL + aInputR * gainL */
+      vscaled0 = _mm_mul_ps(vinr0, vgainl);
+      vscaled1 = _mm_mul_ps(vinr1, vgainl);
+      vout0 = _mm_add_ps(vscaled0, vinl0);
+      vout1 = _mm_add_ps(vscaled1, vinl1);
+      _mm_store_ps(&aOutputL[i], vout0);
+      _mm_store_ps(&aOutputL[i + 4], vout1);
+
+      /* right channel : aOutputR = aInputR * gainR */
+      vscaled0 = _mm_mul_ps(vinr0, vgainr);
+      vscaled1 = _mm_mul_ps(vinr1, vgainr);
+      _mm_store_ps(&aOutputR[i], vscaled0);
+      _mm_store_ps(&aOutputR[i + 4], vscaled1);
+    }
+  } else {
+    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+      vinl0 = _mm_load_ps(&aInputL[i]);
+      vinr0 = _mm_load_ps(&aInputR[i]);
+      vinl1 = _mm_load_ps(&aInputL[i + 4]);
+      vinr1 = _mm_load_ps(&aInputR[i + 4]);
+
+      /* left channel : aInputL * gainL */
+      vscaled0 = _mm_mul_ps(vinl0, vgainl);
+      vscaled1 = _mm_mul_ps(vinl1, vgainl);
+      _mm_store_ps(&aOutputL[i], vscaled0);
+      _mm_store_ps(&aOutputL[i + 4], vscaled1);
+
+      /* right channel: aOutputR = aInputR + aInputL * gainR */
+      vscaled0 = _mm_mul_ps(vinl0, vgainr);
+      vscaled1 = _mm_mul_ps(vinl1, vgainr);
+      vout0 = _mm_add_ps(vscaled0, vinr0);
+      vout1 = _mm_add_ps(vscaled1, vinr1);
+      _mm_store_ps(&aOutputR[i], vout0);
+      _mm_store_ps(&aOutputR[i + 4], vout1);
+    }
+  }
+}
+
+void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
+                               float* aOutput, uint32_t aSize) {
+  unsigned i;
+  __m128 in0, in1, in2, in3, outreal0, outreal1, outreal2, outreal3, outimag0,
+      outimag1, outimag2, outimag3;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aScale);
+  ASSERT_ALIGNED16(aOutput);
+  ASSERT_MULTIPLE16(aSize);
+
+  for (i = 0; i < aSize * 2; i += 16) {
+    in0 = _mm_load_ps(&aInput[i]);
+    in1 = _mm_load_ps(&aInput[i + 4]);
+    in2 = _mm_load_ps(&aInput[i + 8]);
+    in3 = _mm_load_ps(&aInput[i + 12]);
+
+    outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
+    outreal2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
+
+    in0 = _mm_load_ps(&aScale[i]);
+    in1 = _mm_load_ps(&aScale[i + 4]);
+    in2 = _mm_load_ps(&aScale[i + 8]);
+    in3 = _mm_load_ps(&aScale[i + 12]);
+
+    outreal1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
+    outreal3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
+
+    in0 = _mm_sub_ps(_mm_mul_ps(outreal0, outreal1),
+                     _mm_mul_ps(outimag0, outimag1));
+    in1 = _mm_add_ps(_mm_mul_ps(outreal0, outimag1),
+                     _mm_mul_ps(outimag0, outreal1));
+    in2 = _mm_sub_ps(_mm_mul_ps(outreal2, outreal3),
+                     _mm_mul_ps(outimag2, outimag3));
+    in3 = _mm_add_ps(_mm_mul_ps(outreal2, outimag3),
+                     _mm_mul_ps(outimag2, outreal3));
+
+    outreal0 = _mm_unpacklo_ps(in0, in1);
+    outreal1 = _mm_unpackhi_ps(in0, in1);
+    outreal2 = _mm_unpacklo_ps(in2, in3);
+    outreal3 = _mm_unpackhi_ps(in2, in3);
+
+    _mm_store_ps(&aOutput[i], outreal0);
+    _mm_store_ps(&aOutput[i + 4], outreal1);
+    _mm_store_ps(&aOutput[i + 8], outreal2);
+    _mm_store_ps(&aOutput[i + 12], outreal3);
+  }
+}
+
+float AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength) {
+  unsigned i;
+  __m128 in0, in1, in2, in3, acc0, acc1, acc2, acc3;
+  float out[4];
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_MULTIPLE16(aLength);
+
+  acc0 = _mm_setzero_ps();
+  acc1 = _mm_setzero_ps();
+  acc2 = _mm_setzero_ps();
+  acc3 = _mm_setzero_ps();
+
+  for (i = 0; i < aLength; i += 16) {
+    in0 = _mm_load_ps(&aInput[i]);
+    in1 = _mm_load_ps(&aInput[i + 4]);
+    in2 = _mm_load_ps(&aInput[i + 8]);
+    in3 = _mm_load_ps(&aInput[i + 12]);
+
+    in0 = _mm_mul_ps(in0, in0);
+    in1 = _mm_mul_ps(in1, in1);
+    in2 = _mm_mul_ps(in2, in2);
+    in3 = _mm_mul_ps(in3, in3);
+
+    acc0 = _mm_add_ps(acc0, in0);
+    acc1 = _mm_add_ps(acc1, in1);
+    acc2 = _mm_add_ps(acc2, in2);
+    acc3 = _mm_add_ps(acc3, in3);
+  }
+
+  acc0 = _mm_add_ps(acc0, acc1);
+  acc0 = _mm_add_ps(acc0, acc2);
+  acc0 = _mm_add_ps(acc0, acc3);
+
+  _mm_store_ps(out, acc0);
+
+  return out[0] + out[1] + out[2] + out[3];
+}
+
+void NaNToZeroInPlace_SSE(float* aSamples, size_t aCount) {
+  __m128 vin0, vin1, vin2, vin3;
+  __m128 vmask0, vmask1, vmask2, vmask3;
+  __m128 vout0, vout1, vout2, vout3;
+
+  float* samplesAligned16 = ALIGNED16(aSamples);
+  size_t leadingElementsScalar =
+      std::min(static_cast<size_t>(samplesAligned16 - aSamples), aCount);
+  size_t remainingElements = aCount - leadingElementsScalar;
+  size_t vectoredEnd = aCount - remainingElements % 16;
+
+  MOZ_ASSERT(!((vectoredEnd - leadingElementsScalar) % 16));
+
+  size_t i = 0;
+  for (; i < leadingElementsScalar; i++) {
+    if (aSamples[i] != aSamples[i]) {
+      aSamples[i] = 0.0;
+    }
+  }
+
+  ASSERT_ALIGNED16(&aSamples[i]);
+
+  for (; i < vectoredEnd; i += 16) {
+    vin0 = _mm_load_ps(&aSamples[i + 0]);
+    vin1 = _mm_load_ps(&aSamples[i + 4]);
+    vin2 = _mm_load_ps(&aSamples[i + 8]);
+    vin3 = _mm_load_ps(&aSamples[i + 12]);
+
+    vmask0 = _mm_cmpord_ps(vin0, vin0);
+    vmask1 = _mm_cmpord_ps(vin1, vin1);
+    vmask2 = _mm_cmpord_ps(vin2, vin2);
+    vmask3 = _mm_cmpord_ps(vin3, vin3);
+
+    vout0 = _mm_and_ps(vin0, vmask0);
+    vout1 = _mm_and_ps(vin1, vmask1);
+    vout2 = _mm_and_ps(vin2, vmask2);
+    vout3 = _mm_and_ps(vin3, vmask3);
+
+    _mm_store_ps(&aSamples[i + 0], vout0);
+    _mm_store_ps(&aSamples[i + 4], vout1);
+    _mm_store_ps(&aSamples[i + 8], vout2);
+    _mm_store_ps(&aSamples[i + 12], vout3);
+  }
+  for (; i < aCount; i++) {
+    if (aSamples[i] != aSamples[i]) {
+      aSamples[i] = 0.0;
+    }
+  }
+}
+
+}  // namespace mozilla