16 files changed, 5239 insertions, 0 deletions
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.cc
new file mode 100644
index 0000000000..3d766929c6
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.cc
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2016 The WebRTC Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/utility/block_mean_calculator.h"
+
+#include "rtc_base/checks.h"
+
+namespace webrtc {
+
+BlockMeanCalculator::BlockMeanCalculator(size_t block_length)
+    : block_length_(block_length),
+      count_(0),
+      sum_(0.0),
+      mean_(0.0) {
+  RTC_DCHECK(block_length_ != 0);
+}
+
+void BlockMeanCalculator::Reset() {
+  Clear();
+  mean_ = 0.0;
+}
+
+void BlockMeanCalculator::AddValue(float value) {
+  sum_ += value;
+  ++count_;
+  if (count_ == block_length_) {
+    mean_ = sum_ / block_length_;
+    Clear();
+  }
+}
+
+bool BlockMeanCalculator::EndOfBlock() const {
+  return count_ == 0;
+}
+
+float BlockMeanCalculator::GetLatestMean() const {
+  return mean_;
+}
+
+// Flush all samples added.
+void BlockMeanCalculator::Clear() {
+  count_ = 0;
+  sum_ = 0.0;
+}
+
+}  // namespace webrtc
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.h
new file mode 100644
index 0000000000..cfa7cfbeba
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.h
@@ -0,0 +1,52 @@
+/*
+ *  Copyright 2016 The WebRTC Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_UTILITY_BLOCK_MEAN_CALCULATOR_H_
+#define MODULES_AUDIO_PROCESSING_UTILITY_BLOCK_MEAN_CALCULATOR_H_
+
+#include <stddef.h>
+
+#include "rtc_base/constructormagic.h"
+
+namespace webrtc {
+
+// BlockMeanCalculator calculates the mean of a block of values. Values are
+// added one after another, and the mean is updated at the end of every block.
+class BlockMeanCalculator {
+ public:
+  explicit BlockMeanCalculator(size_t block_length);
+
+  // Reset.
+  void Reset();
+
+  // Add one value to the sequence.
+  void AddValue(float value);
+
+  // Return whether the latest added value was at the end of a block.
+  bool EndOfBlock() const;
+
+  // Return the latest mean.
+  float GetLatestMean() const;
+
+ private:
+  // Clear all values added.
+  void Clear();
+
+  const size_t block_length_;
+  size_t count_;
+  float sum_;
+  float mean_;
+
+  RTC_DISALLOW_COPY_AND_ASSIGN(BlockMeanCalculator);
+};
+
+}  // namespace webrtc
+
+#endif  // MODULES_AUDIO_PROCESSING_UTILITY_BLOCK_MEAN_CALCULATOR_H_
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator_unittest.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator_unittest.cc
new file mode 100644
index 0000000000..1f4ebf1b67
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator_unittest.cc
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/utility/block_mean_calculator.h"
+#include "test/gtest.h"
+
+namespace webrtc {
+
+TEST(MeanCalculatorTest, Correctness) {
+  const size_t kBlockLength = 10;
+  BlockMeanCalculator mean_calculator(kBlockLength);
+  size_t i = 0;
+  float reference = 0.0;
+
+  for (; i < kBlockLength - 1; ++i) {
+    mean_calculator.AddValue(static_cast<float>(i));
+    EXPECT_FALSE(mean_calculator.EndOfBlock());
+  }
+  mean_calculator.AddValue(static_cast<float>(i++));
+  EXPECT_TRUE(mean_calculator.EndOfBlock());
+
+  for (; i < 3 * kBlockLength; ++i) {
+    const bool end_of_block = i % kBlockLength == 0;
+    if (end_of_block) {
+      // Sum of (i - kBlockLength) ... (i - 1)
+      reference = i - 0.5 * (1 + kBlockLength);
+    }
+    EXPECT_EQ(mean_calculator.EndOfBlock(), end_of_block);
+    EXPECT_EQ(reference, mean_calculator.GetLatestMean());
+    mean_calculator.AddValue(static_cast<float>(i));
+  }
+}
+
+TEST(MeanCalculatorTest, Reset) {
+  const size_t kBlockLength = 10;
+  BlockMeanCalculator mean_calculator(kBlockLength);
+  for (size_t i = 0; i < kBlockLength - 1; ++i) {
+    mean_calculator.AddValue(static_cast<float>(i));
+  }
+  mean_calculator.Reset();
+  size_t i = 0;
+  for (; i < kBlockLength - 1; ++i) {
+    mean_calculator.AddValue(static_cast<float>(i));
+    EXPECT_FALSE(mean_calculator.EndOfBlock());
+  }
+  mean_calculator.AddValue(static_cast<float>(i));
+  EXPECT_TRUE(mean_calculator.EndOfBlock());
+  EXPECT_EQ(mean_calculator.GetLatestMean(), 0.5 * (kBlockLength - 1));
+}
+
+}  // namespace webrtc
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.cc
new file mode 100644
index 0000000000..871b541651
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.cc
@@ -0,0 +1,703 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/utility/delay_estimator.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+
+#include "rtc_base/checks.h"
+
+// Number of right shifts for scaling is linearly depending on number of bits in
+// the far-end binary spectrum.
+static const int kShiftsAtZero = 13;  // Right shifts at zero binary spectrum.
+static const int kShiftsLinearSlope = 3;
+
+static const int32_t kProbabilityOffset = 1024;  // 2 in Q9.
+static const int32_t kProbabilityLowerLimit = 8704;  // 17 in Q9.
+static const int32_t kProbabilityMinSpread = 2816;  // 5.5 in Q9.
+
+// Robust validation settings
+static const float kHistogramMax = 3000.f;
+static const float kLastHistogramMax = 250.f;
+static const float kMinHistogramThreshold = 1.5f;
+static const int kMinRequiredHits = 10;
+static const int kMaxHitsWhenPossiblyNonCausal = 10;
+static const int kMaxHitsWhenPossiblyCausal = 1000;
+static const float kQ14Scaling = 1.f / (1 << 14);  // Scaling by 2^14 to get Q0.
+static const float kFractionSlope = 0.05f;
+static const float kMinFractionWhenPossiblyCausal = 0.5f;
+static const float kMinFractionWhenPossiblyNonCausal = 0.25f;
+
+// Counts and returns number of bits of a 32-bit word.
+static int BitCount(uint32_t u32) {
+  uint32_t tmp = u32 - ((u32 >> 1) & 033333333333) -
+      ((u32 >> 2) & 011111111111);
+  tmp = ((tmp + (tmp >> 3)) & 030707070707);
+  tmp = (tmp + (tmp >> 6));
+  tmp = (tmp + (tmp >> 12) + (tmp >> 24)) & 077;
+
+  return ((int) tmp);
+}
+
+// Compares the |binary_vector| with all rows of the |binary_matrix| and counts
+// per row the number of times they have the same value.
+//
+// Inputs:
+//      - binary_vector     : binary "vector" stored in a long
+//      - binary_matrix     : binary "matrix" stored as a vector of long
+//      - matrix_size       : size of binary "matrix"
+//
+// Output:
+//      - bit_counts        : "Vector" stored as a long, containing for each
+//                            row the number of times the matrix row and the
+//                            input vector have the same value
+//
+static void BitCountComparison(uint32_t binary_vector,
+                               const uint32_t* binary_matrix,
+                               int matrix_size,
+                               int32_t* bit_counts) {
+  int n = 0;
+
+  // Compare |binary_vector| with all rows of the |binary_matrix|
+  for (; n < matrix_size; n++) {
+    bit_counts[n] = (int32_t) BitCount(binary_vector ^ binary_matrix[n]);
+  }
+}
+
+// Collects necessary statistics for the HistogramBasedValidation().  This
+// function has to be called prior to calling HistogramBasedValidation().  The
+// statistics updated and used by the HistogramBasedValidation() are:
+//  1. the number of |candidate_hits|, which states for how long we have had the
+//     same |candidate_delay|
+//  2. the |histogram| of candidate delays over time.  This histogram is
+//     weighted with respect to a reliability measure and time-varying to cope
+//     with possible delay shifts.
+// For further description see commented code.
+//
+// Inputs:
+//  - candidate_delay   : The delay to validate.
+//  - valley_depth_q14  : The cost function has a valley/minimum at the
+//                        |candidate_delay| location.  |valley_depth_q14| is the
+//                        cost function difference between the minimum and
+//                        maximum locations.  The value is in the Q14 domain.
+//  - valley_level_q14  : Is the cost function value at the minimum, in Q14.
+static void UpdateRobustValidationStatistics(BinaryDelayEstimator* self,
+                                             int candidate_delay,
+                                             int32_t valley_depth_q14,
+                                             int32_t valley_level_q14) {
+  const float valley_depth = valley_depth_q14 * kQ14Scaling;
+  float decrease_in_last_set = valley_depth;
+  const int max_hits_for_slow_change = (candidate_delay < self->last_delay) ?
+      kMaxHitsWhenPossiblyNonCausal : kMaxHitsWhenPossiblyCausal;
+  int i = 0;
+
+  RTC_DCHECK_EQ(self->history_size, self->farend->history_size);
+  // Reset |candidate_hits| if we have a new candidate.
+  if (candidate_delay != self->last_candidate_delay) {
+    self->candidate_hits = 0;
+    self->last_candidate_delay = candidate_delay;
+  }
+  self->candidate_hits++;
+
+  // The |histogram| is updated differently across the bins.
+  // 1. The |candidate_delay| histogram bin is increased with the
+  //    |valley_depth|, which is a simple measure of how reliable the
+  //    |candidate_delay| is.  The histogram is not increased above
+  //    |kHistogramMax|.
+  self->histogram[candidate_delay] += valley_depth;
+  if (self->histogram[candidate_delay] > kHistogramMax) {
+    self->histogram[candidate_delay] = kHistogramMax;
+  }
+  // 2. The histogram bins in the neighborhood of |candidate_delay| are
+  //    unaffected.  The neighborhood is defined as x + {-2, -1, 0, 1}.
+  // 3. The histogram bins in the neighborhood of |last_delay| are decreased
+  //    with |decrease_in_last_set|.  This value equals the difference between
+  //    the cost function values at the locations |candidate_delay| and
+  //    |last_delay| until we reach |max_hits_for_slow_change| consecutive hits
+  //    at the |candidate_delay|.  If we exceed this amount of hits the
+  //    |candidate_delay| is a "potential" candidate and we start decreasing
+  //    these histogram bins more rapidly with |valley_depth|.
+  if (self->candidate_hits < max_hits_for_slow_change) {
+    decrease_in_last_set = (self->mean_bit_counts[self->compare_delay] -
+        valley_level_q14) * kQ14Scaling;
+  }
+  // 4. All other bins are decreased with |valley_depth|.
+  // TODO(bjornv): Investigate how to make this loop more efficient.  Split up
+  // the loop?  Remove parts that doesn't add too much.
+  for (i = 0; i < self->history_size; ++i) {
+    int is_in_last_set = (i >= self->last_delay - 2) &&
+        (i <= self->last_delay + 1) && (i != candidate_delay);
+    int is_in_candidate_set = (i >= candidate_delay - 2) &&
+        (i <= candidate_delay + 1);
+    self->histogram[i] -= decrease_in_last_set * is_in_last_set +
+        valley_depth * (!is_in_last_set && !is_in_candidate_set);
+    // 5. No histogram bin can go below 0.
+    if (self->histogram[i] < 0) {
+      self->histogram[i] = 0;
+    }
+  }
+}
+
+// Validates the |candidate_delay|, estimated in WebRtc_ProcessBinarySpectrum(),
+// based on a mix of counting concurring hits with a modified histogram
+// of recent delay estimates.  In brief a candidate is valid (returns 1) if it
+// is the most likely according to the histogram.  There are a couple of
+// exceptions that are worth mentioning:
+//  1. If the |candidate_delay| < |last_delay| it can be that we are in a
+//     non-causal state, breaking a possible echo control algorithm.  Hence, we
+//     open up for a quicker change by allowing the change even if the
+//     |candidate_delay| is not the most likely one according to the histogram.
+//  2. There's a minimum number of hits (kMinRequiredHits) and the histogram
+//     value has to reached a minimum (kMinHistogramThreshold) to be valid.
+//  3. The action is also depending on the filter length used for echo control.
+//     If the delay difference is larger than what the filter can capture, we
+//     also move quicker towards a change.
+// For further description see commented code.
+//
+// Input:
+//  - candidate_delay     : The delay to validate.
+//
+// Return value:
+//  - is_histogram_valid  : 1 - The |candidate_delay| is valid.
+//                          0 - Otherwise.
+static int HistogramBasedValidation(const BinaryDelayEstimator* self,
+                                    int candidate_delay) {
+  float fraction = 1.f;
+  float histogram_threshold = self->histogram[self->compare_delay];
+  const int delay_difference = candidate_delay - self->last_delay;
+  int is_histogram_valid = 0;
+
+  // The histogram based validation of |candidate_delay| is done by comparing
+  // the |histogram| at bin |candidate_delay| with a |histogram_threshold|.
+  // This |histogram_threshold| equals a |fraction| of the |histogram| at bin
+  // |last_delay|.  The |fraction| is a piecewise linear function of the
+  // |delay_difference| between the |candidate_delay| and the |last_delay|
+  // allowing for a quicker move if
+  //  i) a potential echo control filter can not handle these large differences.
+  // ii) keeping |last_delay| instead of updating to |candidate_delay| could
+  //     force an echo control into a non-causal state.
+  // We further require the histogram to have reached a minimum value of
+  // |kMinHistogramThreshold|.  In addition, we also require the number of
+  // |candidate_hits| to be more than |kMinRequiredHits| to remove spurious
+  // values.
+
+  // Calculate a comparison histogram value (|histogram_threshold|) that is
+  // depending on the distance between the |candidate_delay| and |last_delay|.
+  // TODO(bjornv): How much can we gain by turning the fraction calculation
+  // into tables?
+  if (delay_difference > self->allowed_offset) {
+    fraction = 1.f - kFractionSlope * (delay_difference - self->allowed_offset);
+    fraction = (fraction > kMinFractionWhenPossiblyCausal ? fraction :
+        kMinFractionWhenPossiblyCausal);
+  } else if (delay_difference < 0) {
+    fraction = kMinFractionWhenPossiblyNonCausal -
+        kFractionSlope * delay_difference;
+    fraction = (fraction > 1.f ? 1.f : fraction);
+  }
+  histogram_threshold *= fraction;
+  histogram_threshold = (histogram_threshold > kMinHistogramThreshold ?
+      histogram_threshold : kMinHistogramThreshold);
+
+  is_histogram_valid =
+      (self->histogram[candidate_delay] >= histogram_threshold) &&
+      (self->candidate_hits > kMinRequiredHits);
+
+  return is_histogram_valid;
+}
+
+// Performs a robust validation of the |candidate_delay| estimated in
+// WebRtc_ProcessBinarySpectrum().  The algorithm takes the
+// |is_instantaneous_valid| and the |is_histogram_valid| and combines them
+// into a robust validation.  The HistogramBasedValidation() has to be called
+// prior to this call.
+// For further description on how the combination is done, see commented code.
+//
+// Inputs:
+//  - candidate_delay         : The delay to validate.
+//  - is_instantaneous_valid  : The instantaneous validation performed in
+//                              WebRtc_ProcessBinarySpectrum().
+//  - is_histogram_valid      : The histogram based validation.
+//
+// Return value:
+//  - is_robust               : 1 - The candidate_delay is valid according to a
+//                                  combination of the two inputs.
+//                            : 0 - Otherwise.
+static int RobustValidation(const BinaryDelayEstimator* self,
+                            int candidate_delay,
+                            int is_instantaneous_valid,
+                            int is_histogram_valid) {
+  int is_robust = 0;
+
+  // The final robust validation is based on the two algorithms; 1) the
+  // |is_instantaneous_valid| and 2) the histogram based with result stored in
+  // |is_histogram_valid|.
+  //   i) Before we actually have a valid estimate (|last_delay| == -2), we say
+  //      a candidate is valid if either algorithm states so
+  //      (|is_instantaneous_valid| OR |is_histogram_valid|).
+  is_robust = (self->last_delay < 0) &&
+      (is_instantaneous_valid || is_histogram_valid);
+  //  ii) Otherwise, we need both algorithms to be certain
+  //      (|is_instantaneous_valid| AND |is_histogram_valid|)
+  is_robust |= is_instantaneous_valid && is_histogram_valid;
+  // iii) With one exception, i.e., the histogram based algorithm can overrule
+  //      the instantaneous one if |is_histogram_valid| = 1 and the histogram
+  //      is significantly strong.
+  is_robust |= is_histogram_valid &&
+      (self->histogram[candidate_delay] > self->last_delay_histogram);
+
+  return is_robust;
+}
+
+void WebRtc_FreeBinaryDelayEstimatorFarend(BinaryDelayEstimatorFarend* self) {
+
+  if (self == NULL) {
+    return;
+  }
+
+  free(self->binary_far_history);
+  self->binary_far_history = NULL;
+
+  free(self->far_bit_counts);
+  self->far_bit_counts = NULL;
+
+  free(self);
+}
+
+BinaryDelayEstimatorFarend* WebRtc_CreateBinaryDelayEstimatorFarend(
+    int history_size) {
+  BinaryDelayEstimatorFarend* self = NULL;
+
+  if (history_size > 1) {
+    // Sanity conditions fulfilled.
+    self = static_cast<BinaryDelayEstimatorFarend*>(
+        malloc(sizeof(BinaryDelayEstimatorFarend)));
+  }
+  if (self == NULL) {
+    return NULL;
+  }
+
+  self->history_size = 0;
+  self->binary_far_history = NULL;
+  self->far_bit_counts = NULL;
+  if (WebRtc_AllocateFarendBufferMemory(self, history_size) == 0) {
+    WebRtc_FreeBinaryDelayEstimatorFarend(self);
+    self = NULL;
+  }
+  return self;
+}
+
+int WebRtc_AllocateFarendBufferMemory(BinaryDelayEstimatorFarend* self,
+                                      int history_size) {
+  RTC_DCHECK(self);
+  // (Re-)Allocate memory for history buffers.
+  self->binary_far_history = static_cast<uint32_t*>(
+      realloc(self->binary_far_history,
+              history_size * sizeof(*self->binary_far_history)));
+  self->far_bit_counts = static_cast<int*>(
+      realloc(self->far_bit_counts,
+              history_size * sizeof(*self->far_bit_counts)));
+  if ((self->binary_far_history == NULL) || (self->far_bit_counts == NULL)) {
+    history_size = 0;
+  }
+  // Fill with zeros if we have expanded the buffers.
+  if (history_size > self->history_size) {
+    int size_diff = history_size - self->history_size;
+    memset(&self->binary_far_history[self->history_size],
+           0,
+           sizeof(*self->binary_far_history) * size_diff);
+    memset(&self->far_bit_counts[self->history_size],
+           0,
+           sizeof(*self->far_bit_counts) * size_diff);
+  }
+  self->history_size = history_size;
+
+  return self->history_size;
+}
+
+void WebRtc_InitBinaryDelayEstimatorFarend(BinaryDelayEstimatorFarend* self) {
+  RTC_DCHECK(self);
+  memset(self->binary_far_history, 0, sizeof(uint32_t) * self->history_size);
+  memset(self->far_bit_counts, 0, sizeof(int) * self->history_size);
+}
+
+void WebRtc_SoftResetBinaryDelayEstimatorFarend(
+    BinaryDelayEstimatorFarend* self, int delay_shift) {
+  int abs_shift = abs(delay_shift);
+  int shift_size = 0;
+  int dest_index = 0;
+  int src_index = 0;
+  int padding_index = 0;
+
+  RTC_DCHECK(self);
+  shift_size = self->history_size - abs_shift;
+  RTC_DCHECK_GT(shift_size, 0);
+  if (delay_shift == 0) {
+    return;
+  } else if (delay_shift > 0) {
+    dest_index = abs_shift;
+  } else if (delay_shift < 0) {
+    src_index = abs_shift;
+    padding_index = shift_size;
+  }
+
+  // Shift and zero pad buffers.
+  memmove(&self->binary_far_history[dest_index],
+          &self->binary_far_history[src_index],
+          sizeof(*self->binary_far_history) * shift_size);
+  memset(&self->binary_far_history[padding_index], 0,
+         sizeof(*self->binary_far_history) * abs_shift);
+  memmove(&self->far_bit_counts[dest_index],
+          &self->far_bit_counts[src_index],
+          sizeof(*self->far_bit_counts) * shift_size);
+  memset(&self->far_bit_counts[padding_index], 0,
+         sizeof(*self->far_bit_counts) * abs_shift);
+}
+
+void WebRtc_AddBinaryFarSpectrum(BinaryDelayEstimatorFarend* handle,
+                                 uint32_t binary_far_spectrum) {
+  RTC_DCHECK(handle);
+  // Shift binary spectrum history and insert current |binary_far_spectrum|.
+  memmove(&(handle->binary_far_history[1]), &(handle->binary_far_history[0]),
+          (handle->history_size - 1) * sizeof(uint32_t));
+  handle->binary_far_history[0] = binary_far_spectrum;
+
+  // Shift history of far-end binary spectrum bit counts and insert bit count
+  // of current |binary_far_spectrum|.
+  memmove(&(handle->far_bit_counts[1]), &(handle->far_bit_counts[0]),
+          (handle->history_size - 1) * sizeof(int));
+  handle->far_bit_counts[0] = BitCount(binary_far_spectrum);
+}
+
+void WebRtc_FreeBinaryDelayEstimator(BinaryDelayEstimator* self) {
+
+  if (self == NULL) {
+    return;
+  }
+
+  free(self->mean_bit_counts);
+  self->mean_bit_counts = NULL;
+
+  free(self->bit_counts);
+  self->bit_counts = NULL;
+
+  free(self->binary_near_history);
+  self->binary_near_history = NULL;
+
+  free(self->histogram);
+  self->histogram = NULL;
+
+  // BinaryDelayEstimator does not have ownership of |farend|, hence we do not
+  // free the memory here. That should be handled separately by the user.
+  self->farend = NULL;
+
+  free(self);
+}
+
+BinaryDelayEstimator* WebRtc_CreateBinaryDelayEstimator(
+    BinaryDelayEstimatorFarend* farend, int max_lookahead) {
+  BinaryDelayEstimator* self = NULL;
+
+  if ((farend != NULL) && (max_lookahead >= 0)) {
+    // Sanity conditions fulfilled.
+    self = static_cast<BinaryDelayEstimator*>(
+        malloc(sizeof(BinaryDelayEstimator)));
+  }
+  if (self == NULL) {
+    return NULL;
+  }
+
+  self->farend = farend;
+  self->near_history_size = max_lookahead + 1;
+  self->history_size = 0;
+  self->robust_validation_enabled = 0;  // Disabled by default.
+  self->allowed_offset = 0;
+
+  self->lookahead = max_lookahead;
+
+  // Allocate memory for spectrum and history buffers.
+  self->mean_bit_counts = NULL;
+  self->bit_counts = NULL;
+  self->histogram = NULL;
+  self->binary_near_history = static_cast<uint32_t*>(
+      malloc((max_lookahead + 1) * sizeof(*self->binary_near_history)));
+  if (self->binary_near_history == NULL ||
+      WebRtc_AllocateHistoryBufferMemory(self, farend->history_size) == 0) {
+    WebRtc_FreeBinaryDelayEstimator(self);
+    self = NULL;
+  }
+
+  return self;
+}
+
+int WebRtc_AllocateHistoryBufferMemory(BinaryDelayEstimator* self,
+                                       int history_size) {
+  BinaryDelayEstimatorFarend* far = self->farend;
+  // (Re-)Allocate memory for spectrum and history buffers.
+  if (history_size != far->history_size) {
+    // Only update far-end buffers if we need.
+    history_size = WebRtc_AllocateFarendBufferMemory(far, history_size);
+  }
+  // The extra array element in |mean_bit_counts| and |histogram| is a dummy
+  // element only used while |last_delay| == -2, i.e., before we have a valid
+  // estimate.
+  self->mean_bit_counts = static_cast<int32_t*>(
+      realloc(self->mean_bit_counts,
+              (history_size + 1) * sizeof(*self->mean_bit_counts)));
+  self->bit_counts = static_cast<int32_t*>(
+      realloc(self->bit_counts, history_size * sizeof(*self->bit_counts)));
+  self->histogram = static_cast<float*>(
+      realloc(self->histogram, (history_size + 1) * sizeof(*self->histogram)));
+
+  if ((self->mean_bit_counts == NULL) ||
+      (self->bit_counts == NULL) ||
+      (self->histogram == NULL)) {
+    history_size = 0;
+  }
+  // Fill with zeros if we have expanded the buffers.
+  if (history_size > self->history_size) {
+    int size_diff = history_size - self->history_size;
+    memset(&self->mean_bit_counts[self->history_size],
+           0,
+           sizeof(*self->mean_bit_counts) * size_diff);
+    memset(&self->bit_counts[self->history_size],
+           0,
+           sizeof(*self->bit_counts) * size_diff);
+    memset(&self->histogram[self->history_size],
+           0,
+           sizeof(*self->histogram) * size_diff);
+  }
+  self->history_size = history_size;
+
+  return self->history_size;
+}
+
+void WebRtc_InitBinaryDelayEstimator(BinaryDelayEstimator* self) {
+  int i = 0;
+  RTC_DCHECK(self);
+
+  memset(self->bit_counts, 0, sizeof(int32_t) * self->history_size);
+  memset(self->binary_near_history,
+         0,
+         sizeof(uint32_t) * self->near_history_size);
+  for (i = 0; i <= self->history_size; ++i) {
+    self->mean_bit_counts[i] = (20 << 9);  // 20 in Q9.
+    self->histogram[i] = 0.f;
+  }
+  self->minimum_probability = kMaxBitCountsQ9;  // 32 in Q9.
+  self->last_delay_probability = (int) kMaxBitCountsQ9;  // 32 in Q9.
+
+  // Default return value if we're unable to estimate. -1 is used for errors.
+  self->last_delay = -2;
+
+  self->last_candidate_delay = -2;
+  self->compare_delay = self->history_size;
+  self->candidate_hits = 0;
+  self->last_delay_histogram = 0.f;
+}
+
+int WebRtc_SoftResetBinaryDelayEstimator(BinaryDelayEstimator* self,
+                                         int delay_shift) {
+  int lookahead = 0;
+  RTC_DCHECK(self);
+  lookahead = self->lookahead;
+  self->lookahead -= delay_shift;
+  if (self->lookahead < 0) {
+    self->lookahead = 0;
+  }
+  if (self->lookahead > self->near_history_size - 1) {
+    self->lookahead = self->near_history_size - 1;
+  }
+  return lookahead - self->lookahead;
+}
+
+int WebRtc_ProcessBinarySpectrum(BinaryDelayEstimator* self,
+                                 uint32_t binary_near_spectrum) {
+  int i = 0;
+  int candidate_delay = -1;
+  int valid_candidate = 0;
+
+  int32_t value_best_candidate = kMaxBitCountsQ9;
+  int32_t value_worst_candidate = 0;
+  int32_t valley_depth = 0;
+
+  RTC_DCHECK(self);
+  if (self->farend->history_size != self->history_size) {
+    // Non matching history sizes.
+    return -1;
+  }
+  if (self->near_history_size > 1) {
+    // If we apply lookahead, shift near-end binary spectrum history. Insert
+    // current |binary_near_spectrum| and pull out the delayed one.
+    memmove(&(self->binary_near_history[1]), &(self->binary_near_history[0]),
+            (self->near_history_size - 1) * sizeof(uint32_t));
+    self->binary_near_history[0] = binary_near_spectrum;
+    binary_near_spectrum = self->binary_near_history[self->lookahead];
+  }
+
+  // Compare with delayed spectra and store the |bit_counts| for each delay.
+  BitCountComparison(binary_near_spectrum, self->farend->binary_far_history,
+                     self->history_size, self->bit_counts);
+
+  // Update |mean_bit_counts|, which is the smoothed version of |bit_counts|.
+  for (i = 0; i < self->history_size; i++) {
+    // |bit_counts| is constrained to [0, 32], meaning we can smooth with a
+    // factor up to 2^26. We use Q9.
+    int32_t bit_count = (self->bit_counts[i] << 9);  // Q9.
+
+    // Update |mean_bit_counts| only when far-end signal has something to
+    // contribute. If |far_bit_counts| is zero the far-end signal is weak and
+    // we likely have a poor echo condition, hence don't update.
+    if (self->farend->far_bit_counts[i] > 0) {
+      // Make number of right shifts piecewise linear w.r.t. |far_bit_counts|.
+      int shifts = kShiftsAtZero;
+      shifts -= (kShiftsLinearSlope * self->farend->far_bit_counts[i]) >> 4;
+      WebRtc_MeanEstimatorFix(bit_count, shifts, &(self->mean_bit_counts[i]));
+    }
+  }
+
+  // Find |candidate_delay|, |value_best_candidate| and |value_worst_candidate|
+  // of |mean_bit_counts|.
+  for (i = 0; i < self->history_size; i++) {
+    if (self->mean_bit_counts[i] < value_best_candidate) {
+      value_best_candidate = self->mean_bit_counts[i];
+      candidate_delay = i;
+    }
+    if (self->mean_bit_counts[i] > value_worst_candidate) {
+      value_worst_candidate = self->mean_bit_counts[i];
+    }
+  }
+  valley_depth = value_worst_candidate - value_best_candidate;
+
+  // The |value_best_candidate| is a good indicator on the probability of
+  // |candidate_delay| being an accurate delay (a small |value_best_candidate|
+  // means a good binary match). In the following sections we make a decision
+  // whether to update |last_delay| or not.
+  // 1) If the difference bit counts between the best and the worst delay
+  //    candidates is too small we consider the situation to be unreliable and
+  //    don't update |last_delay|.
+  // 2) If the situation is reliable we update |last_delay| if the value of the
+  //    best candidate delay has a value less than
+  //     i) an adaptive threshold |minimum_probability|, or
+  //    ii) this corresponding value |last_delay_probability|, but updated at
+  //        this time instant.
+
+  // Update |minimum_probability|.
+  if ((self->minimum_probability > kProbabilityLowerLimit) &&
+      (valley_depth > kProbabilityMinSpread)) {
+    // The "hard" threshold can't be lower than 17 (in Q9).
+    // The valley in the curve also has to be distinct, i.e., the
+    // difference between |value_worst_candidate| and |value_best_candidate| has
+    // to be large enough.
+    int32_t threshold = value_best_candidate + kProbabilityOffset;
+    if (threshold < kProbabilityLowerLimit) {
+      threshold = kProbabilityLowerLimit;
+    }
+    if (self->minimum_probability > threshold) {
+      self->minimum_probability = threshold;
+    }
+  }
+  // Update |last_delay_probability|.
+  // We use a Markov type model, i.e., a slowly increasing level over time.
+  self->last_delay_probability++;
+  // Validate |candidate_delay|.  We have a reliable instantaneous delay
+  // estimate if
+  //  1) The valley is distinct enough (|valley_depth| > |kProbabilityOffset|)
+  // and
+  //  2) The depth of the valley is deep enough
+  //      (|value_best_candidate| < |minimum_probability|)
+  //     and deeper than the best estimate so far
+  //      (|value_best_candidate| < |last_delay_probability|)
+  valid_candidate = ((valley_depth > kProbabilityOffset) &&
+      ((value_best_candidate < self->minimum_probability) ||
+          (value_best_candidate < self->last_delay_probability)));
+
+  // Check for nonstationary farend signal.
+  const bool non_stationary_farend =
+      std::any_of(self->farend->far_bit_counts,
+                  self->farend->far_bit_counts + self->history_size,
+                  [](int a) { return a > 0; });
+
+  if (non_stationary_farend) {
+    // Only update the validation statistics when the farend is nonstationary
+    // as the underlying estimates are otherwise frozen.
+    UpdateRobustValidationStatistics(self, candidate_delay, valley_depth,
+                                     value_best_candidate);
+  }
+
+  if (self->robust_validation_enabled) {
+    int is_histogram_valid = HistogramBasedValidation(self, candidate_delay);
+    valid_candidate = RobustValidation(self, candidate_delay, valid_candidate,
+                                       is_histogram_valid);
+
+  }
+
+  // Only update the delay estimate when the farend is nonstationary and when
+  // a valid delay candidate is available.
+  if (non_stationary_farend && valid_candidate) {
+    if (candidate_delay != self->last_delay) {
+      self->last_delay_histogram =
+          (self->histogram[candidate_delay] > kLastHistogramMax ?
+              kLastHistogramMax : self->histogram[candidate_delay]);
+      // Adjust the histogram if we made a change to |last_delay|, though it was
+      // not the most likely one according to the histogram.
+      if (self->histogram[candidate_delay] <
+          self->histogram[self->compare_delay]) {
+        self->histogram[self->compare_delay] = self->histogram[candidate_delay];
+      }
+    }
+    self->last_delay = candidate_delay;
+    if (value_best_candidate < self->last_delay_probability) {
+      self->last_delay_probability = value_best_candidate;
+    }
+    self->compare_delay = self->last_delay;
+  }
+
+  return self->last_delay;
+}
+
+int WebRtc_binary_last_delay(BinaryDelayEstimator* self) {
+  RTC_DCHECK(self);
+  return self->last_delay;
+}
+
+float WebRtc_binary_last_delay_quality(BinaryDelayEstimator* self) {
+  float quality = 0;
+  RTC_DCHECK(self);
+
+  if (self->robust_validation_enabled) {
+    // Simply a linear function of the histogram height at delay estimate.
+    quality = self->histogram[self->compare_delay] / kHistogramMax;
+  } else {
+    // Note that |last_delay_probability| states how deep the minimum of the
+    // cost function is, so it is rather an error probability.
+    quality = (float) (kMaxBitCountsQ9 - self->last_delay_probability) /
+        kMaxBitCountsQ9;
+    if (quality < 0) {
+      quality = 0;
+    }
+  }
+  return quality;
+}
+
+void WebRtc_MeanEstimatorFix(int32_t new_value,
+                             int factor,
+                             int32_t* mean_value) {
+  int32_t diff = new_value - *mean_value;
+
+  // mean_new = mean_value + ((new_value - mean_value) >> factor);
+  if (diff < 0) {
+    diff = -((-diff) >> factor);
+  } else {
+    diff = (diff >> factor);
+  }
+  *mean_value += diff;
+}
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.h
new file mode 100644
index 0000000000..cce6113a53
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.h
@@ -0,0 +1,251 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Performs delay estimation on binary converted spectra.
+// The return value is  0 - OK and -1 - Error, unless otherwise stated.
+
+#ifndef MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_H_
+#define MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_H_
+
+#include "typedefs.h"  // NOLINT(build/include)
+
+static const int32_t kMaxBitCountsQ9 = (32 << 9);  // 32 matching bits in Q9.
+
+typedef struct {
+  // Pointer to bit counts.
+  int* far_bit_counts;
+  // Binary history variables.
+  uint32_t* binary_far_history;
+  int history_size;
+} BinaryDelayEstimatorFarend;
+
+typedef struct {
+  // Pointer to bit counts.
+  int32_t* mean_bit_counts;
+  // Array only used locally in ProcessBinarySpectrum() but whose size is
+  // determined at run-time.
+  int32_t* bit_counts;
+
+  // Binary history variables.
+  uint32_t* binary_near_history;
+  int near_history_size;
+  int history_size;
+
+  // Delay estimation variables.
+  int32_t minimum_probability;
+  int last_delay_probability;
+
+  // Delay memory.
+  int last_delay;
+
+  // Robust validation
+  int robust_validation_enabled;
+  int allowed_offset;
+  int last_candidate_delay;
+  int compare_delay;
+  int candidate_hits;
+  float* histogram;
+  float last_delay_histogram;
+
+  // For dynamically changing the lookahead when using SoftReset...().
+  int lookahead;
+
+  // Far-end binary spectrum history buffer etc.
+  BinaryDelayEstimatorFarend* farend;
+} BinaryDelayEstimator;
+
+// Releases the memory allocated by
+// WebRtc_CreateBinaryDelayEstimatorFarend(...).
+// Input:
+//    - self              : Pointer to the binary delay estimation far-end
+//                          instance which is the return value of
+//                          WebRtc_CreateBinaryDelayEstimatorFarend().
+//
+void WebRtc_FreeBinaryDelayEstimatorFarend(BinaryDelayEstimatorFarend* self);
+
+// Allocates the memory needed by the far-end part of the binary delay
+// estimation. The memory needs to be initialized separately through
+// WebRtc_InitBinaryDelayEstimatorFarend(...).
+//
+// Inputs:
+//      - history_size    : Size of the far-end binary spectrum history.
+//
+// Return value:
+//      - BinaryDelayEstimatorFarend*
+//                        : Created |handle|. If the memory can't be allocated
+//                          or if any of the input parameters are invalid NULL
+//                          is returned.
+//
+BinaryDelayEstimatorFarend* WebRtc_CreateBinaryDelayEstimatorFarend(
+    int history_size);
+
+// Re-allocates the buffers.
+//
+// Inputs:
+//      - self            : Pointer to the binary estimation far-end instance
+//                          which is the return value of
+//                          WebRtc_CreateBinaryDelayEstimatorFarend().
+//      - history_size    : Size of the far-end binary spectrum history.
+//
+// Return value:
+//      - history_size    : The history size allocated.
+int WebRtc_AllocateFarendBufferMemory(BinaryDelayEstimatorFarend* self,
+                                      int history_size);
+
+// Initializes the delay estimation far-end instance created with
+// WebRtc_CreateBinaryDelayEstimatorFarend(...).
+//
+// Input:
+//    - self              : Pointer to the delay estimation far-end instance.
+//
+// Output:
+//    - self              : Initialized far-end instance.
+//
+void WebRtc_InitBinaryDelayEstimatorFarend(BinaryDelayEstimatorFarend* self);
+
+// Soft resets the delay estimation far-end instance created with
+// WebRtc_CreateBinaryDelayEstimatorFarend(...).
+//
+// Input:
+//    - delay_shift   : The amount of blocks to shift history buffers.
+//
+void WebRtc_SoftResetBinaryDelayEstimatorFarend(
+    BinaryDelayEstimatorFarend* self, int delay_shift);
+
+// Adds the binary far-end spectrum to the internal far-end history buffer. This
+// spectrum is used as reference when calculating the delay using
+// WebRtc_ProcessBinarySpectrum().
+//
+// Inputs:
+//    - self                  : Pointer to the delay estimation far-end
+//                              instance.
+//    - binary_far_spectrum   : Far-end binary spectrum.
+//
+// Output:
+//    - self                  : Updated far-end instance.
+//
+void WebRtc_AddBinaryFarSpectrum(BinaryDelayEstimatorFarend* self,
+                                 uint32_t binary_far_spectrum);
+
+// Releases the memory allocated by WebRtc_CreateBinaryDelayEstimator(...).
+//
+// Note that BinaryDelayEstimator utilizes BinaryDelayEstimatorFarend, but does
+// not take ownership of it, hence the BinaryDelayEstimator has to be torn down
+// before the far-end.
+//
+// Input:
+//    - self              : Pointer to the binary delay estimation instance
+//                          which is the return value of
+//                          WebRtc_CreateBinaryDelayEstimator().
+//
+void WebRtc_FreeBinaryDelayEstimator(BinaryDelayEstimator* self);
+
+// Allocates the memory needed by the binary delay estimation. The memory needs
+// to be initialized separately through WebRtc_InitBinaryDelayEstimator(...).
+//
+// See WebRtc_CreateDelayEstimator(..) in delay_estimator_wrapper.c for detailed
+// description.
+BinaryDelayEstimator* WebRtc_CreateBinaryDelayEstimator(
+    BinaryDelayEstimatorFarend* farend, int max_lookahead);
+
+// Re-allocates |history_size| dependent buffers. The far-end buffers will be
+// updated at the same time if needed.
+//
+// Input:
+//      - self            : Pointer to the binary estimation instance which is
+//                          the return value of
+//                          WebRtc_CreateBinaryDelayEstimator().
+//      - history_size    : Size of the history buffers.
+//
+// Return value:
+//      - history_size    : The history size allocated.
+int WebRtc_AllocateHistoryBufferMemory(BinaryDelayEstimator* self,
+                                       int history_size);
+
+// Initializes the delay estimation instance created with
+// WebRtc_CreateBinaryDelayEstimator(...).
+//
+// Input:
+//    - self              : Pointer to the delay estimation instance.
+//
+// Output:
+//    - self              : Initialized instance.
+//
+void WebRtc_InitBinaryDelayEstimator(BinaryDelayEstimator* self);
+
+// Soft resets the delay estimation instance created with
+// WebRtc_CreateBinaryDelayEstimator(...).
+//
+// Input:
+//    - delay_shift   : The amount of blocks to shift history buffers.
+//
+// Return value:
+//    - actual_shifts : The actual number of shifts performed.
+//
+int WebRtc_SoftResetBinaryDelayEstimator(BinaryDelayEstimator* self,
+                                         int delay_shift);
+
+// Estimates and returns the delay between the binary far-end and binary near-
+// end spectra. It is assumed the binary far-end spectrum has been added using
+// WebRtc_AddBinaryFarSpectrum() prior to this call. The value will be offset by
+// the lookahead (i.e. the lookahead should be subtracted from the returned
+// value).
+//
+// Inputs:
+//    - self                  : Pointer to the delay estimation instance.
+//    - binary_near_spectrum  : Near-end binary spectrum of the current block.
+//
+// Output:
+//    - self                  : Updated instance.
+//
+// Return value:
+//    - delay                 :  >= 0 - Calculated delay value.
+//                              -2    - Insufficient data for estimation.
+//
+int WebRtc_ProcessBinarySpectrum(BinaryDelayEstimator* self,
+                                 uint32_t binary_near_spectrum);
+
+// Returns the last calculated delay updated by the function
+// WebRtc_ProcessBinarySpectrum(...).
+//
+// Input:
+//    - self                  : Pointer to the delay estimation instance.
+//
+// Return value:
+//    - delay                 :  >= 0 - Last calculated delay value
+//                              -2    - Insufficient data for estimation.
+//
+int WebRtc_binary_last_delay(BinaryDelayEstimator* self);
+
+// Returns the estimation quality of the last calculated delay updated by the
+// function WebRtc_ProcessBinarySpectrum(...). The estimation quality is a value
+// in the interval [0, 1].  The higher the value, the better the quality.
+//
+// Return value:
+//    - delay_quality         :  >= 0 - Estimation quality of last calculated
+//                                      delay value.
+float WebRtc_binary_last_delay_quality(BinaryDelayEstimator* self);
+
+// Updates the |mean_value| recursively with a step size of 2^-|factor|. This
+// function is used internally in the Binary Delay Estimator as well as the
+// Fixed point wrapper.
+//
+// Inputs:
+//    - new_value             : The new value the mean should be updated with.
+//    - factor                : The step size, in number of right shifts.
+//
+// Input/Output:
+//    - mean_value            : Pointer to the mean value.
+//
+void WebRtc_MeanEstimatorFix(int32_t new_value,
+                             int factor,
+                             int32_t* mean_value);
+
+#endif  // MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_H_
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_internal.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_internal.h
new file mode 100644
index 0000000000..46eea3ec18
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_internal.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Header file including the delay estimator handle used for testing.
+
+#ifndef MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_INTERNAL_H_
+#define MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_INTERNAL_H_
+
+#include "modules/audio_processing/utility/delay_estimator.h"
+#include "typedefs.h"  // NOLINT(build/include)
+
+typedef union {
+  float float_;
+  int32_t int32_;
+} SpectrumType;
+
+typedef struct {
+  // Pointers to mean values of spectrum.
+  SpectrumType* mean_far_spectrum;
+  // |mean_far_spectrum| initialization indicator.
+  int far_spectrum_initialized;
+
+  int spectrum_size;
+
+  // Far-end part of binary spectrum based delay estimation.
+  BinaryDelayEstimatorFarend* binary_farend;
+} DelayEstimatorFarend;
+
+typedef struct {
+  // Pointers to mean values of spectrum.
+  SpectrumType* mean_near_spectrum;
+  // |mean_near_spectrum| initialization indicator.
+  int near_spectrum_initialized;
+
+  int spectrum_size;
+
+  // Binary spectrum based delay estimator
+  BinaryDelayEstimator* binary_handle;
+} DelayEstimator;
+
+#endif  // MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_INTERNAL_H_
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_unittest.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_unittest.cc
new file mode 100644
index 0000000000..36700e5706
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_unittest.cc
@@ -0,0 +1,618 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/utility/delay_estimator.h"
+#include "modules/audio_processing/utility/delay_estimator_internal.h"
+#include "modules/audio_processing/utility/delay_estimator_wrapper.h"
+#include "test/gtest.h"
+#include "typedefs.h"  // NOLINT(build/include)
+
+namespace {
+
+enum { kSpectrumSize = 65 };
+// Delay history sizes.
+enum { kMaxDelay = 100 };
+enum { kLookahead = 10 };
+enum { kHistorySize = kMaxDelay + kLookahead };
+// Length of binary spectrum sequence.
+enum { kSequenceLength = 400 };
+
+const int kDifferentHistorySize = 3;
+const int kDifferentLookahead = 1;
+
+const int kEnable[] = { 0, 1 };
+const size_t kSizeEnable = sizeof(kEnable) / sizeof(*kEnable);
+
+class DelayEstimatorTest : public ::testing::Test {
+ protected:
+  DelayEstimatorTest();
+  virtual void SetUp();
+  virtual void TearDown();
+
+  void Init();
+  void InitBinary();
+  void VerifyDelay(BinaryDelayEstimator* binary_handle, int offset, int delay);
+  void RunBinarySpectra(BinaryDelayEstimator* binary1,
+                        BinaryDelayEstimator* binary2,
+                        int near_offset, int lookahead_offset, int far_offset);
+  void RunBinarySpectraTest(int near_offset, int lookahead_offset,
+                            int ref_robust_validation, int robust_validation);
+
+  void* handle_;
+  DelayEstimator* self_;
+  void* farend_handle_;
+  DelayEstimatorFarend* farend_self_;
+  BinaryDelayEstimator* binary_;
+  BinaryDelayEstimatorFarend* binary_farend_;
+  int spectrum_size_;
+  // Dummy input spectra.
+  float far_f_[kSpectrumSize];
+  float near_f_[kSpectrumSize];
+  uint16_t far_u16_[kSpectrumSize];
+  uint16_t near_u16_[kSpectrumSize];
+  uint32_t binary_spectrum_[kSequenceLength + kHistorySize];
+};
+
+DelayEstimatorTest::DelayEstimatorTest()
+    : handle_(NULL),
+      self_(NULL),
+      farend_handle_(NULL),
+      farend_self_(NULL),
+      binary_(NULL),
+      binary_farend_(NULL),
+      spectrum_size_(kSpectrumSize) {
+  // Dummy input data are set with more or less arbitrary non-zero values.
+  memset(far_f_, 1, sizeof(far_f_));
+  memset(near_f_, 2, sizeof(near_f_));
+  memset(far_u16_, 1, sizeof(far_u16_));
+  memset(near_u16_, 2, sizeof(near_u16_));
+  // Construct a sequence of binary spectra used to verify delay estimate. The
+  // |kSequenceLength| has to be long enough for the delay estimation to leave
+  // the initialized state.
+  binary_spectrum_[0] = 1;
+  for (int i = 1; i < (kSequenceLength + kHistorySize); i++) {
+    binary_spectrum_[i] = 3 * binary_spectrum_[i - 1];
+  }
+}
+
+void DelayEstimatorTest::SetUp() {
+  farend_handle_ = WebRtc_CreateDelayEstimatorFarend(kSpectrumSize,
+                                                     kHistorySize);
+  ASSERT_TRUE(farend_handle_ != NULL);
+  farend_self_ = reinterpret_cast<DelayEstimatorFarend*>(farend_handle_);
+  handle_ = WebRtc_CreateDelayEstimator(farend_handle_, kLookahead);
+  ASSERT_TRUE(handle_ != NULL);
+  self_ = reinterpret_cast<DelayEstimator*>(handle_);
+  binary_farend_ = WebRtc_CreateBinaryDelayEstimatorFarend(kHistorySize);
+  ASSERT_TRUE(binary_farend_ != NULL);
+  binary_ = WebRtc_CreateBinaryDelayEstimator(binary_farend_, kLookahead);
+  ASSERT_TRUE(binary_ != NULL);
+}
+
+void DelayEstimatorTest::TearDown() {
+  WebRtc_FreeDelayEstimator(handle_);
+  handle_ = NULL;
+  self_ = NULL;
+  WebRtc_FreeDelayEstimatorFarend(farend_handle_);
+  farend_handle_ = NULL;
+  farend_self_ = NULL;
+  WebRtc_FreeBinaryDelayEstimator(binary_);
+  binary_ = NULL;
+  WebRtc_FreeBinaryDelayEstimatorFarend(binary_farend_);
+  binary_farend_ = NULL;
+}
+
+void DelayEstimatorTest::Init() {
+  // Initialize Delay Estimator
+  EXPECT_EQ(0, WebRtc_InitDelayEstimatorFarend(farend_handle_));
+  EXPECT_EQ(0, WebRtc_InitDelayEstimator(handle_));
+  // Verify initialization.
+  EXPECT_EQ(0, farend_self_->far_spectrum_initialized);
+  EXPECT_EQ(0, self_->near_spectrum_initialized);
+  EXPECT_EQ(-2, WebRtc_last_delay(handle_));  // Delay in initial state.
+  EXPECT_FLOAT_EQ(0, WebRtc_last_delay_quality(handle_));  // Zero quality.
+}
+
+void DelayEstimatorTest::InitBinary() {
+  // Initialize Binary Delay Estimator (far-end part).
+  WebRtc_InitBinaryDelayEstimatorFarend(binary_farend_);
+  // Initialize Binary Delay Estimator
+  WebRtc_InitBinaryDelayEstimator(binary_);
+  // Verify initialization. This does not guarantee a complete check, since
+  // |last_delay| may be equal to -2 before initialization if done on the fly.
+  EXPECT_EQ(-2, binary_->last_delay);
+}
+
+void DelayEstimatorTest::VerifyDelay(BinaryDelayEstimator* binary_handle,
+                                     int offset, int delay) {
+  // Verify that we WebRtc_binary_last_delay() returns correct delay.
+  EXPECT_EQ(delay, WebRtc_binary_last_delay(binary_handle));
+
+  if (delay != -2) {
+    // Verify correct delay estimate. In the non-causal case the true delay
+    // is equivalent with the |offset|.
+    EXPECT_EQ(offset, delay);
+  }
+}
+
+void DelayEstimatorTest::RunBinarySpectra(BinaryDelayEstimator* binary1,
+                                          BinaryDelayEstimator* binary2,
+                                          int near_offset,
+                                          int lookahead_offset,
+                                          int far_offset) {
+  int different_validations = binary1->robust_validation_enabled ^
+      binary2->robust_validation_enabled;
+  WebRtc_InitBinaryDelayEstimatorFarend(binary_farend_);
+  WebRtc_InitBinaryDelayEstimator(binary1);
+  WebRtc_InitBinaryDelayEstimator(binary2);
+  // Verify initialization. This does not guarantee a complete check, since
+  // |last_delay| may be equal to -2 before initialization if done on the fly.
+  EXPECT_EQ(-2, binary1->last_delay);
+  EXPECT_EQ(-2, binary2->last_delay);
+  for (int i = kLookahead; i < (kSequenceLength + kLookahead); i++) {
+    WebRtc_AddBinaryFarSpectrum(binary_farend_,
+                                binary_spectrum_[i + far_offset]);
+    int delay_1 = WebRtc_ProcessBinarySpectrum(binary1, binary_spectrum_[i]);
+    int delay_2 =
+        WebRtc_ProcessBinarySpectrum(binary2,
+                                     binary_spectrum_[i - near_offset]);
+
+    VerifyDelay(binary1, far_offset + kLookahead, delay_1);
+    VerifyDelay(binary2,
+                far_offset + kLookahead + lookahead_offset + near_offset,
+                delay_2);
+    // Expect the two delay estimates to be offset by |lookahead_offset| +
+    // |near_offset| when we have left the initial state.
+    if ((delay_1 != -2) && (delay_2 != -2)) {
+      EXPECT_EQ(delay_1, delay_2 - lookahead_offset - near_offset);
+    }
+    // For the case of identical signals |delay_1| and |delay_2| should match
+    // all the time, unless one of them has robust validation turned on.  In
+    // that case the robust validation leaves the initial state faster.
+    if ((near_offset == 0) && (lookahead_offset == 0)) {
+      if  (!different_validations) {
+        EXPECT_EQ(delay_1, delay_2);
+      } else {
+        if (binary1->robust_validation_enabled) {
+          EXPECT_GE(delay_1, delay_2);
+        } else {
+          EXPECT_GE(delay_2, delay_1);
+        }
+      }
+    }
+  }
+  // Verify that we have left the initialized state.
+  EXPECT_NE(-2, WebRtc_binary_last_delay(binary1));
+  EXPECT_LT(0, WebRtc_binary_last_delay_quality(binary1));
+  EXPECT_NE(-2, WebRtc_binary_last_delay(binary2));
+  EXPECT_LT(0, WebRtc_binary_last_delay_quality(binary2));
+}
+
+void DelayEstimatorTest::RunBinarySpectraTest(int near_offset,
+                                              int lookahead_offset,
+                                              int ref_robust_validation,
+                                              int robust_validation) {
+  BinaryDelayEstimator* binary2 =
+      WebRtc_CreateBinaryDelayEstimator(binary_farend_,
+                                        kLookahead + lookahead_offset);
+  // Verify the delay for both causal and non-causal systems. For causal systems
+  // the delay is equivalent with a positive |offset| of the far-end sequence.
+  // For non-causal systems the delay is equivalent with a negative |offset| of
+  // the far-end sequence.
+  binary_->robust_validation_enabled = ref_robust_validation;
+  binary2->robust_validation_enabled = robust_validation;
+  for (int offset = -kLookahead;
+      offset < kMaxDelay - lookahead_offset - near_offset;
+      offset++) {
+    RunBinarySpectra(binary_, binary2, near_offset, lookahead_offset, offset);
+  }
+  WebRtc_FreeBinaryDelayEstimator(binary2);
+  binary2 = NULL;
+  binary_->robust_validation_enabled = 0;  // Reset reference.
+}
+
+TEST_F(DelayEstimatorTest, CorrectErrorReturnsOfWrapper) {
+  // In this test we verify correct error returns on invalid API calls.
+
+  // WebRtc_CreateDelayEstimatorFarend() and WebRtc_CreateDelayEstimator()
+  // should return a NULL pointer on invalid input values.
+  // Make sure we have a non-NULL value at start, so we can detect NULL after
+  // create failure.
+  void* handle = farend_handle_;
+  handle = WebRtc_CreateDelayEstimatorFarend(33, kHistorySize);
+  EXPECT_TRUE(handle == NULL);
+  handle = WebRtc_CreateDelayEstimatorFarend(kSpectrumSize, 1);
+  EXPECT_TRUE(handle == NULL);
+
+  handle = handle_;
+  handle = WebRtc_CreateDelayEstimator(NULL, kLookahead);
+  EXPECT_TRUE(handle == NULL);
+  handle = WebRtc_CreateDelayEstimator(farend_handle_, -1);
+  EXPECT_TRUE(handle == NULL);
+
+  // WebRtc_InitDelayEstimatorFarend() and WebRtc_InitDelayEstimator() should
+  // return -1 if we have a NULL pointer as |handle|.
+  EXPECT_EQ(-1, WebRtc_InitDelayEstimatorFarend(NULL));
+  EXPECT_EQ(-1, WebRtc_InitDelayEstimator(NULL));
+
+  // WebRtc_AddFarSpectrumFloat() should return -1 if we have:
+  // 1) NULL pointer as |handle|.
+  // 2) NULL pointer as far-end spectrum.
+  // 3) Incorrect spectrum size.
+  EXPECT_EQ(-1, WebRtc_AddFarSpectrumFloat(NULL, far_f_, spectrum_size_));
+  // Use |farend_handle_| which is properly created at SetUp().
+  EXPECT_EQ(-1, WebRtc_AddFarSpectrumFloat(farend_handle_, NULL,
+                                           spectrum_size_));
+  EXPECT_EQ(-1, WebRtc_AddFarSpectrumFloat(farend_handle_, far_f_,
+                                           spectrum_size_ + 1));
+
+  // WebRtc_AddFarSpectrumFix() should return -1 if we have:
+  // 1) NULL pointer as |handle|.
+  // 2) NULL pointer as far-end spectrum.
+  // 3) Incorrect spectrum size.
+  // 4) Too high precision in far-end spectrum (Q-domain > 15).
+  EXPECT_EQ(-1, WebRtc_AddFarSpectrumFix(NULL, far_u16_, spectrum_size_, 0));
+  EXPECT_EQ(-1, WebRtc_AddFarSpectrumFix(farend_handle_, NULL, spectrum_size_,
+                                         0));
+  EXPECT_EQ(-1, WebRtc_AddFarSpectrumFix(farend_handle_, far_u16_,
+                                         spectrum_size_ + 1, 0));
+  EXPECT_EQ(-1, WebRtc_AddFarSpectrumFix(farend_handle_, far_u16_,
+                                         spectrum_size_, 16));
+
+  // WebRtc_set_history_size() should return -1 if:
+  // 1) |handle| is a NULL.
+  // 2) |history_size| <= 1.
+  EXPECT_EQ(-1, WebRtc_set_history_size(NULL, 1));
+  EXPECT_EQ(-1, WebRtc_set_history_size(handle_, 1));
+  // WebRtc_history_size() should return -1 if:
+  // 1) NULL pointer input.
+  EXPECT_EQ(-1, WebRtc_history_size(NULL));
+  // 2) there is a mismatch between history size.
+  void* tmp_handle = WebRtc_CreateDelayEstimator(farend_handle_, kHistorySize);
+  EXPECT_EQ(0, WebRtc_InitDelayEstimator(tmp_handle));
+  EXPECT_EQ(kDifferentHistorySize,
+            WebRtc_set_history_size(tmp_handle, kDifferentHistorySize));
+  EXPECT_EQ(kDifferentHistorySize, WebRtc_history_size(tmp_handle));
+  EXPECT_EQ(kHistorySize, WebRtc_set_history_size(handle_, kHistorySize));
+  EXPECT_EQ(-1, WebRtc_history_size(tmp_handle));
+
+  // WebRtc_set_lookahead() should return -1 if we try a value outside the
+  /// buffer.
+  EXPECT_EQ(-1, WebRtc_set_lookahead(handle_, kLookahead + 1));
+  EXPECT_EQ(-1, WebRtc_set_lookahead(handle_, -1));
+
+  // WebRtc_set_allowed_offset() should return -1 if we have:
+  // 1) NULL pointer as |handle|.
+  // 2) |allowed_offset| < 0.
+  EXPECT_EQ(-1, WebRtc_set_allowed_offset(NULL, 0));
+  EXPECT_EQ(-1, WebRtc_set_allowed_offset(handle_, -1));
+
+  EXPECT_EQ(-1, WebRtc_get_allowed_offset(NULL));
+
+  // WebRtc_enable_robust_validation() should return -1 if we have:
+  // 1) NULL pointer as |handle|.
+  // 2) Incorrect |enable| value (not 0 or 1).
+  EXPECT_EQ(-1, WebRtc_enable_robust_validation(NULL, kEnable[0]));
+  EXPECT_EQ(-1, WebRtc_enable_robust_validation(handle_, -1));
+  EXPECT_EQ(-1, WebRtc_enable_robust_validation(handle_, 2));
+
+  // WebRtc_is_robust_validation_enabled() should return -1 if we have NULL
+  // pointer as |handle|.
+  EXPECT_EQ(-1, WebRtc_is_robust_validation_enabled(NULL));
+
+  // WebRtc_DelayEstimatorProcessFloat() should return -1 if we have:
+  // 1) NULL pointer as |handle|.
+  // 2) NULL pointer as near-end spectrum.
+  // 3) Incorrect spectrum size.
+  // 4) Non matching history sizes if multiple delay estimators using the same
+  //    far-end reference.
+  EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFloat(NULL, near_f_,
+                                                  spectrum_size_));
+  // Use |handle_| which is properly created at SetUp().
+  EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFloat(handle_, NULL,
+                                                  spectrum_size_));
+  EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFloat(handle_, near_f_,
+                                                  spectrum_size_ + 1));
+  // |tmp_handle| is already in a non-matching state.
+  EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFloat(tmp_handle,
+                                                  near_f_,
+                                                  spectrum_size_));
+
+  // WebRtc_DelayEstimatorProcessFix() should return -1 if we have:
+  // 1) NULL pointer as |handle|.
+  // 2) NULL pointer as near-end spectrum.
+  // 3) Incorrect spectrum size.
+  // 4) Too high precision in near-end spectrum (Q-domain > 15).
+  // 5) Non matching history sizes if multiple delay estimators using the same
+  //    far-end reference.
+  EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFix(NULL, near_u16_, spectrum_size_,
+                                                0));
+  EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFix(handle_, NULL, spectrum_size_,
+                                                0));
+  EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFix(handle_, near_u16_,
+                                                spectrum_size_ + 1, 0));
+  EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFix(handle_, near_u16_,
+                                                spectrum_size_, 16));
+  // |tmp_handle| is already in a non-matching state.
+  EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFix(tmp_handle,
+                                                near_u16_,
+                                                spectrum_size_,
+                                                0));
+  WebRtc_FreeDelayEstimator(tmp_handle);
+
+  // WebRtc_last_delay() should return -1 if we have a NULL pointer as |handle|.
+  EXPECT_EQ(-1, WebRtc_last_delay(NULL));
+
+  // Free any local memory if needed.
+  WebRtc_FreeDelayEstimator(handle);
+}
+
+TEST_F(DelayEstimatorTest, VerifyAllowedOffset) {
+  // Is set to zero by default.
+  EXPECT_EQ(0, WebRtc_get_allowed_offset(handle_));
+  for (int i = 1; i >= 0; i--) {
+    EXPECT_EQ(0, WebRtc_set_allowed_offset(handle_, i));
+    EXPECT_EQ(i, WebRtc_get_allowed_offset(handle_));
+    Init();
+    // Unaffected over a reset.
+    EXPECT_EQ(i, WebRtc_get_allowed_offset(handle_));
+  }
+}
+
+TEST_F(DelayEstimatorTest, VerifyEnableRobustValidation) {
+  // Disabled by default.
+  EXPECT_EQ(0, WebRtc_is_robust_validation_enabled(handle_));
+  for (size_t i = 0; i < kSizeEnable; ++i) {
+    EXPECT_EQ(0, WebRtc_enable_robust_validation(handle_, kEnable[i]));
+    EXPECT_EQ(kEnable[i], WebRtc_is_robust_validation_enabled(handle_));
+    Init();
+    // Unaffected over a reset.
+    EXPECT_EQ(kEnable[i], WebRtc_is_robust_validation_enabled(handle_));
+  }
+}
+
+TEST_F(DelayEstimatorTest, InitializedSpectrumAfterProcess) {
+  // In this test we verify that the mean spectra are initialized after first
+  // time we call WebRtc_AddFarSpectrum() and Process() respectively. The test
+  // also verifies the state is not left for zero spectra.
+  const float kZerosFloat[kSpectrumSize] = { 0.0 };
+  const uint16_t kZerosU16[kSpectrumSize] = { 0 };
+
+  // For floating point operations, process one frame and verify initialization
+  // flag.
+  Init();
+  EXPECT_EQ(0, WebRtc_AddFarSpectrumFloat(farend_handle_, kZerosFloat,
+                                          spectrum_size_));
+  EXPECT_EQ(0, farend_self_->far_spectrum_initialized);
+  EXPECT_EQ(0, WebRtc_AddFarSpectrumFloat(farend_handle_, far_f_,
+                                           spectrum_size_));
+  EXPECT_EQ(1, farend_self_->far_spectrum_initialized);
+  EXPECT_EQ(-2, WebRtc_DelayEstimatorProcessFloat(handle_, kZerosFloat,
+                                                  spectrum_size_));
+  EXPECT_EQ(0, self_->near_spectrum_initialized);
+  EXPECT_EQ(-2, WebRtc_DelayEstimatorProcessFloat(handle_, near_f_,
+                                                  spectrum_size_));
+  EXPECT_EQ(1, self_->near_spectrum_initialized);
+
+  // For fixed point operations, process one frame and verify initialization
+  // flag.
+  Init();
+  EXPECT_EQ(0, WebRtc_AddFarSpectrumFix(farend_handle_, kZerosU16,
+                                        spectrum_size_, 0));
+  EXPECT_EQ(0, farend_self_->far_spectrum_initialized);
+  EXPECT_EQ(0, WebRtc_AddFarSpectrumFix(farend_handle_, far_u16_,
+                                         spectrum_size_, 0));
+  EXPECT_EQ(1, farend_self_->far_spectrum_initialized);
+  EXPECT_EQ(-2, WebRtc_DelayEstimatorProcessFix(handle_, kZerosU16,
+                                                spectrum_size_, 0));
+  EXPECT_EQ(0, self_->near_spectrum_initialized);
+  EXPECT_EQ(-2, WebRtc_DelayEstimatorProcessFix(handle_, near_u16_,
+                                                spectrum_size_, 0));
+  EXPECT_EQ(1, self_->near_spectrum_initialized);
+}
+
+TEST_F(DelayEstimatorTest, CorrectLastDelay) {
+  // In this test we verify that we get the correct last delay upon valid call.
+  // We simply process the same data until we leave the initialized state
+  // (|last_delay| = -2). Then we compare the Process() output with the
+  // last_delay() call.
+
+  // TODO(bjornv): Update quality values for robust validation.
+  int last_delay = 0;
+  // Floating point operations.
+  Init();
+  for (int i = 0; i < 200; i++) {
+    EXPECT_EQ(0, WebRtc_AddFarSpectrumFloat(farend_handle_, far_f_,
+                                            spectrum_size_));
+    last_delay = WebRtc_DelayEstimatorProcessFloat(handle_, near_f_,
+                                                   spectrum_size_);
+    if (last_delay != -2) {
+      EXPECT_EQ(last_delay, WebRtc_last_delay(handle_));
+      if (!WebRtc_is_robust_validation_enabled(handle_)) {
+        EXPECT_FLOAT_EQ(7203.f / kMaxBitCountsQ9,
+                        WebRtc_last_delay_quality(handle_));
+      }
+      break;
+    }
+  }
+  // Verify that we have left the initialized state.
+  EXPECT_NE(-2, WebRtc_last_delay(handle_));
+  EXPECT_LT(0, WebRtc_last_delay_quality(handle_));
+
+  // Fixed point operations.
+  Init();
+  for (int i = 0; i < 200; i++) {
+    EXPECT_EQ(0, WebRtc_AddFarSpectrumFix(farend_handle_, far_u16_,
+                                          spectrum_size_, 0));
+    last_delay = WebRtc_DelayEstimatorProcessFix(handle_, near_u16_,
+                                                 spectrum_size_, 0);
+    if (last_delay != -2) {
+      EXPECT_EQ(last_delay, WebRtc_last_delay(handle_));
+      if (!WebRtc_is_robust_validation_enabled(handle_)) {
+        EXPECT_FLOAT_EQ(7203.f / kMaxBitCountsQ9,
+                        WebRtc_last_delay_quality(handle_));
+      }
+      break;
+    }
+  }
+  // Verify that we have left the initialized state.
+  EXPECT_NE(-2, WebRtc_last_delay(handle_));
+  EXPECT_LT(0, WebRtc_last_delay_quality(handle_));
+}
+
+TEST_F(DelayEstimatorTest, CorrectErrorReturnsOfBinaryEstimatorFarend) {
+  // In this test we verify correct output on invalid API calls to the Binary
+  // Delay Estimator (far-end part).
+
+  BinaryDelayEstimatorFarend* binary = binary_farend_;
+  // WebRtc_CreateBinaryDelayEstimatorFarend() should return -1 if the input
+  // history size is less than 2. This is to make sure the buffer shifting
+  // applies properly.
+  // Make sure we have a non-NULL value at start, so we can detect NULL after
+  // create failure.
+  binary = WebRtc_CreateBinaryDelayEstimatorFarend(1);
+  EXPECT_TRUE(binary == NULL);
+}
+
+TEST_F(DelayEstimatorTest, CorrectErrorReturnsOfBinaryEstimator) {
+  // In this test we verify correct output on invalid API calls to the Binary
+  // Delay Estimator.
+
+  BinaryDelayEstimator* binary_handle = binary_;
+  // WebRtc_CreateBinaryDelayEstimator() should return -1 if we have a NULL
+  // pointer as |binary_farend| or invalid input values. Upon failure, the
+  // |binary_handle| should be NULL.
+  // Make sure we have a non-NULL value at start, so we can detect NULL after
+  // create failure.
+  binary_handle = WebRtc_CreateBinaryDelayEstimator(NULL, kLookahead);
+  EXPECT_TRUE(binary_handle == NULL);
+  binary_handle = WebRtc_CreateBinaryDelayEstimator(binary_farend_, -1);
+  EXPECT_TRUE(binary_handle == NULL);
+}
+
+TEST_F(DelayEstimatorTest, MeanEstimatorFix) {
+  // In this test we verify that we update the mean value in correct direction
+  // only. With "direction" we mean increase or decrease.
+
+  int32_t mean_value = 4000;
+  int32_t mean_value_before = mean_value;
+  int32_t new_mean_value = mean_value * 2;
+
+  // Increasing |mean_value|.
+  WebRtc_MeanEstimatorFix(new_mean_value, 10, &mean_value);
+  EXPECT_LT(mean_value_before, mean_value);
+  EXPECT_GT(new_mean_value, mean_value);
+
+  // Decreasing |mean_value|.
+  new_mean_value = mean_value / 2;
+  mean_value_before = mean_value;
+  WebRtc_MeanEstimatorFix(new_mean_value, 10, &mean_value);
+  EXPECT_GT(mean_value_before, mean_value);
+  EXPECT_LT(new_mean_value, mean_value);
+}
+
+TEST_F(DelayEstimatorTest, ExactDelayEstimateMultipleNearSameSpectrum) {
+  // In this test we verify that we get the correct delay estimates if we shift
+  // the signal accordingly. We create two Binary Delay Estimators and feed them
+  // with the same signals, so they should output the same results.
+  // We verify both causal and non-causal delays.
+  // For these noise free signals, the robust validation should not have an
+  // impact, hence we turn robust validation on/off for both reference and
+  // delayed near end.
+
+  for (size_t i = 0; i < kSizeEnable; ++i) {
+    for (size_t j = 0; j < kSizeEnable; ++j) {
+      RunBinarySpectraTest(0, 0, kEnable[i], kEnable[j]);
+    }
+  }
+}
+
+TEST_F(DelayEstimatorTest, ExactDelayEstimateMultipleNearDifferentSpectrum) {
+  // In this test we use the same setup as above, but we now feed the two Binary
+  // Delay Estimators with different signals, so they should output different
+  // results.
+  // For these noise free signals, the robust validation should not have an
+  // impact, hence we turn robust validation on/off for both reference and
+  // delayed near end.
+
+  const int kNearOffset = 1;
+  for (size_t i = 0; i < kSizeEnable; ++i) {
+    for (size_t j = 0; j < kSizeEnable; ++j) {
+      RunBinarySpectraTest(kNearOffset, 0, kEnable[i], kEnable[j]);
+    }
+  }
+}
+
+TEST_F(DelayEstimatorTest, ExactDelayEstimateMultipleNearDifferentLookahead) {
+  // In this test we use the same setup as above, feeding the two Binary
+  // Delay Estimators with the same signals. The difference is that we create
+  // them with different lookahead.
+  // For these noise free signals, the robust validation should not have an
+  // impact, hence we turn robust validation on/off for both reference and
+  // delayed near end.
+
+  const int kLookaheadOffset = 1;
+  for (size_t i = 0; i < kSizeEnable; ++i) {
+    for (size_t j = 0; j < kSizeEnable; ++j) {
+      RunBinarySpectraTest(0, kLookaheadOffset, kEnable[i], kEnable[j]);
+    }
+  }
+}
+
+TEST_F(DelayEstimatorTest, AllowedOffsetNoImpactWhenRobustValidationDisabled) {
+  // The same setup as in ExactDelayEstimateMultipleNearSameSpectrum with the
+  // difference that |allowed_offset| is set for the reference binary delay
+  // estimator.
+
+  binary_->allowed_offset = 10;
+  RunBinarySpectraTest(0, 0, 0, 0);
+  binary_->allowed_offset = 0;  // Reset reference.
+}
+
+TEST_F(DelayEstimatorTest, VerifyLookaheadAtCreate) {
+  void* farend_handle = WebRtc_CreateDelayEstimatorFarend(kSpectrumSize,
+                                                          kMaxDelay);
+  ASSERT_TRUE(farend_handle != NULL);
+  void* handle = WebRtc_CreateDelayEstimator(farend_handle, kLookahead);
+  ASSERT_TRUE(handle != NULL);
+  EXPECT_EQ(kLookahead, WebRtc_lookahead(handle));
+  WebRtc_FreeDelayEstimator(handle);
+  WebRtc_FreeDelayEstimatorFarend(farend_handle);
+}
+
+TEST_F(DelayEstimatorTest, VerifyLookaheadIsSetAndKeptAfterInit) {
+  EXPECT_EQ(kLookahead, WebRtc_lookahead(handle_));
+  EXPECT_EQ(kDifferentLookahead,
+            WebRtc_set_lookahead(handle_, kDifferentLookahead));
+  EXPECT_EQ(kDifferentLookahead, WebRtc_lookahead(handle_));
+  EXPECT_EQ(0, WebRtc_InitDelayEstimatorFarend(farend_handle_));
+  EXPECT_EQ(kDifferentLookahead, WebRtc_lookahead(handle_));
+  EXPECT_EQ(0, WebRtc_InitDelayEstimator(handle_));
+  EXPECT_EQ(kDifferentLookahead, WebRtc_lookahead(handle_));
+}
+
+TEST_F(DelayEstimatorTest, VerifyHistorySizeAtCreate) {
+  EXPECT_EQ(kHistorySize, WebRtc_history_size(handle_));
+}
+
+TEST_F(DelayEstimatorTest, VerifyHistorySizeIsSetAndKeptAfterInit) {
+  EXPECT_EQ(kHistorySize, WebRtc_history_size(handle_));
+  EXPECT_EQ(kDifferentHistorySize,
+            WebRtc_set_history_size(handle_, kDifferentHistorySize));
+  EXPECT_EQ(kDifferentHistorySize, WebRtc_history_size(handle_));
+  EXPECT_EQ(0, WebRtc_InitDelayEstimator(handle_));
+  EXPECT_EQ(kDifferentHistorySize, WebRtc_history_size(handle_));
+  EXPECT_EQ(0, WebRtc_InitDelayEstimatorFarend(farend_handle_));
+  EXPECT_EQ(kDifferentHistorySize, WebRtc_history_size(handle_));
+}
+
+// TODO(bjornv): Add tests for SoftReset...(...).
+
+}  // namespace
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.cc
new file mode 100644
index 0000000000..f907c80a35
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.cc
@@ -0,0 +1,486 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/utility/delay_estimator_wrapper.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "modules/audio_processing/utility/delay_estimator.h"
+#include "modules/audio_processing/utility/delay_estimator_internal.h"
+#include "rtc_base/checks.h"
+
+// Only bit |kBandFirst| through bit |kBandLast| are processed and
+// |kBandFirst| - |kBandLast| must be < 32.
+enum { kBandFirst = 12 };
+enum { kBandLast = 43 };
+
+static __inline uint32_t SetBit(uint32_t in, int pos) {
+  uint32_t mask = (1 << pos);
+  uint32_t out = (in | mask);
+
+  return out;
+}
+
+// Calculates the mean recursively. Same version as WebRtc_MeanEstimatorFix(),
+// but for float.
+//
+// Inputs:
+//    - new_value             : New additional value.
+//    - scale                 : Scale for smoothing (should be less than 1.0).
+//
+// Input/Output:
+//    - mean_value            : Pointer to the mean value for updating.
+//
+static void MeanEstimatorFloat(float new_value,
+                               float scale,
+                               float* mean_value) {
+  RTC_DCHECK_LT(scale, 1.0f);
+  *mean_value += (new_value - *mean_value) * scale;
+}
+
+// Computes the binary spectrum by comparing the input |spectrum| with a
+// |threshold_spectrum|. Float and fixed point versions.
+//
+// Inputs:
+//      - spectrum            : Spectrum of which the binary spectrum should be
+//                              calculated.
+//      - threshold_spectrum  : Threshold spectrum with which the input
+//                              spectrum is compared.
+// Return:
+//      - out                 : Binary spectrum.
+//
+static uint32_t BinarySpectrumFix(const uint16_t* spectrum,
+                                  SpectrumType* threshold_spectrum,
+                                  int q_domain,
+                                  int* threshold_initialized) {
+  int i = kBandFirst;
+  uint32_t out = 0;
+
+  RTC_DCHECK_LT(q_domain, 16);
+
+  if (!(*threshold_initialized)) {
+    // Set the |threshold_spectrum| to half the input |spectrum| as starting
+    // value. This speeds up the convergence.
+    for (i = kBandFirst; i <= kBandLast; i++) {
+      if (spectrum[i] > 0) {
+        // Convert input spectrum from Q(|q_domain|) to Q15.
+        int32_t spectrum_q15 = ((int32_t) spectrum[i]) << (15 - q_domain);
+        threshold_spectrum[i].int32_ = (spectrum_q15 >> 1);
+        *threshold_initialized = 1;
+      }
+    }
+  }
+  for (i = kBandFirst; i <= kBandLast; i++) {
+    // Convert input spectrum from Q(|q_domain|) to Q15.
+    int32_t spectrum_q15 = ((int32_t) spectrum[i]) << (15 - q_domain);
+    // Update the |threshold_spectrum|.
+    WebRtc_MeanEstimatorFix(spectrum_q15, 6, &(threshold_spectrum[i].int32_));
+    // Convert |spectrum| at current frequency bin to a binary value.
+    if (spectrum_q15 > threshold_spectrum[i].int32_) {
+      out = SetBit(out, i - kBandFirst);
+    }
+  }
+
+  return out;
+}
+
+static uint32_t BinarySpectrumFloat(const float* spectrum,
+                                    SpectrumType* threshold_spectrum,
+                                    int* threshold_initialized) {
+  int i = kBandFirst;
+  uint32_t out = 0;
+  const float kScale = 1 / 64.0;
+
+  if (!(*threshold_initialized)) {
+    // Set the |threshold_spectrum| to half the input |spectrum| as starting
+    // value. This speeds up the convergence.
+    for (i = kBandFirst; i <= kBandLast; i++) {
+      if (spectrum[i] > 0.0f) {
+        threshold_spectrum[i].float_ = (spectrum[i] / 2);
+        *threshold_initialized = 1;
+      }
+    }
+  }
+
+  for (i = kBandFirst; i <= kBandLast; i++) {
+    // Update the |threshold_spectrum|.
+    MeanEstimatorFloat(spectrum[i], kScale, &(threshold_spectrum[i].float_));
+    // Convert |spectrum| at current frequency bin to a binary value.
+    if (spectrum[i] > threshold_spectrum[i].float_) {
+      out = SetBit(out, i - kBandFirst);
+    }
+  }
+
+  return out;
+}
+
+void WebRtc_FreeDelayEstimatorFarend(void* handle) {
+  DelayEstimatorFarend* self = (DelayEstimatorFarend*) handle;
+
+  if (handle == NULL) {
+    return;
+  }
+
+  free(self->mean_far_spectrum);
+  self->mean_far_spectrum = NULL;
+
+  WebRtc_FreeBinaryDelayEstimatorFarend(self->binary_farend);
+  self->binary_farend = NULL;
+
+  free(self);
+}
+
+void* WebRtc_CreateDelayEstimatorFarend(int spectrum_size, int history_size) {
+  DelayEstimatorFarend* self = NULL;
+
+  // Check if the sub band used in the delay estimation is small enough to fit
+  // the binary spectra in a uint32_t.
+  static_assert(kBandLast - kBandFirst < 32, "");
+
+  if (spectrum_size >= kBandLast) {
+    self = static_cast<DelayEstimatorFarend*>(
+        malloc(sizeof(DelayEstimatorFarend)));
+  }
+
+  if (self != NULL) {
+    int memory_fail = 0;
+
+    // Allocate memory for the binary far-end spectrum handling.
+    self->binary_farend = WebRtc_CreateBinaryDelayEstimatorFarend(history_size);
+    memory_fail |= (self->binary_farend == NULL);
+
+    // Allocate memory for spectrum buffers.
+    self->mean_far_spectrum =
+        static_cast<SpectrumType*>(malloc(spectrum_size * sizeof(SpectrumType)));
+    memory_fail |= (self->mean_far_spectrum == NULL);
+
+    self->spectrum_size = spectrum_size;
+
+    if (memory_fail) {
+      WebRtc_FreeDelayEstimatorFarend(self);
+      self = NULL;
+    }
+  }
+
+  return self;
+}
+
+int WebRtc_InitDelayEstimatorFarend(void* handle) {
+  DelayEstimatorFarend* self = (DelayEstimatorFarend*) handle;
+
+  if (self == NULL) {
+    return -1;
+  }
+
+  // Initialize far-end part of binary delay estimator.
+  WebRtc_InitBinaryDelayEstimatorFarend(self->binary_farend);
+
+  // Set averaged far and near end spectra to zero.
+  memset(self->mean_far_spectrum, 0,
+         sizeof(SpectrumType) * self->spectrum_size);
+  // Reset initialization indicators.
+  self->far_spectrum_initialized = 0;
+
+  return 0;
+}
+
+void WebRtc_SoftResetDelayEstimatorFarend(void* handle, int delay_shift) {
+  DelayEstimatorFarend* self = (DelayEstimatorFarend*) handle;
+  RTC_DCHECK(self);
+  WebRtc_SoftResetBinaryDelayEstimatorFarend(self->binary_farend, delay_shift);
+}
+
+int WebRtc_AddFarSpectrumFix(void* handle,
+                             const uint16_t* far_spectrum,
+                             int spectrum_size,
+                             int far_q) {
+  DelayEstimatorFarend* self = (DelayEstimatorFarend*) handle;
+  uint32_t binary_spectrum = 0;
+
+  if (self == NULL) {
+    return -1;
+  }
+  if (far_spectrum == NULL) {
+    // Empty far end spectrum.
+    return -1;
+  }
+  if (spectrum_size != self->spectrum_size) {
+    // Data sizes don't match.
+    return -1;
+  }
+  if (far_q > 15) {
+    // If |far_q| is larger than 15 we cannot guarantee no wrap around.
+    return -1;
+  }
+
+  // Get binary spectrum.
+  binary_spectrum = BinarySpectrumFix(far_spectrum, self->mean_far_spectrum,
+                                      far_q, &(self->far_spectrum_initialized));
+  WebRtc_AddBinaryFarSpectrum(self->binary_farend, binary_spectrum);
+
+  return 0;
+}
+
+int WebRtc_AddFarSpectrumFloat(void* handle,
+                               const float* far_spectrum,
+                               int spectrum_size) {
+  DelayEstimatorFarend* self = (DelayEstimatorFarend*) handle;
+  uint32_t binary_spectrum = 0;
+
+  if (self == NULL) {
+    return -1;
+  }
+  if (far_spectrum == NULL) {
+    // Empty far end spectrum.
+    return -1;
+  }
+  if (spectrum_size != self->spectrum_size) {
+    // Data sizes don't match.
+    return -1;
+  }
+
+  // Get binary spectrum.
+  binary_spectrum = BinarySpectrumFloat(far_spectrum, self->mean_far_spectrum,
+                                        &(self->far_spectrum_initialized));
+  WebRtc_AddBinaryFarSpectrum(self->binary_farend, binary_spectrum);
+
+  return 0;
+}
+
+void WebRtc_FreeDelayEstimator(void* handle) {
+  DelayEstimator* self = (DelayEstimator*) handle;
+
+  if (handle == NULL) {
+    return;
+  }
+
+  free(self->mean_near_spectrum);
+  self->mean_near_spectrum = NULL;
+
+  WebRtc_FreeBinaryDelayEstimator(self->binary_handle);
+  self->binary_handle = NULL;
+
+  free(self);
+}
+
+void* WebRtc_CreateDelayEstimator(void* farend_handle, int max_lookahead) {
+  DelayEstimator* self = NULL;
+  DelayEstimatorFarend* farend = (DelayEstimatorFarend*) farend_handle;
+
+  if (farend_handle != NULL) {
+    self = static_cast<DelayEstimator*>(malloc(sizeof(DelayEstimator)));
+  }
+
+  if (self != NULL) {
+    int memory_fail = 0;
+
+    // Allocate memory for the farend spectrum handling.
+    self->binary_handle =
+        WebRtc_CreateBinaryDelayEstimator(farend->binary_farend, max_lookahead);
+    memory_fail |= (self->binary_handle == NULL);
+
+    // Allocate memory for spectrum buffers.
+    self->mean_near_spectrum = static_cast<SpectrumType*>(
+        malloc(farend->spectrum_size * sizeof(SpectrumType)));
+    memory_fail |= (self->mean_near_spectrum == NULL);
+
+    self->spectrum_size = farend->spectrum_size;
+
+    if (memory_fail) {
+      WebRtc_FreeDelayEstimator(self);
+      self = NULL;
+    }
+  }
+
+  return self;
+}
+
+int WebRtc_InitDelayEstimator(void* handle) {
+  DelayEstimator* self = (DelayEstimator*) handle;
+
+  if (self == NULL) {
+    return -1;
+  }
+
+  // Initialize binary delay estimator.
+  WebRtc_InitBinaryDelayEstimator(self->binary_handle);
+
+  // Set averaged far and near end spectra to zero.
+  memset(self->mean_near_spectrum, 0,
+         sizeof(SpectrumType) * self->spectrum_size);
+  // Reset initialization indicators.
+  self->near_spectrum_initialized = 0;
+
+  return 0;
+}
+
+int WebRtc_SoftResetDelayEstimator(void* handle, int delay_shift) {
+  DelayEstimator* self = (DelayEstimator*) handle;
+  RTC_DCHECK(self);
+  return WebRtc_SoftResetBinaryDelayEstimator(self->binary_handle, delay_shift);
+}
+
+int WebRtc_set_history_size(void* handle, int history_size) {
+  DelayEstimator* self = static_cast<DelayEstimator*>(handle);
+
+  if ((self == NULL) || (history_size <= 1)) {
+    return -1;
+  }
+  return WebRtc_AllocateHistoryBufferMemory(self->binary_handle, history_size);
+}
+
+int WebRtc_history_size(const void* handle) {
+  const DelayEstimator* self = static_cast<const DelayEstimator*>(handle);
+
+  if (self == NULL) {
+    return -1;
+  }
+  if (self->binary_handle->farend->history_size !=
+      self->binary_handle->history_size) {
+    // Non matching history sizes.
+    return -1;
+  }
+  return self->binary_handle->history_size;
+}
+
+int WebRtc_set_lookahead(void* handle, int lookahead) {
+  DelayEstimator* self = (DelayEstimator*) handle;
+  RTC_DCHECK(self);
+  RTC_DCHECK(self->binary_handle);
+  if ((lookahead > self->binary_handle->near_history_size - 1) ||
+      (lookahead < 0)) {
+    return -1;
+  }
+  self->binary_handle->lookahead = lookahead;
+  return self->binary_handle->lookahead;
+}
+
+int WebRtc_lookahead(void* handle) {
+  DelayEstimator* self = (DelayEstimator*) handle;
+  RTC_DCHECK(self);
+  RTC_DCHECK(self->binary_handle);
+  return self->binary_handle->lookahead;
+}
+
+int WebRtc_set_allowed_offset(void* handle, int allowed_offset) {
+  DelayEstimator* self = (DelayEstimator*) handle;
+
+  if ((self == NULL) || (allowed_offset < 0)) {
+    return -1;
+  }
+  self->binary_handle->allowed_offset = allowed_offset;
+  return 0;
+}
+
+int WebRtc_get_allowed_offset(const void* handle) {
+  const DelayEstimator* self = (const DelayEstimator*) handle;
+
+  if (self == NULL) {
+    return -1;
+  }
+  return self->binary_handle->allowed_offset;
+}
+
+int WebRtc_enable_robust_validation(void* handle, int enable) {
+  DelayEstimator* self = (DelayEstimator*) handle;
+
+  if (self == NULL) {
+    return -1;
+  }
+  if ((enable < 0) || (enable > 1)) {
+    return -1;
+  }
+  RTC_DCHECK(self->binary_handle);
+  self->binary_handle->robust_validation_enabled = enable;
+  return 0;
+}
+
+int WebRtc_is_robust_validation_enabled(const void* handle) {
+  const DelayEstimator* self = (const DelayEstimator*) handle;
+
+  if (self == NULL) {
+    return -1;
+  }
+  return self->binary_handle->robust_validation_enabled;
+}
+
+int WebRtc_DelayEstimatorProcessFix(void* handle,
+                                    const uint16_t* near_spectrum,
+                                    int spectrum_size,
+                                    int near_q) {
+  DelayEstimator* self = (DelayEstimator*) handle;
+  uint32_t binary_spectrum = 0;
+
+  if (self == NULL) {
+    return -1;
+  }
+  if (near_spectrum == NULL) {
+    // Empty near end spectrum.
+    return -1;
+  }
+  if (spectrum_size != self->spectrum_size) {
+    // Data sizes don't match.
+    return -1;
+  }
+  if (near_q > 15) {
+    // If |near_q| is larger than 15 we cannot guarantee no wrap around.
+    return -1;
+  }
+
+  // Get binary spectra.
+  binary_spectrum = BinarySpectrumFix(near_spectrum,
+                                      self->mean_near_spectrum,
+                                      near_q,
+                                      &(self->near_spectrum_initialized));
+
+  return WebRtc_ProcessBinarySpectrum(self->binary_handle, binary_spectrum);
+}
+
+int WebRtc_DelayEstimatorProcessFloat(void* handle,
+                                      const float* near_spectrum,
+                                      int spectrum_size) {
+  DelayEstimator* self = (DelayEstimator*) handle;
+  uint32_t binary_spectrum = 0;
+
+  if (self == NULL) {
+    return -1;
+  }
+  if (near_spectrum == NULL) {
+    // Empty near end spectrum.
+    return -1;
+  }
+  if (spectrum_size != self->spectrum_size) {
+    // Data sizes don't match.
+    return -1;
+  }
+
+  // Get binary spectrum.
+  binary_spectrum = BinarySpectrumFloat(near_spectrum, self->mean_near_spectrum,
+                                        &(self->near_spectrum_initialized));
+
+  return WebRtc_ProcessBinarySpectrum(self->binary_handle, binary_spectrum);
+}
+
+int WebRtc_last_delay(void* handle) {
+  DelayEstimator* self = (DelayEstimator*) handle;
+
+  if (self == NULL) {
+    return -1;
+  }
+
+  return WebRtc_binary_last_delay(self->binary_handle);
+}
+
+float WebRtc_last_delay_quality(void* handle) {
+  DelayEstimator* self = (DelayEstimator*) handle;
+  RTC_DCHECK(self);
+  return WebRtc_binary_last_delay_quality(self->binary_handle);
+}
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h
new file mode 100644
index 0000000000..6b6e51f82c
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h
@@ -0,0 +1,244 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Performs delay estimation on block by block basis.
+// The return value is  0 - OK and -1 - Error, unless otherwise stated.
+
+#ifndef MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_WRAPPER_H_
+#define MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_WRAPPER_H_
+
+#include "typedefs.h"  // NOLINT(build/include)
+
+// Releases the memory allocated by WebRtc_CreateDelayEstimatorFarend(...)
+void WebRtc_FreeDelayEstimatorFarend(void* handle);
+
+// Allocates the memory needed by the far-end part of the delay estimation. The
+// memory needs to be initialized separately through
+// WebRtc_InitDelayEstimatorFarend(...).
+//
+// Inputs:
+//  - spectrum_size     : Size of the spectrum used both in far-end and
+//                        near-end. Used to allocate memory for spectrum
+//                        specific buffers.
+//  - history_size      : The far-end history buffer size. A change in buffer
+//                        size can be forced with WebRtc_set_history_size().
+//                        Note that the maximum delay which can be estimated is
+//                        determined together with WebRtc_set_lookahead().
+//
+// Return value:
+//  - void*             : Created |handle|. If the memory can't be allocated or
+//                        if any of the input parameters are invalid NULL is
+//                        returned.
+void* WebRtc_CreateDelayEstimatorFarend(int spectrum_size, int history_size);
+
+// Initializes the far-end part of the delay estimation instance returned by
+// WebRtc_CreateDelayEstimatorFarend(...)
+int WebRtc_InitDelayEstimatorFarend(void* handle);
+
+// Soft resets the far-end part of the delay estimation instance returned by
+// WebRtc_CreateDelayEstimatorFarend(...).
+// Input:
+//      - delay_shift   : The amount of blocks to shift history buffers.
+void WebRtc_SoftResetDelayEstimatorFarend(void* handle, int delay_shift);
+
+// Adds the far-end spectrum to the far-end history buffer. This spectrum is
+// used as reference when calculating the delay using
+// WebRtc_ProcessSpectrum().
+//
+// Inputs:
+//    - far_spectrum    : Far-end spectrum.
+//    - spectrum_size   : The size of the data arrays (same for both far- and
+//                        near-end).
+//    - far_q           : The Q-domain of the far-end data.
+//
+// Output:
+//    - handle          : Updated far-end instance.
+//
+int WebRtc_AddFarSpectrumFix(void* handle,
+                             const uint16_t* far_spectrum,
+                             int spectrum_size,
+                             int far_q);
+
+// See WebRtc_AddFarSpectrumFix() for description.
+int WebRtc_AddFarSpectrumFloat(void* handle,
+                               const float* far_spectrum,
+                               int spectrum_size);
+
+// Releases the memory allocated by WebRtc_CreateDelayEstimator(...)
+void WebRtc_FreeDelayEstimator(void* handle);
+
+// Allocates the memory needed by the delay estimation. The memory needs to be
+// initialized separately through WebRtc_InitDelayEstimator(...).
+//
+// Inputs:
+//      - farend_handle : Pointer to the far-end part of the delay estimation
+//                        instance created prior to this call using
+//                        WebRtc_CreateDelayEstimatorFarend().
+//
+//                        Note that WebRtc_CreateDelayEstimator does not take
+//                        ownership of |farend_handle|, which has to be torn
+//                        down properly after this instance.
+//
+//      - max_lookahead : Maximum amount of non-causal lookahead allowed. The
+//                        actual amount of lookahead used can be controlled by
+//                        WebRtc_set_lookahead(...). The default |lookahead| is
+//                        set to |max_lookahead| at create time. Use
+//                        WebRtc_set_lookahead(...) before start if a different
+//                        value is desired.
+//
+//                        Using lookahead can detect cases in which a near-end
+//                        signal occurs before the corresponding far-end signal.
+//                        It will delay the estimate for the current block by an
+//                        equal amount, and the returned values will be offset
+//                        by it.
+//
+//                        A value of zero is the typical no-lookahead case.
+//                        This also represents the minimum delay which can be
+//                        estimated.
+//
+//                        Note that the effective range of delay estimates is
+//                        [-|lookahead|,... ,|history_size|-|lookahead|)
+//                        where |history_size| is set through
+//                        WebRtc_set_history_size().
+//
+// Return value:
+//      - void*         : Created |handle|. If the memory can't be allocated or
+//                        if any of the input parameters are invalid NULL is
+//                        returned.
+void* WebRtc_CreateDelayEstimator(void* farend_handle, int max_lookahead);
+
+// Initializes the delay estimation instance returned by
+// WebRtc_CreateDelayEstimator(...)
+int WebRtc_InitDelayEstimator(void* handle);
+
+// Soft resets the delay estimation instance returned by
+// WebRtc_CreateDelayEstimator(...)
+// Input:
+//      - delay_shift   : The amount of blocks to shift history buffers.
+//
+// Return value:
+//      - actual_shifts : The actual number of shifts performed.
+int WebRtc_SoftResetDelayEstimator(void* handle, int delay_shift);
+
+// Sets the effective |history_size| used. Valid values from 2. We simply need
+// at least two delays to compare to perform an estimate. If |history_size| is
+// changed, buffers are reallocated filling in with zeros if necessary.
+// Note that changing the |history_size| affects both buffers in far-end and
+// near-end. Hence it is important to change all DelayEstimators that use the
+// same reference far-end, to the same |history_size| value.
+// Inputs:
+//  - handle            : Pointer to the delay estimation instance.
+//  - history_size      : Effective history size to be used.
+// Return value:
+//  - new_history_size  : The new history size used. If the memory was not able
+//                        to be allocated 0 is returned.
+int WebRtc_set_history_size(void* handle, int history_size);
+
+// Returns the history_size currently used.
+// Input:
+//      - handle        : Pointer to the delay estimation instance.
+int WebRtc_history_size(const void* handle);
+
+// Sets the amount of |lookahead| to use. Valid values are [0, max_lookahead]
+// where |max_lookahead| was set at create time through
+// WebRtc_CreateDelayEstimator(...).
+//
+// Input:
+//      - handle        : Pointer to the delay estimation instance.
+//      - lookahead     : The amount of lookahead to be used.
+//
+// Return value:
+//      - new_lookahead : The actual amount of lookahead set, unless |handle| is
+//                        a NULL pointer or |lookahead| is invalid, for which an
+//                        error is returned.
+int WebRtc_set_lookahead(void* handle, int lookahead);
+
+// Returns the amount of lookahead we currently use.
+// Input:
+//      - handle        : Pointer to the delay estimation instance.
+int WebRtc_lookahead(void* handle);
+
+// Sets the |allowed_offset| used in the robust validation scheme.  If the
+// delay estimator is used in an echo control component, this parameter is
+// related to the filter length.  In principle |allowed_offset| should be set to
+// the echo control filter length minus the expected echo duration, i.e., the
+// delay offset the echo control can handle without quality regression.  The
+// default value, used if not set manually, is zero.  Note that |allowed_offset|
+// has to be non-negative.
+// Inputs:
+//  - handle            : Pointer to the delay estimation instance.
+//  - allowed_offset    : The amount of delay offset, measured in partitions,
+//                        the echo control filter can handle.
+int WebRtc_set_allowed_offset(void* handle, int allowed_offset);
+
+// Returns the |allowed_offset| in number of partitions.
+int WebRtc_get_allowed_offset(const void* handle);
+
+// Enables/Disables a robust validation functionality in the delay estimation.
+// This is by default set to disabled at create time.  The state is preserved
+// over a reset.
+// Inputs:
+//      - handle        : Pointer to the delay estimation instance.
+//      - enable        : Enable (1) or disable (0) this feature.
+int WebRtc_enable_robust_validation(void* handle, int enable);
+
+// Returns 1 if robust validation is enabled and 0 if disabled.
+int WebRtc_is_robust_validation_enabled(const void* handle);
+
+// Estimates and returns the delay between the far-end and near-end blocks. The
+// value will be offset by the lookahead (i.e. the lookahead should be
+// subtracted from the returned value).
+// Inputs:
+//      - handle        : Pointer to the delay estimation instance.
+//      - near_spectrum : Pointer to the near-end spectrum data of the current
+//                        block.
+//      - spectrum_size : The size of the data arrays (same for both far- and
+//                        near-end).
+//      - near_q        : The Q-domain of the near-end data.
+//
+// Output:
+//      - handle        : Updated instance.
+//
+// Return value:
+//      - delay         :  >= 0 - Calculated delay value.
+//                        -1    - Error.
+//                        -2    - Insufficient data for estimation.
+int WebRtc_DelayEstimatorProcessFix(void* handle,
+                                    const uint16_t* near_spectrum,
+                                    int spectrum_size,
+                                    int near_q);
+
+// See WebRtc_DelayEstimatorProcessFix() for description.
+int WebRtc_DelayEstimatorProcessFloat(void* handle,
+                                      const float* near_spectrum,
+                                      int spectrum_size);
+
+// Returns the last calculated delay updated by the function
+// WebRtc_DelayEstimatorProcess(...).
+//
+// Input:
+//      - handle        : Pointer to the delay estimation instance.
+//
+// Return value:
+//      - delay         : >= 0  - Last calculated delay value.
+//                        -1    - Error.
+//                        -2    - Insufficient data for estimation.
+int WebRtc_last_delay(void* handle);
+
+// Returns the estimation quality/probability of the last calculated delay
+// updated by the function WebRtc_DelayEstimatorProcess(...). The estimation
+// quality is a value in the interval [0, 1]. The higher the value, the better
+// the quality.
+//
+// Return value:
+//      - delay_quality : >= 0  - Estimation quality of last calculated delay.
+float WebRtc_last_delay_quality(void* handle);
+
+#endif  // MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_WRAPPER_H_
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.cc
new file mode 100644
index 0000000000..59631117a2
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.cc
@@ -0,0 +1,543 @@
+/*
+ * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
+ * Copyright Takuya OOURA, 1996-2001
+ *
+ * You may use, copy, modify and distribute this code for any purpose (include
+ * commercial use) and without fee. Please refer to this package when you modify
+ * this code.
+ *
+ * Changes by the WebRTC authors:
+ *    - Trivial type modifications.
+ *    - Minimal code subset to do rdft of length 128.
+ *    - Optimizations because of known length.
+ *    - Removed the global variables by moving the code in to a class in order
+ *      to make it thread safe.
+ *
+ *  All changes are covered by the WebRTC license and IP grant:
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing//utility/ooura_fft.h"
+
+#include <math.h>
+
+#include "modules/audio_processing/utility/ooura_fft_tables_common.h"
+#include "system_wrappers/include/cpu_features_wrapper.h"
+#include "typedefs.h"  // NOLINT(build/include)
+
+namespace webrtc {
+
+namespace {
+
+#if !(defined(MIPS_FPU_LE) || defined(WEBRTC_HAS_NEON))
+static void cft1st_128_C(float* a) {
+  const int n = 128;
+  int j, k1, k2;
+  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  // The processing of the first set of elements was simplified in C to avoid
+  // some operations (multiplication by zero or one, addition of two elements
+  // multiplied by the same weight, ...).
+  x0r = a[0] + a[2];
+  x0i = a[1] + a[3];
+  x1r = a[0] - a[2];
+  x1i = a[1] - a[3];
+  x2r = a[4] + a[6];
+  x2i = a[5] + a[7];
+  x3r = a[4] - a[6];
+  x3i = a[5] - a[7];
+  a[0] = x0r + x2r;
+  a[1] = x0i + x2i;
+  a[4] = x0r - x2r;
+  a[5] = x0i - x2i;
+  a[2] = x1r - x3i;
+  a[3] = x1i + x3r;
+  a[6] = x1r + x3i;
+  a[7] = x1i - x3r;
+  wk1r = rdft_w[2];
+  x0r = a[8] + a[10];
+  x0i = a[9] + a[11];
+  x1r = a[8] - a[10];
+  x1i = a[9] - a[11];
+  x2r = a[12] + a[14];
+  x2i = a[13] + a[15];
+  x3r = a[12] - a[14];
+  x3i = a[13] - a[15];
+  a[8] = x0r + x2r;
+  a[9] = x0i + x2i;
+  a[12] = x2i - x0i;
+  a[13] = x0r - x2r;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  a[10] = wk1r * (x0r - x0i);
+  a[11] = wk1r * (x0r + x0i);
+  x0r = x3i + x1r;
+  x0i = x3r - x1i;
+  a[14] = wk1r * (x0i - x0r);
+  a[15] = wk1r * (x0i + x0r);
+  k1 = 0;
+  for (j = 16; j < n; j += 16) {
+    k1 += 2;
+    k2 = 2 * k1;
+    wk2r = rdft_w[k1 + 0];
+    wk2i = rdft_w[k1 + 1];
+    wk1r = rdft_w[k2 + 0];
+    wk1i = rdft_w[k2 + 1];
+    wk3r = rdft_wk3ri_first[k1 + 0];
+    wk3i = rdft_wk3ri_first[k1 + 1];
+    x0r = a[j + 0] + a[j + 2];
+    x0i = a[j + 1] + a[j + 3];
+    x1r = a[j + 0] - a[j + 2];
+    x1i = a[j + 1] - a[j + 3];
+    x2r = a[j + 4] + a[j + 6];
+    x2i = a[j + 5] + a[j + 7];
+    x3r = a[j + 4] - a[j + 6];
+    x3i = a[j + 5] - a[j + 7];
+    a[j + 0] = x0r + x2r;
+    a[j + 1] = x0i + x2i;
+    x0r -= x2r;
+    x0i -= x2i;
+    a[j + 4] = wk2r * x0r - wk2i * x0i;
+    a[j + 5] = wk2r * x0i + wk2i * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j + 2] = wk1r * x0r - wk1i * x0i;
+    a[j + 3] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j + 6] = wk3r * x0r - wk3i * x0i;
+    a[j + 7] = wk3r * x0i + wk3i * x0r;
+    wk1r = rdft_w[k2 + 2];
+    wk1i = rdft_w[k2 + 3];
+    wk3r = rdft_wk3ri_second[k1 + 0];
+    wk3i = rdft_wk3ri_second[k1 + 1];
+    x0r = a[j + 8] + a[j + 10];
+    x0i = a[j + 9] + a[j + 11];
+    x1r = a[j + 8] - a[j + 10];
+    x1i = a[j + 9] - a[j + 11];
+    x2r = a[j + 12] + a[j + 14];
+    x2i = a[j + 13] + a[j + 15];
+    x3r = a[j + 12] - a[j + 14];
+    x3i = a[j + 13] - a[j + 15];
+    a[j + 8] = x0r + x2r;
+    a[j + 9] = x0i + x2i;
+    x0r -= x2r;
+    x0i -= x2i;
+    a[j + 12] = -wk2i * x0r - wk2r * x0i;
+    a[j + 13] = -wk2i * x0i + wk2r * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j + 10] = wk1r * x0r - wk1i * x0i;
+    a[j + 11] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j + 14] = wk3r * x0r - wk3i * x0i;
+    a[j + 15] = wk3r * x0i + wk3i * x0r;
+  }
+}
+
+static void cftmdl_128_C(float* a) {
+  const int l = 8;
+  const int n = 128;
+  const int m = 32;
+  int j0, j1, j2, j3, k, k1, k2, m2;
+  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  for (j0 = 0; j0 < l; j0 += 2) {
+    j1 = j0 + 8;
+    j2 = j0 + 16;
+    j3 = j0 + 24;
+    x0r = a[j0 + 0] + a[j1 + 0];
+    x0i = a[j0 + 1] + a[j1 + 1];
+    x1r = a[j0 + 0] - a[j1 + 0];
+    x1i = a[j0 + 1] - a[j1 + 1];
+    x2r = a[j2 + 0] + a[j3 + 0];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2 + 0] - a[j3 + 0];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j0 + 0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j2 + 0] = x0r - x2r;
+    a[j2 + 1] = x0i - x2i;
+    a[j1 + 0] = x1r - x3i;
+    a[j1 + 1] = x1i + x3r;
+    a[j3 + 0] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+  }
+  wk1r = rdft_w[2];
+  for (j0 = m; j0 < l + m; j0 += 2) {
+    j1 = j0 + 8;
+    j2 = j0 + 16;
+    j3 = j0 + 24;
+    x0r = a[j0 + 0] + a[j1 + 0];
+    x0i = a[j0 + 1] + a[j1 + 1];
+    x1r = a[j0 + 0] - a[j1 + 0];
+    x1i = a[j0 + 1] - a[j1 + 1];
+    x2r = a[j2 + 0] + a[j3 + 0];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2 + 0] - a[j3 + 0];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j0 + 0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j2 + 0] = x2i - x0i;
+    a[j2 + 1] = x0r - x2r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j1 + 0] = wk1r * (x0r - x0i);
+    a[j1 + 1] = wk1r * (x0r + x0i);
+    x0r = x3i + x1r;
+    x0i = x3r - x1i;
+    a[j3 + 0] = wk1r * (x0i - x0r);
+    a[j3 + 1] = wk1r * (x0i + x0r);
+  }
+  k1 = 0;
+  m2 = 2 * m;
+  for (k = m2; k < n; k += m2) {
+    k1 += 2;
+    k2 = 2 * k1;
+    wk2r = rdft_w[k1 + 0];
+    wk2i = rdft_w[k1 + 1];
+    wk1r = rdft_w[k2 + 0];
+    wk1i = rdft_w[k2 + 1];
+    wk3r = rdft_wk3ri_first[k1 + 0];
+    wk3i = rdft_wk3ri_first[k1 + 1];
+    for (j0 = k; j0 < l + k; j0 += 2) {
+      j1 = j0 + 8;
+      j2 = j0 + 16;
+      j3 = j0 + 24;
+      x0r = a[j0 + 0] + a[j1 + 0];
+      x0i = a[j0 + 1] + a[j1 + 1];
+      x1r = a[j0 + 0] - a[j1 + 0];
+      x1i = a[j0 + 1] - a[j1 + 1];
+      x2r = a[j2 + 0] + a[j3 + 0];
+      x2i = a[j2 + 1] + a[j3 + 1];
+      x3r = a[j2 + 0] - a[j3 + 0];
+      x3i = a[j2 + 1] - a[j3 + 1];
+      a[j0 + 0] = x0r + x2r;
+      a[j0 + 1] = x0i + x2i;
+      x0r -= x2r;
+      x0i -= x2i;
+      a[j2 + 0] = wk2r * x0r - wk2i * x0i;
+      a[j2 + 1] = wk2r * x0i + wk2i * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
+      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
+      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+    }
+    wk1r = rdft_w[k2 + 2];
+    wk1i = rdft_w[k2 + 3];
+    wk3r = rdft_wk3ri_second[k1 + 0];
+    wk3i = rdft_wk3ri_second[k1 + 1];
+    for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
+      j1 = j0 + 8;
+      j2 = j0 + 16;
+      j3 = j0 + 24;
+      x0r = a[j0 + 0] + a[j1 + 0];
+      x0i = a[j0 + 1] + a[j1 + 1];
+      x1r = a[j0 + 0] - a[j1 + 0];
+      x1i = a[j0 + 1] - a[j1 + 1];
+      x2r = a[j2 + 0] + a[j3 + 0];
+      x2i = a[j2 + 1] + a[j3 + 1];
+      x3r = a[j2 + 0] - a[j3 + 0];
+      x3i = a[j2 + 1] - a[j3 + 1];
+      a[j0 + 0] = x0r + x2r;
+      a[j0 + 1] = x0i + x2i;
+      x0r -= x2r;
+      x0i -= x2i;
+      a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
+      a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
+      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
+      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+    }
+  }
+}
+
+static void rftfsub_128_C(float* a) {
+  const float* c = rdft_w + 32;
+  int j1, j2, k1, k2;
+  float wkr, wki, xr, xi, yr, yi;
+
+  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
+    k2 = 128 - j2;
+    k1 = 32 - j1;
+    wkr = 0.5f - c[k1];
+    wki = c[j1];
+    xr = a[j2 + 0] - a[k2 + 0];
+    xi = a[j2 + 1] + a[k2 + 1];
+    yr = wkr * xr - wki * xi;
+    yi = wkr * xi + wki * xr;
+    a[j2 + 0] -= yr;
+    a[j2 + 1] -= yi;
+    a[k2 + 0] += yr;
+    a[k2 + 1] -= yi;
+  }
+}
+
+static void rftbsub_128_C(float* a) {
+  const float* c = rdft_w + 32;
+  int j1, j2, k1, k2;
+  float wkr, wki, xr, xi, yr, yi;
+
+  a[1] = -a[1];
+  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
+    k2 = 128 - j2;
+    k1 = 32 - j1;
+    wkr = 0.5f - c[k1];
+    wki = c[j1];
+    xr = a[j2 + 0] - a[k2 + 0];
+    xi = a[j2 + 1] + a[k2 + 1];
+    yr = wkr * xr + wki * xi;
+    yi = wkr * xi - wki * xr;
+    a[j2 + 0] = a[j2 + 0] - yr;
+    a[j2 + 1] = yi - a[j2 + 1];
+    a[k2 + 0] = yr + a[k2 + 0];
+    a[k2 + 1] = yi - a[k2 + 1];
+  }
+  a[65] = -a[65];
+}
+#endif
+
+
+}  // namespace
+
+OouraFft::OouraFft() {
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+  use_sse2_ = (WebRtc_GetCPUInfo(kSSE2) != 0);
+#else
+  use_sse2_ = false;
+#endif
+}
+
+OouraFft::~OouraFft() = default;
+
+void OouraFft::Fft(float* a) const {
+  float xi;
+  bitrv2_128(a);
+  cftfsub_128(a);
+  rftfsub_128(a);
+  xi = a[0] - a[1];
+  a[0] += a[1];
+  a[1] = xi;
+}
+void OouraFft::InverseFft(float* a) const {
+  a[1] = 0.5f * (a[0] - a[1]);
+  a[0] -= a[1];
+  rftbsub_128(a);
+  bitrv2_128(a);
+  cftbsub_128(a);
+}
+
+void OouraFft::cft1st_128(float* a) const {
+#if defined(MIPS_FPU_LE)
+  cft1st_128_mips(a);
+#elif defined(WEBRTC_HAS_NEON)
+  cft1st_128_neon(a);
+#elif defined(WEBRTC_ARCH_X86_FAMILY)
+  if (use_sse2_) {
+    cft1st_128_SSE2(a);
+  } else {
+    cft1st_128_C(a);
+  }
+#else
+  cft1st_128_C(a);
+#endif
+}
+void OouraFft::cftmdl_128(float* a) const {
+#if defined(MIPS_FPU_LE)
+  cftmdl_128_mips(a);
+#elif defined(WEBRTC_HAS_NEON)
+  cftmdl_128_neon(a);
+#elif defined(WEBRTC_ARCH_X86_FAMILY)
+  if (use_sse2_) {
+    cftmdl_128_SSE2(a);
+  } else {
+    cftmdl_128_C(a);
+  }
+#else
+  cftmdl_128_C(a);
+#endif
+}
+void OouraFft::rftfsub_128(float* a) const {
+#if defined(MIPS_FPU_LE)
+  rftfsub_128_mips(a);
+#elif defined(WEBRTC_HAS_NEON)
+  rftfsub_128_neon(a);
+#elif defined(WEBRTC_ARCH_X86_FAMILY)
+  if (use_sse2_) {
+    rftfsub_128_SSE2(a);
+  } else {
+    rftfsub_128_C(a);
+  }
+#else
+  rftfsub_128_C(a);
+#endif
+}
+
+void OouraFft::rftbsub_128(float* a) const {
+#if defined(MIPS_FPU_LE)
+  rftbsub_128_mips(a);
+#elif defined(WEBRTC_HAS_NEON)
+  rftbsub_128_neon(a);
+#elif defined(WEBRTC_ARCH_X86_FAMILY)
+  if (use_sse2_) {
+    rftbsub_128_SSE2(a);
+  } else {
+    rftbsub_128_C(a);
+  }
+#else
+  rftbsub_128_C(a);
+#endif
+}
+
+void OouraFft::cftbsub_128(float* a) const {
+  int j, j1, j2, j3, l;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  cft1st_128(a);
+  cftmdl_128(a);
+  l = 32;
+
+  for (j = 0; j < l; j += 2) {
+    j1 = j + l;
+    j2 = j1 + l;
+    j3 = j2 + l;
+    x0r = a[j] + a[j1];
+    x0i = -a[j + 1] - a[j1 + 1];
+    x1r = a[j] - a[j1];
+    x1i = -a[j + 1] + a[j1 + 1];
+    x2r = a[j2] + a[j3];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2] - a[j3];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i - x2i;
+    a[j2] = x0r - x2r;
+    a[j2 + 1] = x0i + x2i;
+    a[j1] = x1r - x3i;
+    a[j1 + 1] = x1i - x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i + x3r;
+  }
+}
+
+void OouraFft::cftfsub_128(float* a) const {
+  int j, j1, j2, j3, l;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  cft1st_128(a);
+  cftmdl_128(a);
+  l = 32;
+  for (j = 0; j < l; j += 2) {
+    j1 = j + l;
+    j2 = j1 + l;
+    j3 = j2 + l;
+    x0r = a[j] + a[j1];
+    x0i = a[j + 1] + a[j1 + 1];
+    x1r = a[j] - a[j1];
+    x1i = a[j + 1] - a[j1 + 1];
+    x2r = a[j2] + a[j3];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2] - a[j3];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i + x2i;
+    a[j2] = x0r - x2r;
+    a[j2 + 1] = x0i - x2i;
+    a[j1] = x1r - x3i;
+    a[j1 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+  }
+}
+
+void OouraFft::bitrv2_128(float* a) const {
+  /*
+      Following things have been attempted but are no faster:
+      (a) Storing the swap indexes in a LUT (index calculations are done
+          for 'free' while waiting on memory/L1).
+      (b) Consolidate the load/store of two consecutive floats by a 64 bit
+          integer (execution is memory/L1 bound).
+      (c) Do a mix of floats and 64 bit integer to maximize register
+          utilization (execution is memory/L1 bound).
+      (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
+      (e) Hard-coding of the offsets to completely eliminates index
+          calculations.
+  */
+
+  unsigned int j, j1, k, k1;
+  float xr, xi, yr, yi;
+
+  const int ip[4] = {0, 64, 32, 96};
+  for (k = 0; k < 4; k++) {
+    for (j = 0; j < k; j++) {
+      j1 = 2 * j + ip[k];
+      k1 = 2 * k + ip[j];
+      xr = a[j1 + 0];
+      xi = a[j1 + 1];
+      yr = a[k1 + 0];
+      yi = a[k1 + 1];
+      a[j1 + 0] = yr;
+      a[j1 + 1] = yi;
+      a[k1 + 0] = xr;
+      a[k1 + 1] = xi;
+      j1 += 8;
+      k1 += 16;
+      xr = a[j1 + 0];
+      xi = a[j1 + 1];
+      yr = a[k1 + 0];
+      yi = a[k1 + 1];
+      a[j1 + 0] = yr;
+      a[j1 + 1] = yi;
+      a[k1 + 0] = xr;
+      a[k1 + 1] = xi;
+      j1 += 8;
+      k1 -= 8;
+      xr = a[j1 + 0];
+      xi = a[j1 + 1];
+      yr = a[k1 + 0];
+      yi = a[k1 + 1];
+      a[j1 + 0] = yr;
+      a[j1 + 1] = yi;
+      a[k1 + 0] = xr;
+      a[k1 + 1] = xi;
+      j1 += 8;
+      k1 += 16;
+      xr = a[j1 + 0];
+      xi = a[j1 + 1];
+      yr = a[k1 + 0];
+      yi = a[k1 + 1];
+      a[j1 + 0] = yr;
+      a[j1 + 1] = yi;
+      a[k1 + 0] = xr;
+      a[k1 + 1] = xi;
+    }
+    j1 = 2 * k + 8 + ip[k];
+    k1 = j1 + 8;
+    xr = a[j1 + 0];
+    xi = a[j1 + 1];
+    yr = a[k1 + 0];
+    yi = a[k1 + 1];
+    a[j1 + 0] = yr;
+    a[j1 + 1] = yi;
+    a[k1 + 0] = xr;
+    a[k1 + 1] = xi;
+  }
+}
+
+}  // namespace webrtc
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.h
new file mode 100644
index 0000000000..96d57dc908
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_
+#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_
+
+#include "typedefs.h"  // NOLINT(build/include)
+
+namespace webrtc {
+
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+void cft1st_128_SSE2(float* a);
+void cftmdl_128_SSE2(float* a);
+void rftfsub_128_SSE2(float* a);
+void rftbsub_128_SSE2(float* a);
+#endif
+
+#if defined(MIPS_FPU_LE)
+void cft1st_128_mips(float* a);
+void cftmdl_128_mips(float* a);
+void rftfsub_128_mips(float* a);
+void rftbsub_128_mips(float* a);
+#endif
+
+#if defined(WEBRTC_HAS_NEON)
+void cft1st_128_neon(float* a);
+void cftmdl_128_neon(float* a);
+void rftfsub_128_neon(float* a);
+void rftbsub_128_neon(float* a);
+#endif
+
+class OouraFft {
+ public:
+  OouraFft();
+  ~OouraFft();
+  void Fft(float* a) const;
+  void InverseFft(float* a) const;
+
+ private:
+  void cft1st_128(float* a) const;
+  void cftmdl_128(float* a) const;
+  void rftfsub_128(float* a) const;
+  void rftbsub_128(float* a) const;
+
+  void cftfsub_128(float* a) const;
+  void cftbsub_128(float* a) const;
+  void bitrv2_128(float* a) const;
+  bool use_sse2_;
+};
+
+}  // namespace webrtc
+
+#endif  // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_mips.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_mips.cc
new file mode 100644
index 0000000000..569e1d7e82
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_mips.cc
@@ -0,0 +1,1185 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/utility/ooura_fft.h"
+
+#include "modules/audio_processing/utility/ooura_fft_tables_common.h"
+#include "typedefs.h"  // NOLINT(build/include)
+
+namespace webrtc {
+
+#if defined(MIPS_FPU_LE)
+void bitrv2_128_mips(float* a) {
+  // n is 128
+  float xr, xi, yr, yi;
+
+  xr = a[8];
+  xi = a[9];
+  yr = a[16];
+  yi = a[17];
+  a[8] = yr;
+  a[9] = yi;
+  a[16] = xr;
+  a[17] = xi;
+
+  xr = a[64];
+  xi = a[65];
+  yr = a[2];
+  yi = a[3];
+  a[64] = yr;
+  a[65] = yi;
+  a[2] = xr;
+  a[3] = xi;
+
+  xr = a[72];
+  xi = a[73];
+  yr = a[18];
+  yi = a[19];
+  a[72] = yr;
+  a[73] = yi;
+  a[18] = xr;
+  a[19] = xi;
+
+  xr = a[80];
+  xi = a[81];
+  yr = a[10];
+  yi = a[11];
+  a[80] = yr;
+  a[81] = yi;
+  a[10] = xr;
+  a[11] = xi;
+
+  xr = a[88];
+  xi = a[89];
+  yr = a[26];
+  yi = a[27];
+  a[88] = yr;
+  a[89] = yi;
+  a[26] = xr;
+  a[27] = xi;
+
+  xr = a[74];
+  xi = a[75];
+  yr = a[82];
+  yi = a[83];
+  a[74] = yr;
+  a[75] = yi;
+  a[82] = xr;
+  a[83] = xi;
+
+  xr = a[32];
+  xi = a[33];
+  yr = a[4];
+  yi = a[5];
+  a[32] = yr;
+  a[33] = yi;
+  a[4] = xr;
+  a[5] = xi;
+
+  xr = a[40];
+  xi = a[41];
+  yr = a[20];
+  yi = a[21];
+  a[40] = yr;
+  a[41] = yi;
+  a[20] = xr;
+  a[21] = xi;
+
+  xr = a[48];
+  xi = a[49];
+  yr = a[12];
+  yi = a[13];
+  a[48] = yr;
+  a[49] = yi;
+  a[12] = xr;
+  a[13] = xi;
+
+  xr = a[56];
+  xi = a[57];
+  yr = a[28];
+  yi = a[29];
+  a[56] = yr;
+  a[57] = yi;
+  a[28] = xr;
+  a[29] = xi;
+
+  xr = a[34];
+  xi = a[35];
+  yr = a[68];
+  yi = a[69];
+  a[34] = yr;
+  a[35] = yi;
+  a[68] = xr;
+  a[69] = xi;
+
+  xr = a[42];
+  xi = a[43];
+  yr = a[84];
+  yi = a[85];
+  a[42] = yr;
+  a[43] = yi;
+  a[84] = xr;
+  a[85] = xi;
+
+  xr = a[50];
+  xi = a[51];
+  yr = a[76];
+  yi = a[77];
+  a[50] = yr;
+  a[51] = yi;
+  a[76] = xr;
+  a[77] = xi;
+
+  xr = a[58];
+  xi = a[59];
+  yr = a[92];
+  yi = a[93];
+  a[58] = yr;
+  a[59] = yi;
+  a[92] = xr;
+  a[93] = xi;
+
+  xr = a[44];
+  xi = a[45];
+  yr = a[52];
+  yi = a[53];
+  a[44] = yr;
+  a[45] = yi;
+  a[52] = xr;
+  a[53] = xi;
+
+  xr = a[96];
+  xi = a[97];
+  yr = a[6];
+  yi = a[7];
+  a[96] = yr;
+  a[97] = yi;
+  a[6] = xr;
+  a[7] = xi;
+
+  xr = a[104];
+  xi = a[105];
+  yr = a[22];
+  yi = a[23];
+  a[104] = yr;
+  a[105] = yi;
+  a[22] = xr;
+  a[23] = xi;
+
+  xr = a[112];
+  xi = a[113];
+  yr = a[14];
+  yi = a[15];
+  a[112] = yr;
+  a[113] = yi;
+  a[14] = xr;
+  a[15] = xi;
+
+  xr = a[120];
+  xi = a[121];
+  yr = a[30];
+  yi = a[31];
+  a[120] = yr;
+  a[121] = yi;
+  a[30] = xr;
+  a[31] = xi;
+
+  xr = a[98];
+  xi = a[99];
+  yr = a[70];
+  yi = a[71];
+  a[98] = yr;
+  a[99] = yi;
+  a[70] = xr;
+  a[71] = xi;
+
+  xr = a[106];
+  xi = a[107];
+  yr = a[86];
+  yi = a[87];
+  a[106] = yr;
+  a[107] = yi;
+  a[86] = xr;
+  a[87] = xi;
+
+  xr = a[114];
+  xi = a[115];
+  yr = a[78];
+  yi = a[79];
+  a[114] = yr;
+  a[115] = yi;
+  a[78] = xr;
+  a[79] = xi;
+
+  xr = a[122];
+  xi = a[123];
+  yr = a[94];
+  yi = a[95];
+  a[122] = yr;
+  a[123] = yi;
+  a[94] = xr;
+  a[95] = xi;
+
+  xr = a[100];
+  xi = a[101];
+  yr = a[38];
+  yi = a[39];
+  a[100] = yr;
+  a[101] = yi;
+  a[38] = xr;
+  a[39] = xi;
+
+  xr = a[108];
+  xi = a[109];
+  yr = a[54];
+  yi = a[55];
+  a[108] = yr;
+  a[109] = yi;
+  a[54] = xr;
+  a[55] = xi;
+
+  xr = a[116];
+  xi = a[117];
+  yr = a[46];
+  yi = a[47];
+  a[116] = yr;
+  a[117] = yi;
+  a[46] = xr;
+  a[47] = xi;
+
+  xr = a[124];
+  xi = a[125];
+  yr = a[62];
+  yi = a[63];
+  a[124] = yr;
+  a[125] = yi;
+  a[62] = xr;
+  a[63] = xi;
+
+  xr = a[110];
+  xi = a[111];
+  yr = a[118];
+  yi = a[119];
+  a[110] = yr;
+  a[111] = yi;
+  a[118] = xr;
+  a[119] = xi;
+}
+
+void cft1st_128_mips(float* a) {
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14;
+  int a_ptr, p1_rdft, p2_rdft, count;
+  const float* first = rdft_wk3ri_first;
+  const float* second = rdft_wk3ri_second;
+
+  __asm __volatile (
+    ".set       push                                                    \n\t"
+    ".set       noreorder                                               \n\t"
+    // first 8
+    "lwc1       %[f0],        0(%[a])                                   \n\t"
+    "lwc1       %[f1],        4(%[a])                                   \n\t"
+    "lwc1       %[f2],        8(%[a])                                   \n\t"
+    "lwc1       %[f3],        12(%[a])                                  \n\t"
+    "lwc1       %[f4],        16(%[a])                                  \n\t"
+    "lwc1       %[f5],        20(%[a])                                  \n\t"
+    "lwc1       %[f6],        24(%[a])                                  \n\t"
+    "lwc1       %[f7],        28(%[a])                                  \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "add.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f8],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f2],        %[f1],        %[f4]                       \n\t"
+    "add.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "add.s      %[f4],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f6],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "swc1       %[f7],        0(%[a])                                   \n\t"
+    "swc1       %[f8],        16(%[a])                                  \n\t"
+    "swc1       %[f2],        28(%[a])                                  \n\t"
+    "swc1       %[f1],        12(%[a])                                  \n\t"
+    "swc1       %[f4],        4(%[a])                                   \n\t"
+    "swc1       %[f6],        20(%[a])                                  \n\t"
+    "swc1       %[f3],        8(%[a])                                   \n\t"
+    "swc1       %[f0],        24(%[a])                                  \n\t"
+    // second 8
+    "lwc1       %[f0],        32(%[a])                                  \n\t"
+    "lwc1       %[f1],        36(%[a])                                  \n\t"
+    "lwc1       %[f2],        40(%[a])                                  \n\t"
+    "lwc1       %[f3],        44(%[a])                                  \n\t"
+    "lwc1       %[f4],        48(%[a])                                  \n\t"
+    "lwc1       %[f5],        52(%[a])                                  \n\t"
+    "lwc1       %[f6],        56(%[a])                                  \n\t"
+    "lwc1       %[f7],        60(%[a])                                  \n\t"
+    "add.s      %[f8],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "add.s      %[f7],        %[f4],        %[f1]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f1]                       \n\t"
+    "add.s      %[f1],        %[f3],        %[f8]                       \n\t"
+    "sub.s      %[f3],        %[f3],        %[f8]                       \n\t"
+    "sub.s      %[f8],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f5],        %[f6],        %[f2]                       \n\t"
+    "sub.s      %[f6],        %[f2],        %[f6]                       \n\t"
+    "lwc1       %[f9],        8(%[rdft_w])                              \n\t"
+    "sub.s      %[f2],        %[f8],        %[f7]                       \n\t"
+    "add.s      %[f8],        %[f8],        %[f7]                       \n\t"
+    "sub.s      %[f7],        %[f4],        %[f0]                       \n\t"
+    "add.s      %[f4],        %[f4],        %[f0]                       \n\t"
+    // prepare for loop
+    "addiu      %[a_ptr],     %[a],         64                          \n\t"
+    "addiu      %[p1_rdft],   %[rdft_w],    8                           \n\t"
+    "addiu      %[p2_rdft],   %[rdft_w],    16                          \n\t"
+    "addiu      %[count],     $zero,        7                           \n\t"
+    // finish second 8
+    "mul.s      %[f2],        %[f9],        %[f2]                       \n\t"
+    "mul.s      %[f8],        %[f9],        %[f8]                       \n\t"
+    "mul.s      %[f7],        %[f9],        %[f7]                       \n\t"
+    "mul.s      %[f4],        %[f9],        %[f4]                       \n\t"
+    "swc1       %[f1],        32(%[a])                                  \n\t"
+    "swc1       %[f3],        52(%[a])                                  \n\t"
+    "swc1       %[f5],        36(%[a])                                  \n\t"
+    "swc1       %[f6],        48(%[a])                                  \n\t"
+    "swc1       %[f2],        40(%[a])                                  \n\t"
+    "swc1       %[f8],        44(%[a])                                  \n\t"
+    "swc1       %[f7],        56(%[a])                                  \n\t"
+    "swc1       %[f4],        60(%[a])                                  \n\t"
+    // loop
+   "1:                                                                  \n\t"
+    "lwc1       %[f0],        0(%[a_ptr])                               \n\t"
+    "lwc1       %[f1],        4(%[a_ptr])                               \n\t"
+    "lwc1       %[f2],        8(%[a_ptr])                               \n\t"
+    "lwc1       %[f3],        12(%[a_ptr])                              \n\t"
+    "lwc1       %[f4],        16(%[a_ptr])                              \n\t"
+    "lwc1       %[f5],        20(%[a_ptr])                              \n\t"
+    "lwc1       %[f6],        24(%[a_ptr])                              \n\t"
+    "lwc1       %[f7],        28(%[a_ptr])                              \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "lwc1       %[f10],       4(%[p1_rdft])                             \n\t"
+    "lwc1       %[f11],       0(%[p2_rdft])                             \n\t"
+    "lwc1       %[f12],       4(%[p2_rdft])                             \n\t"
+    "lwc1       %[f13],       8(%[first])                               \n\t"
+    "lwc1       %[f14],       12(%[first])                              \n\t"
+    "add.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f8],        %[f8],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f6],        %[f6],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f5],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "swc1       %[f7],        0(%[a_ptr])                               \n\t"
+    "swc1       %[f2],        4(%[a_ptr])                               \n\t"
+    "mul.s      %[f4],        %[f9],        %[f8]                       \n\t"
+#if defined(MIPS32_R2_LE)
+    "mul.s      %[f8],        %[f10],       %[f8]                       \n\t"
+    "mul.s      %[f7],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f0],        %[f12],       %[f0]                       \n\t"
+    "mul.s      %[f2],        %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f3],        %[f14],       %[f3]                       \n\t"
+    "nmsub.s    %[f4],        %[f4],        %[f10],       %[f6]         \n\t"
+    "madd.s     %[f8],        %[f8],        %[f9],        %[f6]         \n\t"
+    "nmsub.s    %[f7],        %[f7],        %[f12],       %[f5]         \n\t"
+    "madd.s     %[f0],        %[f0],        %[f11],       %[f5]         \n\t"
+    "nmsub.s    %[f2],        %[f2],        %[f14],       %[f1]         \n\t"
+    "madd.s     %[f3],        %[f3],        %[f13],       %[f1]         \n\t"
+#else
+    "mul.s      %[f7],        %[f10],       %[f6]                       \n\t"
+    "mul.s      %[f6],        %[f9],        %[f6]                       \n\t"
+    "mul.s      %[f8],        %[f10],       %[f8]                       \n\t"
+    "mul.s      %[f2],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f11],       %[f11],       %[f5]                       \n\t"
+    "mul.s      %[f5],        %[f12],       %[f5]                       \n\t"
+    "mul.s      %[f0],        %[f12],       %[f0]                       \n\t"
+    "mul.s      %[f12],       %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f13],       %[f13],       %[f1]                       \n\t"
+    "mul.s      %[f1],        %[f14],       %[f1]                       \n\t"
+    "mul.s      %[f3],        %[f14],       %[f3]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f7]                       \n\t"
+    "add.s      %[f8],        %[f6],        %[f8]                       \n\t"
+    "sub.s      %[f7],        %[f2],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f11],       %[f0]                       \n\t"
+    "sub.s      %[f2],        %[f12],       %[f1]                       \n\t"
+    "add.s      %[f3],        %[f13],       %[f3]                       \n\t"
+#endif
+    "swc1       %[f4],        16(%[a_ptr])                              \n\t"
+    "swc1       %[f8],        20(%[a_ptr])                              \n\t"
+    "swc1       %[f7],        8(%[a_ptr])                               \n\t"
+    "swc1       %[f0],        12(%[a_ptr])                              \n\t"
+    "swc1       %[f2],        24(%[a_ptr])                              \n\t"
+    "swc1       %[f3],        28(%[a_ptr])                              \n\t"
+    "lwc1       %[f0],        32(%[a_ptr])                              \n\t"
+    "lwc1       %[f1],        36(%[a_ptr])                              \n\t"
+    "lwc1       %[f2],        40(%[a_ptr])                              \n\t"
+    "lwc1       %[f3],        44(%[a_ptr])                              \n\t"
+    "lwc1       %[f4],        48(%[a_ptr])                              \n\t"
+    "lwc1       %[f5],        52(%[a_ptr])                              \n\t"
+    "lwc1       %[f6],        56(%[a_ptr])                              \n\t"
+    "lwc1       %[f7],        60(%[a_ptr])                              \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "lwc1       %[f11],       8(%[p2_rdft])                             \n\t"
+    "lwc1       %[f12],       12(%[p2_rdft])                            \n\t"
+    "lwc1       %[f13],       8(%[second])                              \n\t"
+    "lwc1       %[f14],       12(%[second])                             \n\t"
+    "add.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f8],        %[f2],        %[f8]                       \n\t"
+    "add.s      %[f2],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f6],        %[f3],        %[f6]                       \n\t"
+    "add.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f5],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "swc1       %[f7],        32(%[a_ptr])                              \n\t"
+    "swc1       %[f2],        36(%[a_ptr])                              \n\t"
+    "mul.s      %[f4],        %[f10],       %[f8]                       \n\t"
+#if defined(MIPS32_R2_LE)
+    "mul.s      %[f10],       %[f10],       %[f6]                       \n\t"
+    "mul.s      %[f7],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f11],       %[f11],       %[f5]                       \n\t"
+    "mul.s      %[f2],        %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f13],       %[f13],       %[f1]                       \n\t"
+    "madd.s     %[f4],        %[f4],        %[f9],        %[f6]         \n\t"
+    "nmsub.s    %[f10],       %[f10],       %[f9],        %[f8]         \n\t"
+    "nmsub.s    %[f7],        %[f7],        %[f12],       %[f5]         \n\t"
+    "madd.s     %[f11],       %[f11],       %[f12],       %[f0]         \n\t"
+    "nmsub.s    %[f2],        %[f2],        %[f14],       %[f1]         \n\t"
+    "madd.s     %[f13],       %[f13],       %[f14],       %[f3]         \n\t"
+#else
+    "mul.s      %[f2],        %[f9],        %[f6]                       \n\t"
+    "mul.s      %[f10],       %[f10],       %[f6]                       \n\t"
+    "mul.s      %[f9],        %[f9],        %[f8]                       \n\t"
+    "mul.s      %[f7],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f8],        %[f12],       %[f5]                       \n\t"
+    "mul.s      %[f11],       %[f11],       %[f5]                       \n\t"
+    "mul.s      %[f12],       %[f12],       %[f0]                       \n\t"
+    "mul.s      %[f5],        %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f0],        %[f14],       %[f1]                       \n\t"
+    "mul.s      %[f13],       %[f13],       %[f1]                       \n\t"
+    "mul.s      %[f14],       %[f14],       %[f3]                       \n\t"
+    "add.s      %[f4],        %[f4],        %[f2]                       \n\t"
+    "sub.s      %[f10],       %[f10],       %[f9]                       \n\t"
+    "sub.s      %[f7],        %[f7],        %[f8]                       \n\t"
+    "add.s      %[f11],       %[f11],       %[f12]                      \n\t"
+    "sub.s      %[f2],        %[f5],        %[f0]                       \n\t"
+    "add.s      %[f13],       %[f13],       %[f14]                      \n\t"
+#endif
+    "swc1       %[f4],        48(%[a_ptr])                              \n\t"
+    "swc1       %[f10],       52(%[a_ptr])                              \n\t"
+    "swc1       %[f7],        40(%[a_ptr])                              \n\t"
+    "swc1       %[f11],       44(%[a_ptr])                              \n\t"
+    "swc1       %[f2],        56(%[a_ptr])                              \n\t"
+    "swc1       %[f13],       60(%[a_ptr])                              \n\t"
+    "addiu      %[count],     %[count],     -1                          \n\t"
+    "lwc1       %[f9],        8(%[p1_rdft])                             \n\t"
+    "addiu      %[a_ptr],     %[a_ptr],     64                          \n\t"
+    "addiu      %[p1_rdft],   %[p1_rdft],   8                           \n\t"
+    "addiu      %[p2_rdft],   %[p2_rdft],   16                          \n\t"
+    "addiu      %[first],     %[first],     8                           \n\t"
+    "bgtz       %[count],     1b                                        \n\t"
+    " addiu     %[second],    %[second],    8                           \n\t"
+    ".set       pop                                                     \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
+      [f12] "=&f" (f12), [f13] "=&f" (f13), [f14] "=&f" (f14),
+      [a_ptr] "=&r" (a_ptr), [p1_rdft] "=&r" (p1_rdft), [first] "+r" (first),
+      [p2_rdft] "=&r" (p2_rdft), [count] "=&r" (count), [second] "+r" (second)
+    : [a] "r" (a), [rdft_w] "r" (rdft_w)
+    : "memory"
+  );
+}
+
+void cftmdl_128_mips(float* a) {
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14;
+  int tmp_a, count;
+  __asm __volatile (
+    ".set       push                                      \n\t"
+    ".set       noreorder                                 \n\t"
+    "addiu      %[tmp_a],   %[a],         0               \n\t"
+    "addiu      %[count],   $zero,        4               \n\t"
+   "1:                                                    \n\t"
+    "addiu      %[count],   %[count],     -1              \n\t"
+    "lwc1       %[f0],      0(%[tmp_a])                   \n\t"
+    "lwc1       %[f2],      32(%[tmp_a])                  \n\t"
+    "lwc1       %[f4],      64(%[tmp_a])                  \n\t"
+    "lwc1       %[f6],      96(%[tmp_a])                  \n\t"
+    "lwc1       %[f1],      4(%[tmp_a])                   \n\t"
+    "lwc1       %[f3],      36(%[tmp_a])                  \n\t"
+    "lwc1       %[f5],      68(%[tmp_a])                  \n\t"
+    "lwc1       %[f7],      100(%[tmp_a])                 \n\t"
+    "add.s      %[f8],      %[f0],        %[f2]           \n\t"
+    "sub.s      %[f0],      %[f0],        %[f2]           \n\t"
+    "add.s      %[f2],      %[f4],        %[f6]           \n\t"
+    "sub.s      %[f4],      %[f4],        %[f6]           \n\t"
+    "add.s      %[f6],      %[f1],        %[f3]           \n\t"
+    "sub.s      %[f1],      %[f1],        %[f3]           \n\t"
+    "add.s      %[f3],      %[f5],        %[f7]           \n\t"
+    "sub.s      %[f5],      %[f5],        %[f7]           \n\t"
+    "add.s      %[f7],      %[f8],        %[f2]           \n\t"
+    "sub.s      %[f8],      %[f8],        %[f2]           \n\t"
+    "add.s      %[f2],      %[f1],        %[f4]           \n\t"
+    "sub.s      %[f1],      %[f1],        %[f4]           \n\t"
+    "add.s      %[f4],      %[f6],        %[f3]           \n\t"
+    "sub.s      %[f6],      %[f6],        %[f3]           \n\t"
+    "sub.s      %[f3],      %[f0],        %[f5]           \n\t"
+    "add.s      %[f0],      %[f0],        %[f5]           \n\t"
+    "swc1       %[f7],      0(%[tmp_a])                   \n\t"
+    "swc1       %[f8],      64(%[tmp_a])                  \n\t"
+    "swc1       %[f2],      36(%[tmp_a])                  \n\t"
+    "swc1       %[f1],      100(%[tmp_a])                 \n\t"
+    "swc1       %[f4],      4(%[tmp_a])                   \n\t"
+    "swc1       %[f6],      68(%[tmp_a])                  \n\t"
+    "swc1       %[f3],      32(%[tmp_a])                  \n\t"
+    "swc1       %[f0],      96(%[tmp_a])                  \n\t"
+    "bgtz       %[count],   1b                            \n\t"
+    " addiu     %[tmp_a],   %[tmp_a],     8               \n\t"
+    ".set       pop                                       \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a)
+    : "memory"
+  );
+  f9 = rdft_w[2];
+  __asm __volatile (
+    ".set       push                                      \n\t"
+    ".set       noreorder                                 \n\t"
+    "addiu      %[tmp_a],   %[a],         128             \n\t"
+    "addiu      %[count],   $zero,        4               \n\t"
+   "1:                                                    \n\t"
+    "addiu      %[count],   %[count],     -1              \n\t"
+    "lwc1       %[f0],      0(%[tmp_a])                   \n\t"
+    "lwc1       %[f2],      32(%[tmp_a])                  \n\t"
+    "lwc1       %[f5],      68(%[tmp_a])                  \n\t"
+    "lwc1       %[f7],      100(%[tmp_a])                 \n\t"
+    "lwc1       %[f1],      4(%[tmp_a])                   \n\t"
+    "lwc1       %[f3],      36(%[tmp_a])                  \n\t"
+    "lwc1       %[f4],      64(%[tmp_a])                  \n\t"
+    "lwc1       %[f6],      96(%[tmp_a])                  \n\t"
+    "sub.s      %[f8],      %[f0],        %[f2]           \n\t"
+    "add.s      %[f0],      %[f0],        %[f2]           \n\t"
+    "sub.s      %[f2],      %[f5],        %[f7]           \n\t"
+    "add.s      %[f5],      %[f5],        %[f7]           \n\t"
+    "sub.s      %[f7],      %[f1],        %[f3]           \n\t"
+    "add.s      %[f1],      %[f1],        %[f3]           \n\t"
+    "sub.s      %[f3],      %[f4],        %[f6]           \n\t"
+    "add.s      %[f4],      %[f4],        %[f6]           \n\t"
+    "sub.s      %[f6],      %[f8],        %[f2]           \n\t"
+    "add.s      %[f8],      %[f8],        %[f2]           \n\t"
+    "add.s      %[f2],      %[f5],        %[f1]           \n\t"
+    "sub.s      %[f5],      %[f5],        %[f1]           \n\t"
+    "add.s      %[f1],      %[f3],        %[f7]           \n\t"
+    "sub.s      %[f3],      %[f3],        %[f7]           \n\t"
+    "add.s      %[f7],      %[f0],        %[f4]           \n\t"
+    "sub.s      %[f0],      %[f0],        %[f4]           \n\t"
+    "sub.s      %[f4],      %[f6],        %[f1]           \n\t"
+    "add.s      %[f6],      %[f6],        %[f1]           \n\t"
+    "sub.s      %[f1],      %[f3],        %[f8]           \n\t"
+    "add.s      %[f3],      %[f3],        %[f8]           \n\t"
+    "mul.s      %[f4],      %[f4],        %[f9]           \n\t"
+    "mul.s      %[f6],      %[f6],        %[f9]           \n\t"
+    "mul.s      %[f1],      %[f1],        %[f9]           \n\t"
+    "mul.s      %[f3],      %[f3],        %[f9]           \n\t"
+    "swc1       %[f7],      0(%[tmp_a])                   \n\t"
+    "swc1       %[f2],      4(%[tmp_a])                   \n\t"
+    "swc1       %[f5],      64(%[tmp_a])                  \n\t"
+    "swc1       %[f0],      68(%[tmp_a])                  \n\t"
+    "swc1       %[f4],      32(%[tmp_a])                  \n\t"
+    "swc1       %[f6],      36(%[tmp_a])                  \n\t"
+    "swc1       %[f1],      96(%[tmp_a])                  \n\t"
+    "swc1       %[f3],      100(%[tmp_a])                 \n\t"
+    "bgtz       %[count],   1b                            \n\t"
+    " addiu     %[tmp_a],   %[tmp_a],     8               \n\t"
+    ".set       pop                                       \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a), [f9] "f" (f9)
+    : "memory"
+  );
+  f10 = rdft_w[3];
+  f11 = rdft_w[4];
+  f12 = rdft_w[5];
+  f13 = rdft_wk3ri_first[2];
+  f14 = rdft_wk3ri_first[3];
+
+  __asm __volatile (
+    ".set       push                                                    \n\t"
+    ".set       noreorder                                               \n\t"
+    "addiu      %[tmp_a],     %[a],         256                         \n\t"
+    "addiu      %[count],     $zero,        4                           \n\t"
+   "1:                                                                  \n\t"
+    "addiu      %[count],     %[count],     -1                          \n\t"
+    "lwc1       %[f0],        0(%[tmp_a])                               \n\t"
+    "lwc1       %[f2],        32(%[tmp_a])                              \n\t"
+    "lwc1       %[f4],        64(%[tmp_a])                              \n\t"
+    "lwc1       %[f6],        96(%[tmp_a])                              \n\t"
+    "lwc1       %[f1],        4(%[tmp_a])                               \n\t"
+    "lwc1       %[f3],        36(%[tmp_a])                              \n\t"
+    "lwc1       %[f5],        68(%[tmp_a])                              \n\t"
+    "lwc1       %[f7],        100(%[tmp_a])                             \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "add.s      %[f8],        %[f8],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f4],        %[f6],        %[f3]                       \n\t"
+    "add.s      %[f6],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "swc1       %[f8],        0(%[tmp_a])                               \n\t"
+    "swc1       %[f6],        4(%[tmp_a])                               \n\t"
+    "mul.s      %[f5],        %[f9],        %[f7]                       \n\t"
+#if defined(MIPS32_R2_LE)
+    "mul.s      %[f7],        %[f10],       %[f7]                       \n\t"
+    "mul.s      %[f8],        %[f11],       %[f3]                       \n\t"
+    "mul.s      %[f3],        %[f12],       %[f3]                       \n\t"
+    "mul.s      %[f6],        %[f13],       %[f0]                       \n\t"
+    "mul.s      %[f0],        %[f14],       %[f0]                       \n\t"
+    "nmsub.s    %[f5],        %[f5],        %[f10],       %[f4]         \n\t"
+    "madd.s     %[f7],        %[f7],        %[f9],        %[f4]         \n\t"
+    "nmsub.s    %[f8],        %[f8],        %[f12],       %[f2]         \n\t"
+    "madd.s     %[f3],        %[f3],        %[f11],       %[f2]         \n\t"
+    "nmsub.s    %[f6],        %[f6],        %[f14],       %[f1]         \n\t"
+    "madd.s     %[f0],        %[f0],        %[f13],       %[f1]         \n\t"
+    "swc1       %[f5],        64(%[tmp_a])                              \n\t"
+    "swc1       %[f7],        68(%[tmp_a])                              \n\t"
+#else
+    "mul.s      %[f8],        %[f10],       %[f4]                       \n\t"
+    "mul.s      %[f4],        %[f9],        %[f4]                       \n\t"
+    "mul.s      %[f7],        %[f10],       %[f7]                       \n\t"
+    "mul.s      %[f6],        %[f11],       %[f3]                       \n\t"
+    "mul.s      %[f3],        %[f12],       %[f3]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f8]                       \n\t"
+    "mul.s      %[f8],        %[f12],       %[f2]                       \n\t"
+    "mul.s      %[f2],        %[f11],       %[f2]                       \n\t"
+    "add.s      %[f7],        %[f4],        %[f7]                       \n\t"
+    "mul.s      %[f4],        %[f13],       %[f0]                       \n\t"
+    "mul.s      %[f0],        %[f14],       %[f0]                       \n\t"
+    "sub.s      %[f8],        %[f6],        %[f8]                       \n\t"
+    "mul.s      %[f6],        %[f14],       %[f1]                       \n\t"
+    "mul.s      %[f1],        %[f13],       %[f1]                       \n\t"
+    "add.s      %[f3],        %[f2],        %[f3]                       \n\t"
+    "swc1       %[f5],        64(%[tmp_a])                              \n\t"
+    "swc1       %[f7],        68(%[tmp_a])                              \n\t"
+    "sub.s      %[f6],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f0],        %[f1],        %[f0]                       \n\t"
+#endif
+    "swc1       %[f8],        32(%[tmp_a])                              \n\t"
+    "swc1       %[f3],        36(%[tmp_a])                              \n\t"
+    "swc1       %[f6],        96(%[tmp_a])                              \n\t"
+    "swc1       %[f0],        100(%[tmp_a])                             \n\t"
+    "bgtz       %[count],     1b                                        \n\t"
+    " addiu     %[tmp_a],     %[tmp_a],     8                           \n\t"
+    ".set       pop                                                     \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a),  [f9] "f" (f9), [f10] "f" (f10), [f11] "f" (f11),
+      [f12] "f" (f12), [f13] "f" (f13), [f14] "f" (f14)
+    : "memory"
+  );
+  f11 = rdft_w[6];
+  f12 = rdft_w[7];
+  f13 = rdft_wk3ri_second[2];
+  f14 = rdft_wk3ri_second[3];
+  __asm __volatile (
+    ".set       push                                                       \n\t"
+    ".set       noreorder                                                  \n\t"
+    "addiu      %[tmp_a],       %[a],           384                        \n\t"
+    "addiu      %[count],       $zero,          4                          \n\t"
+   "1:                                                                     \n\t"
+    "addiu      %[count],       %[count],       -1                         \n\t"
+    "lwc1       %[f0],          0(%[tmp_a])                                \n\t"
+    "lwc1       %[f1],          4(%[tmp_a])                                \n\t"
+    "lwc1       %[f2],          32(%[tmp_a])                               \n\t"
+    "lwc1       %[f3],          36(%[tmp_a])                               \n\t"
+    "lwc1       %[f4],          64(%[tmp_a])                               \n\t"
+    "lwc1       %[f5],          68(%[tmp_a])                               \n\t"
+    "lwc1       %[f6],          96(%[tmp_a])                               \n\t"
+    "lwc1       %[f7],          100(%[tmp_a])                              \n\t"
+    "add.s      %[f8],          %[f0],          %[f2]                      \n\t"
+    "sub.s      %[f0],          %[f0],          %[f2]                      \n\t"
+    "add.s      %[f2],          %[f4],          %[f6]                      \n\t"
+    "sub.s      %[f4],          %[f4],          %[f6]                      \n\t"
+    "add.s      %[f6],          %[f1],          %[f3]                      \n\t"
+    "sub.s      %[f1],          %[f1],          %[f3]                      \n\t"
+    "add.s      %[f3],          %[f5],          %[f7]                      \n\t"
+    "sub.s      %[f5],          %[f5],          %[f7]                      \n\t"
+    "sub.s      %[f7],          %[f2],          %[f8]                      \n\t"
+    "add.s      %[f2],          %[f2],          %[f8]                      \n\t"
+    "add.s      %[f8],          %[f1],          %[f4]                      \n\t"
+    "sub.s      %[f1],          %[f1],          %[f4]                      \n\t"
+    "sub.s      %[f4],          %[f3],          %[f6]                      \n\t"
+    "add.s      %[f3],          %[f3],          %[f6]                      \n\t"
+    "sub.s      %[f6],          %[f0],          %[f5]                      \n\t"
+    "add.s      %[f0],          %[f0],          %[f5]                      \n\t"
+    "swc1       %[f2],          0(%[tmp_a])                                \n\t"
+    "swc1       %[f3],          4(%[tmp_a])                                \n\t"
+    "mul.s      %[f5],          %[f10],         %[f7]                      \n\t"
+#if defined(MIPS32_R2_LE)
+    "mul.s      %[f7],          %[f9],          %[f7]                      \n\t"
+    "mul.s      %[f2],          %[f12],         %[f8]                      \n\t"
+    "mul.s      %[f8],          %[f11],         %[f8]                      \n\t"
+    "mul.s      %[f3],          %[f14],         %[f1]                      \n\t"
+    "mul.s      %[f1],          %[f13],         %[f1]                      \n\t"
+    "madd.s     %[f5],          %[f5],          %[f9],       %[f4]         \n\t"
+    "msub.s     %[f7],          %[f7],          %[f10],      %[f4]         \n\t"
+    "msub.s     %[f2],          %[f2],          %[f11],      %[f6]         \n\t"
+    "madd.s     %[f8],          %[f8],          %[f12],      %[f6]         \n\t"
+    "msub.s     %[f3],          %[f3],          %[f13],      %[f0]         \n\t"
+    "madd.s     %[f1],          %[f1],          %[f14],      %[f0]         \n\t"
+    "swc1       %[f5],          64(%[tmp_a])                               \n\t"
+    "swc1       %[f7],          68(%[tmp_a])                               \n\t"
+#else
+    "mul.s      %[f2],          %[f9],          %[f4]                      \n\t"
+    "mul.s      %[f4],          %[f10],         %[f4]                      \n\t"
+    "mul.s      %[f7],          %[f9],          %[f7]                      \n\t"
+    "mul.s      %[f3],          %[f11],         %[f6]                      \n\t"
+    "mul.s      %[f6],          %[f12],         %[f6]                      \n\t"
+    "add.s      %[f5],          %[f5],          %[f2]                      \n\t"
+    "sub.s      %[f7],          %[f4],          %[f7]                      \n\t"
+    "mul.s      %[f2],          %[f12],         %[f8]                      \n\t"
+    "mul.s      %[f8],          %[f11],         %[f8]                      \n\t"
+    "mul.s      %[f4],          %[f14],         %[f1]                      \n\t"
+    "mul.s      %[f1],          %[f13],         %[f1]                      \n\t"
+    "sub.s      %[f2],          %[f3],          %[f2]                      \n\t"
+    "mul.s      %[f3],          %[f13],         %[f0]                      \n\t"
+    "mul.s      %[f0],          %[f14],         %[f0]                      \n\t"
+    "add.s      %[f8],          %[f8],          %[f6]                      \n\t"
+    "swc1       %[f5],          64(%[tmp_a])                               \n\t"
+    "swc1       %[f7],          68(%[tmp_a])                               \n\t"
+    "sub.s      %[f3],          %[f3],          %[f4]                      \n\t"
+    "add.s      %[f1],          %[f1],          %[f0]                      \n\t"
+#endif
+    "swc1       %[f2],          32(%[tmp_a])                               \n\t"
+    "swc1       %[f8],          36(%[tmp_a])                               \n\t"
+    "swc1       %[f3],          96(%[tmp_a])                               \n\t"
+    "swc1       %[f1],          100(%[tmp_a])                              \n\t"
+    "bgtz       %[count],       1b                                         \n\t"
+    " addiu     %[tmp_a],       %[tmp_a],       8                          \n\t"
+    ".set       pop                                                        \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a), [f9] "f" (f9), [f10] "f" (f10), [f11] "f" (f11),
+      [f12] "f" (f12), [f13] "f" (f13), [f14] "f" (f14)
+    : "memory"
+  );
+}
+
+void cftfsub_128_mips(float* a) {
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8;
+  int tmp_a, count;
+
+  cft1st_128_mips(a);
+  cftmdl_128_mips(a);
+
+  __asm __volatile (
+    ".set       push                                      \n\t"
+    ".set       noreorder                                 \n\t"
+    "addiu      %[tmp_a],       %[a],         0           \n\t"
+    "addiu      %[count],       $zero,        16          \n\t"
+   "1:                                                    \n\t"
+    "addiu      %[count],       %[count],     -1          \n\t"
+    "lwc1       %[f0],          0(%[tmp_a])               \n\t"
+    "lwc1       %[f2],          128(%[tmp_a])             \n\t"
+    "lwc1       %[f4],          256(%[tmp_a])             \n\t"
+    "lwc1       %[f6],          384(%[tmp_a])             \n\t"
+    "lwc1       %[f1],          4(%[tmp_a])               \n\t"
+    "lwc1       %[f3],          132(%[tmp_a])             \n\t"
+    "lwc1       %[f5],          260(%[tmp_a])             \n\t"
+    "lwc1       %[f7],          388(%[tmp_a])             \n\t"
+    "add.s      %[f8],          %[f0],        %[f2]       \n\t"
+    "sub.s      %[f0],          %[f0],        %[f2]       \n\t"
+    "add.s      %[f2],          %[f4],        %[f6]       \n\t"
+    "sub.s      %[f4],          %[f4],        %[f6]       \n\t"
+    "add.s      %[f6],          %[f1],        %[f3]       \n\t"
+    "sub.s      %[f1],          %[f1],        %[f3]       \n\t"
+    "add.s      %[f3],          %[f5],        %[f7]       \n\t"
+    "sub.s      %[f5],          %[f5],        %[f7]       \n\t"
+    "add.s      %[f7],          %[f8],        %[f2]       \n\t"
+    "sub.s      %[f8],          %[f8],        %[f2]       \n\t"
+    "add.s      %[f2],          %[f1],        %[f4]       \n\t"
+    "sub.s      %[f1],          %[f1],        %[f4]       \n\t"
+    "add.s      %[f4],          %[f6],        %[f3]       \n\t"
+    "sub.s      %[f6],          %[f6],        %[f3]       \n\t"
+    "sub.s      %[f3],          %[f0],        %[f5]       \n\t"
+    "add.s      %[f0],          %[f0],        %[f5]       \n\t"
+    "swc1       %[f7],          0(%[tmp_a])               \n\t"
+    "swc1       %[f8],          256(%[tmp_a])             \n\t"
+    "swc1       %[f2],          132(%[tmp_a])             \n\t"
+    "swc1       %[f1],          388(%[tmp_a])             \n\t"
+    "swc1       %[f4],          4(%[tmp_a])               \n\t"
+    "swc1       %[f6],          260(%[tmp_a])             \n\t"
+    "swc1       %[f3],          128(%[tmp_a])             \n\t"
+    "swc1       %[f0],          384(%[tmp_a])             \n\t"
+    "bgtz       %[count],       1b                        \n\t"
+    " addiu     %[tmp_a],       %[tmp_a],   8             \n\t"
+    ".set       pop                                       \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a),
+      [count] "=&r" (count)
+    : [a] "r" (a)
+    : "memory"
+  );
+}
+
+void cftbsub_128_mips(float* a) {
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8;
+  int tmp_a, count;
+
+  cft1st_128_mips(a);
+  cftmdl_128_mips(a);
+
+  __asm __volatile (
+    ".set       push                                        \n\t"
+    ".set       noreorder                                   \n\t"
+    "addiu      %[tmp_a],   %[a],           0               \n\t"
+    "addiu      %[count],   $zero,          16              \n\t"
+   "1:                                                      \n\t"
+    "addiu      %[count],   %[count],       -1              \n\t"
+    "lwc1       %[f0],      0(%[tmp_a])                     \n\t"
+    "lwc1       %[f2],      128(%[tmp_a])                   \n\t"
+    "lwc1       %[f4],      256(%[tmp_a])                   \n\t"
+    "lwc1       %[f6],      384(%[tmp_a])                   \n\t"
+    "lwc1       %[f1],      4(%[tmp_a])                     \n\t"
+    "lwc1       %[f3],      132(%[tmp_a])                   \n\t"
+    "lwc1       %[f5],      260(%[tmp_a])                   \n\t"
+    "lwc1       %[f7],      388(%[tmp_a])                   \n\t"
+    "add.s      %[f8],      %[f0],          %[f2]           \n\t"
+    "sub.s      %[f0],      %[f0],          %[f2]           \n\t"
+    "add.s      %[f2],      %[f4],          %[f6]           \n\t"
+    "sub.s      %[f4],      %[f4],          %[f6]           \n\t"
+    "add.s      %[f6],      %[f1],          %[f3]           \n\t"
+    "sub.s      %[f1],      %[f3],          %[f1]           \n\t"
+    "add.s      %[f3],      %[f5],          %[f7]           \n\t"
+    "sub.s      %[f5],      %[f5],          %[f7]           \n\t"
+    "add.s      %[f7],      %[f8],          %[f2]           \n\t"
+    "sub.s      %[f8],      %[f8],          %[f2]           \n\t"
+    "sub.s      %[f2],      %[f1],          %[f4]           \n\t"
+    "add.s      %[f1],      %[f1],          %[f4]           \n\t"
+    "add.s      %[f4],      %[f3],          %[f6]           \n\t"
+    "sub.s      %[f6],      %[f3],          %[f6]           \n\t"
+    "sub.s      %[f3],      %[f0],          %[f5]           \n\t"
+    "add.s      %[f0],      %[f0],          %[f5]           \n\t"
+    "neg.s      %[f4],      %[f4]                           \n\t"
+    "swc1       %[f7],      0(%[tmp_a])                     \n\t"
+    "swc1       %[f8],      256(%[tmp_a])                   \n\t"
+    "swc1       %[f2],      132(%[tmp_a])                   \n\t"
+    "swc1       %[f1],      388(%[tmp_a])                   \n\t"
+    "swc1       %[f6],      260(%[tmp_a])                   \n\t"
+    "swc1       %[f3],      128(%[tmp_a])                   \n\t"
+    "swc1       %[f0],      384(%[tmp_a])                   \n\t"
+    "swc1       %[f4],       4(%[tmp_a])                     \n\t"
+    "bgtz       %[count],   1b                              \n\t"
+    " addiu     %[tmp_a],   %[tmp_a],       8               \n\t"
+    ".set       pop                                         \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a)
+    : "memory"
+  );
+}
+
+void rftfsub_128_mips(float* a) {
+  const float* c = rdft_w + 32;
+  const float f0 = 0.5f;
+  float* a1 = &a[2];
+  float* a2 = &a[126];
+  const float* c1 = &c[1];
+  const float* c2 = &c[31];
+  float f1, f2, f3 ,f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
+  int count;
+
+  __asm __volatile (
+    ".set      push                                             \n\t"
+    ".set      noreorder                                        \n\t"
+    "lwc1      %[f6],       0(%[c2])                            \n\t"
+    "lwc1      %[f1],       0(%[a1])                            \n\t"
+    "lwc1      %[f2],       0(%[a2])                            \n\t"
+    "lwc1      %[f3],       4(%[a1])                            \n\t"
+    "lwc1      %[f4],       4(%[a2])                            \n\t"
+    "lwc1      %[f5],       0(%[c1])                            \n\t"
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"
+    "addiu     %[count],    $zero,        15                    \n\t"
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"
+    "sub.s     %[f9],       %[f9],        %[f8]                 \n\t"
+    "add.s     %[f6],       %[f6],        %[f5]                 \n\t"
+#else
+    "nmsub.s   %[f9],       %[f9],        %[f5],      %[f8]     \n\t"
+    "madd.s    %[f6],       %[f6],        %[f5],      %[f7]     \n\t"
+#endif
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"
+    "sub.s     %[f3],       %[f3],        %[f6]                 \n\t"
+    "sub.s     %[f4],       %[f4],        %[f6]                 \n\t"
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    "addiu     %[a1],       %[a1],        8                     \n\t"
+    "addiu     %[a2],       %[a2],        -8                    \n\t"
+    "addiu     %[c1],       %[c1],        4                     \n\t"
+    "addiu     %[c2],       %[c2],        -4                    \n\t"
+   "1:                                                          \n\t"
+    "lwc1      %[f6],       0(%[c2])                            \n\t"
+    "lwc1      %[f1],       0(%[a1])                            \n\t"
+    "lwc1      %[f2],       0(%[a2])                            \n\t"
+    "lwc1      %[f3],       4(%[a1])                            \n\t"
+    "lwc1      %[f4],       4(%[a2])                            \n\t"
+    "lwc1      %[f5],       0(%[c1])                            \n\t"
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"
+    "lwc1      %[f10],      -4(%[c2])                           \n\t"
+    "lwc1      %[f11],      8(%[a1])                            \n\t"
+    "lwc1      %[f12],      -8(%[a2])                           \n\t"
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"
+    "lwc1      %[f13],      12(%[a1])                           \n\t"
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"
+    "lwc1      %[f15],      4(%[c1])                            \n\t"
+    "sub.s     %[f9],       %[f9],        %[f8]                 \n\t"
+    "add.s     %[f6],       %[f6],        %[f5]                 \n\t"
+#else
+    "lwc1      %[f13],      12(%[a1])                           \n\t"
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"
+    "lwc1      %[f15],      4(%[c1])                            \n\t"
+    "nmsub.s   %[f9],       %[f9],        %[f5],      %[f8]     \n\t"
+    "madd.s    %[f6],       %[f6],        %[f5],      %[f7]     \n\t"
+#endif
+    "sub.s     %[f10],      %[f0],        %[f10]                \n\t"
+    "sub.s     %[f5],       %[f11],       %[f12]                \n\t"
+    "add.s     %[f7],       %[f13],       %[f14]                \n\t"
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"
+    "sub.s     %[f3],       %[f3],        %[f6]                 \n\t"
+    "mul.s     %[f8],       %[f10],       %[f5]                 \n\t"
+    "mul.s     %[f10],      %[f10],       %[f7]                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f9],       %[f15],       %[f7]                 \n\t"
+    "mul.s     %[f15],      %[f15],       %[f5]                 \n\t"
+    "sub.s     %[f4],       %[f4],        %[f6]                 \n\t"
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "sub.s     %[f8],       %[f8],        %[f9]                 \n\t"
+    "add.s     %[f10],      %[f10],       %[f15]                \n\t"
+#else
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "sub.s     %[f4],       %[f4],        %[f6]                 \n\t"
+    "nmsub.s   %[f8],       %[f8],        %[f15],     %[f7]     \n\t"
+    "madd.s    %[f10],      %[f10],       %[f15],     %[f5]     \n\t"
+#endif
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    "sub.s     %[f11],      %[f11],       %[f8]                 \n\t"
+    "add.s     %[f12],      %[f12],       %[f8]                 \n\t"
+    "sub.s     %[f13],      %[f13],       %[f10]                \n\t"
+    "sub.s     %[f14],      %[f14],       %[f10]                \n\t"
+    "addiu     %[c2],       %[c2],        -8                    \n\t"
+    "addiu     %[c1],       %[c1],        8                     \n\t"
+    "swc1      %[f11],      8(%[a1])                            \n\t"
+    "swc1      %[f12],      -8(%[a2])                           \n\t"
+    "swc1      %[f13],      12(%[a1])                           \n\t"
+    "swc1      %[f14],      -4(%[a2])                           \n\t"
+    "addiu     %[a1],       %[a1],        16                    \n\t"
+    "addiu     %[count],    %[count],     -1                    \n\t"
+    "bgtz      %[count],    1b                                  \n\t"
+    " addiu    %[a2],       %[a2],        -16                   \n\t"
+    ".set      pop                                              \n\t"
+    : [a1] "+r" (a1), [a2] "+r" (a2), [c1] "+r" (c1), [c2] "+r" (c2),
+      [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), [f4] "=&f" (f4),
+      [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+      [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11), [f12] "=&f" (f12),
+      [f13] "=&f" (f13), [f14] "=&f" (f14), [f15] "=&f" (f15),
+      [count] "=&r" (count)
+    : [f0] "f" (f0)
+    : "memory"
+  );
+}
+
+void rftbsub_128_mips(float* a) {
+  const float *c = rdft_w + 32;
+  const float f0 = 0.5f;
+  float* a1 = &a[2];
+  float* a2 = &a[126];
+  const float* c1 = &c[1];
+  const float* c2 = &c[31];
+  float f1, f2, f3 ,f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
+  int count;
+
+  a[1] = -a[1];
+  a[65] = -a[65];
+
+  __asm __volatile (
+    ".set      push                                             \n\t"
+    ".set      noreorder                                        \n\t"
+    "lwc1      %[f6],       0(%[c2])                            \n\t"
+    "lwc1      %[f1],       0(%[a1])                            \n\t"
+    "lwc1      %[f2],       0(%[a2])                            \n\t"
+    "lwc1      %[f3],       4(%[a1])                            \n\t"
+    "lwc1      %[f4],       4(%[a2])                            \n\t"
+    "lwc1      %[f5],       0(%[c1])                            \n\t"
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"
+    "addiu     %[count],    $zero,        15                    \n\t"
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"
+    "add.s     %[f9],       %[f9],        %[f8]                 \n\t"
+    "sub.s     %[f6],       %[f6],        %[f5]                 \n\t"
+#else
+    "madd.s    %[f9],       %[f9],        %[f5],      %[f8]     \n\t"
+    "nmsub.s   %[f6],       %[f6],        %[f5],      %[f7]     \n\t"
+#endif
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"
+    "sub.s     %[f3],       %[f6],        %[f3]                 \n\t"
+    "sub.s     %[f4],       %[f6],        %[f4]                 \n\t"
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    "addiu     %[a1],       %[a1],        8                     \n\t"
+    "addiu     %[a2],       %[a2],        -8                    \n\t"
+    "addiu     %[c1],       %[c1],        4                     \n\t"
+    "addiu     %[c2],       %[c2],        -4                    \n\t"
+   "1:                                                          \n\t"
+    "lwc1      %[f6],       0(%[c2])                            \n\t"
+    "lwc1      %[f1],       0(%[a1])                            \n\t"
+    "lwc1      %[f2],       0(%[a2])                            \n\t"
+    "lwc1      %[f3],       4(%[a1])                            \n\t"
+    "lwc1      %[f4],       4(%[a2])                            \n\t"
+    "lwc1      %[f5],       0(%[c1])                            \n\t"
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"
+    "lwc1      %[f10],      -4(%[c2])                           \n\t"
+    "lwc1      %[f11],      8(%[a1])                            \n\t"
+    "lwc1      %[f12],      -8(%[a2])                           \n\t"
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"
+    "lwc1      %[f13],      12(%[a1])                           \n\t"
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"
+    "lwc1      %[f15],      4(%[c1])                            \n\t"
+    "add.s     %[f9],       %[f9],        %[f8]                 \n\t"
+    "sub.s     %[f6],       %[f6],        %[f5]                 \n\t"
+#else
+    "lwc1      %[f13],      12(%[a1])                           \n\t"
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"
+    "lwc1      %[f15],      4(%[c1])                            \n\t"
+    "madd.s    %[f9],       %[f9],        %[f5],      %[f8]     \n\t"
+    "nmsub.s   %[f6],       %[f6],        %[f5],      %[f7]     \n\t"
+#endif
+    "sub.s     %[f10],      %[f0],        %[f10]                \n\t"
+    "sub.s     %[f5],       %[f11],       %[f12]                \n\t"
+    "add.s     %[f7],       %[f13],       %[f14]                \n\t"
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"
+    "sub.s     %[f3],       %[f6],        %[f3]                 \n\t"
+    "mul.s     %[f8],       %[f10],       %[f5]                 \n\t"
+    "mul.s     %[f10],      %[f10],       %[f7]                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f9],       %[f15],       %[f7]                 \n\t"
+    "mul.s     %[f15],      %[f15],       %[f5]                 \n\t"
+    "sub.s     %[f4],       %[f6],        %[f4]                 \n\t"
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "add.s     %[f8],       %[f8],        %[f9]                 \n\t"
+    "sub.s     %[f10],      %[f10],       %[f15]                \n\t"
+#else
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "sub.s     %[f4],       %[f6],        %[f4]                 \n\t"
+    "madd.s    %[f8],       %[f8],        %[f15],     %[f7]     \n\t"
+    "nmsub.s   %[f10],      %[f10],       %[f15],     %[f5]     \n\t"
+#endif
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    "sub.s     %[f11],      %[f11],       %[f8]                 \n\t"
+    "add.s     %[f12],      %[f12],       %[f8]                 \n\t"
+    "sub.s     %[f13],      %[f10],       %[f13]                \n\t"
+    "sub.s     %[f14],      %[f10],       %[f14]                \n\t"
+    "addiu     %[c2],       %[c2],        -8                    \n\t"
+    "addiu     %[c1],       %[c1],        8                     \n\t"
+    "swc1      %[f11],      8(%[a1])                            \n\t"
+    "swc1      %[f12],      -8(%[a2])                           \n\t"
+    "swc1      %[f13],      12(%[a1])                           \n\t"
+    "swc1      %[f14],      -4(%[a2])                           \n\t"
+    "addiu     %[a1],       %[a1],        16                    \n\t"
+    "addiu     %[count],    %[count],     -1                    \n\t"
+    "bgtz      %[count],    1b                                  \n\t"
+    " addiu    %[a2],       %[a2],        -16                   \n\t"
+    ".set      pop                                              \n\t"
+    : [a1] "+r" (a1), [a2] "+r" (a2), [c1] "+r" (c1), [c2] "+r" (c2),
+      [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), [f4] "=&f" (f4),
+      [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+      [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11), [f12] "=&f" (f12),
+      [f13] "=&f" (f13), [f14] "=&f" (f14), [f15] "=&f" (f15),
+      [count] "=&r" (count)
+    : [f0] "f" (f0)
+    : "memory"
+  );
+}
+#endif
+
+}  // namespace webrtc
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_neon.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_neon.cc
new file mode 100644
index 0000000000..401387a643
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_neon.cc
@@ -0,0 +1,352 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The rdft AEC algorithm, neon version of speed-critical functions.
+ *
+ * Based on the sse2 version.
+ */
+
+#include "modules/audio_processing/utility/ooura_fft.h"
+
+#include <arm_neon.h>
+
+#include "modules/audio_processing/utility/ooura_fft_tables_common.h"
+#include "modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h"
+
+namespace webrtc {
+
+#if defined(WEBRTC_HAS_NEON)
+void cft1st_128_neon(float* a) {
+  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
+  int j, k2;
+
+  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
+    float32x4_t a00v = vld1q_f32(&a[j + 0]);
+    float32x4_t a04v = vld1q_f32(&a[j + 4]);
+    float32x4_t a08v = vld1q_f32(&a[j + 8]);
+    float32x4_t a12v = vld1q_f32(&a[j + 12]);
+    float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
+    float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
+    float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
+    float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
+    const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
+    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
+    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
+    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
+    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
+    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
+    float32x4_t x0v = vaddq_f32(a01v, a23v);
+    const float32x4_t x1v = vsubq_f32(a01v, a23v);
+    const float32x4_t x2v = vaddq_f32(a45v, a67v);
+    const float32x4_t x3v = vsubq_f32(a45v, a67v);
+    const float32x4_t x3w = vrev64q_f32(x3v);
+    float32x4_t x0w;
+    a01v = vaddq_f32(x0v, x2v);
+    x0v = vsubq_f32(x0v, x2v);
+    x0w = vrev64q_f32(x0v);
+    a45v = vmulq_f32(wk2rv, x0v);
+    a45v = vmlaq_f32(a45v, wk2iv, x0w);
+    x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
+    x0w = vrev64q_f32(x0v);
+    a23v = vmulq_f32(wk1rv, x0v);
+    a23v = vmlaq_f32(a23v, wk1iv, x0w);
+    x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
+    x0w = vrev64q_f32(x0v);
+    a67v = vmulq_f32(wk3rv, x0v);
+    a67v = vmlaq_f32(a67v, wk3iv, x0w);
+    a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
+    a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
+    a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
+    a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
+    vst1q_f32(&a[j + 0], a00v);
+    vst1q_f32(&a[j + 4], a04v);
+    vst1q_f32(&a[j + 8], a08v);
+    vst1q_f32(&a[j + 12], a12v);
+  }
+}
+
+void cftmdl_128_neon(float* a) {
+  int j;
+  const int l = 8;
+  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
+  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);
+
+  for (j = 0; j < l; j += 2) {
+    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
+    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
+    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
+    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
+    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
+    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
+    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
+    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
+    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
+    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
+    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
+    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
+    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
+    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
+    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
+    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
+    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
+    const float32x4_t x1_x3_add =
+        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+    const float32x4_t x1_x3_sub =
+        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
+    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
+    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
+    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
+    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
+    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
+    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
+    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
+    const float32x4_t xx1_rev = vrev64q_f32(xx1);
+    const float32x4_t yy4_rev = vrev64q_f32(yy4);
+
+    vst1_f32(&a[j + 0], vget_low_f32(xx0));
+    vst1_f32(&a[j + 32], vget_high_f32(xx0));
+    vst1_f32(&a[j + 16], vget_low_f32(xx1));
+    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));
+
+    a[j + 48] = -a[j + 48];
+
+    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
+    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
+    vst1_f32(&a[j + 40], vget_low_f32(yy4));
+    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
+  }
+
+  {
+    const int k = 64;
+    const int k1 = 2;
+    const int k2 = 2 * k1;
+    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
+    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
+    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
+    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
+    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
+    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
+    for (j = k; j < l + k; j += 2) {
+      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
+      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
+      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
+      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
+      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
+      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
+      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
+      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
+      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
+      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
+      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
+      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
+      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
+      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
+      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
+      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
+      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
+      const float32x4_t x1_x3_add =
+          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+      const float32x4_t x1_x3_sub =
+          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
+      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
+      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
+      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
+      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
+      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));
+
+      vst1_f32(&a[j + 0], vget_low_f32(xx));
+      vst1_f32(&a[j + 32], vget_high_f32(xx));
+      vst1_f32(&a[j + 16], vget_low_f32(xx4));
+      vst1_f32(&a[j + 48], vget_high_f32(xx4));
+      vst1_f32(&a[j + 8], vget_low_f32(xx12));
+      vst1_f32(&a[j + 40], vget_high_f32(xx12));
+      vst1_f32(&a[j + 24], vget_low_f32(xx22));
+      vst1_f32(&a[j + 56], vget_high_f32(xx22));
+    }
+  }
+}
+
+__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
+  // A B C D -> C D A B
+  const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));
+  // C D A B -> D C B A
+  return vrev64q_f32(rev);
+}
+
+void rftfsub_128_neon(float* a) {
+  const float* c = rdft_w + 32;
+  int j1, j2;
+  const float32x4_t mm_half = vdupq_n_f32(0.5f);
+
+  // Vectorized code (four at once).
+  // Note: commented number are indexes for the first iteration of the loop.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
+    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
+    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
+    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
+    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
+    // Load and shuffle 'a'.
+    //   2,   4,   6,   8,   3,   5,   7,   9
+    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
+    // 120, 122, 124, 126, 121, 123, 125, 127,
+    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
+    // 126, 124, 122, 120
+    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
+    // 127, 125, 123, 121
+    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
+    // Calculate 'x'.
+    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
+    // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
+    // 3-127, 5-125, 7-123, 9-121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr - wki * xi;
+    //    yi = wkr * xi + wki * xr;
+    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
+    const float32x4_t b_ = vmulq_f32(wki_, xi_);
+    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
+    const float32x4_t d_ = vmulq_f32(wki_, xr_);
+    const float32x4_t yr_ = vsubq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t yi_ = vaddq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+                                                // Update 'a'.
+                                                //    a[j2 + 0] -= yr;
+                                                //    a[j2 + 1] -= yi;
+                                                //    a[k2 + 0] += yr;
+                                                //    a[k2 + 1] -= yi;
+    // 126, 124, 122, 120,
+    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
+    // 127, 125, 123, 121,
+    const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
+    // Shuffle in right order and store.
+    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
+    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
+    // 124, 125, 126, 127, 120, 121, 122, 123
+    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
+    //   2,   4,   6,   8,
+    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
+    //   3,   5,   7,   9,
+    a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
+    //   2,   3,   4,   5,   6,   7,   8,   9,
+    vst2q_f32(&a[0 + j2], a_j2_p);
+
+    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
+    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
+  }
+
+  // Scalar code for the remaining items.
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    const int k2 = 128 - j2;
+    const int k1 = 32 - j1;
+    const float wkr = 0.5f - c[k1];
+    const float wki = c[j1];
+    const float xr = a[j2 + 0] - a[k2 + 0];
+    const float xi = a[j2 + 1] + a[k2 + 1];
+    const float yr = wkr * xr - wki * xi;
+    const float yi = wkr * xi + wki * xr;
+    a[j2 + 0] -= yr;
+    a[j2 + 1] -= yi;
+    a[k2 + 0] += yr;
+    a[k2 + 1] -= yi;
+  }
+}
+
+void rftbsub_128_neon(float* a) {
+  const float* c = rdft_w + 32;
+  int j1, j2;
+  const float32x4_t mm_half = vdupq_n_f32(0.5f);
+
+  a[1] = -a[1];
+  // Vectorized code (four at once).
+  //    Note: commented number are indexes for the first iteration of the loop.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
+    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
+    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
+    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
+    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
+    // Load and shuffle 'a'.
+    //   2,   4,   6,   8,   3,   5,   7,   9
+    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
+    // 120, 122, 124, 126, 121, 123, 125, 127,
+    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
+    // 126, 124, 122, 120
+    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
+    // 127, 125, 123, 121
+    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
+    // Calculate 'x'.
+    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
+    // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
+    // 3-127, 5-125, 7-123, 9-121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr - wki * xi;
+    //    yi = wkr * xi + wki * xr;
+    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
+    const float32x4_t b_ = vmulq_f32(wki_, xi_);
+    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
+    const float32x4_t d_ = vmulq_f32(wki_, xr_);
+    const float32x4_t yr_ = vaddq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t yi_ = vsubq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+                                                // Update 'a'.
+                                                //    a[j2 + 0] -= yr;
+                                                //    a[j2 + 1] -= yi;
+                                                //    a[k2 + 0] += yr;
+                                                //    a[k2 + 1] -= yi;
+    // 126, 124, 122, 120,
+    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
+    // 127, 125, 123, 121,
+    const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
+    // Shuffle in right order and store.
+    //   2,   3,   4,   5,   6,   7,   8,   9,
+    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
+    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
+    // 124, 125, 126, 127, 120, 121, 122, 123
+    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
+    //   2,   4,   6,   8,
+    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
+    //   3,   5,   7,   9,
+    a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
+    //   2,   3,   4,   5,   6,   7,   8,   9,
+    vst2q_f32(&a[0 + j2], a_j2_p);
+
+    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
+    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
+  }
+
+  // Scalar code for the remaining items.
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    const int k2 = 128 - j2;
+    const int k1 = 32 - j1;
+    const float wkr = 0.5f - c[k1];
+    const float wki = c[j1];
+    const float xr = a[j2 + 0] - a[k2 + 0];
+    const float xi = a[j2 + 1] + a[k2 + 1];
+    const float yr = wkr * xr + wki * xi;
+    const float yi = wkr * xi - wki * xr;
+    a[j2 + 0] = a[j2 + 0] - yr;
+    a[j2 + 1] = yi - a[j2 + 1];
+    a[k2 + 0] = yr + a[k2 + 0];
+    a[k2 + 1] = yi - a[k2 + 1];
+  }
+  a[65] = -a[65];
+}
+#endif
+
+}  // namespace webrtc
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_sse2.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_sse2.cc
new file mode 100644
index 0000000000..48a05c3bc2
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_sse2.cc
@@ -0,0 +1,438 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing//utility/ooura_fft.h"
+
+#include <emmintrin.h>
+
+#include "modules/audio_processing/utility/ooura_fft_tables_common.h"
+#include "modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h"
+
+namespace webrtc {
+
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+
+namespace {
+// These intrinsics were unavailable before VS 2008.
+// TODO(andrew): move to a common file.
+#if defined(_MSC_VER) && _MSC_VER < 1500
+static __inline __m128 _mm_castsi128_ps(__m128i a) {
+  return *(__m128*)&a;
+}
+static __inline __m128i _mm_castps_si128(__m128 a) {
+  return *(__m128i*)&a;
+}
+#endif
+
+}  // namespace
+
+void cft1st_128_SSE2(float* a) {
+  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
+  int j, k2;
+
+  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
+    __m128 a00v = _mm_loadu_ps(&a[j + 0]);
+    __m128 a04v = _mm_loadu_ps(&a[j + 4]);
+    __m128 a08v = _mm_loadu_ps(&a[j + 8]);
+    __m128 a12v = _mm_loadu_ps(&a[j + 12]);
+    __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
+    __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
+    __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
+    __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));
+
+    const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
+    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
+    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
+    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
+    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
+    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
+    __m128 x0v = _mm_add_ps(a01v, a23v);
+    const __m128 x1v = _mm_sub_ps(a01v, a23v);
+    const __m128 x2v = _mm_add_ps(a45v, a67v);
+    const __m128 x3v = _mm_sub_ps(a45v, a67v);
+    __m128 x0w;
+    a01v = _mm_add_ps(x0v, x2v);
+    x0v = _mm_sub_ps(x0v, x2v);
+    x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
+    {
+      const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
+      const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
+      a45v = _mm_add_ps(a45_0v, a45_1v);
+    }
+    {
+      __m128 a23_0v, a23_1v;
+      const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
+      const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
+      x0v = _mm_add_ps(x1v, x3s);
+      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
+      a23_0v = _mm_mul_ps(wk1rv, x0v);
+      a23_1v = _mm_mul_ps(wk1iv, x0w);
+      a23v = _mm_add_ps(a23_0v, a23_1v);
+
+      x0v = _mm_sub_ps(x1v, x3s);
+      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
+    }
+    {
+      const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
+      const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
+      a67v = _mm_add_ps(a67_0v, a67_1v);
+    }
+
+    a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
+    a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
+    a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
+    a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
+    _mm_storeu_ps(&a[j + 0], a00v);
+    _mm_storeu_ps(&a[j + 4], a04v);
+    _mm_storeu_ps(&a[j + 8], a08v);
+    _mm_storeu_ps(&a[j + 12], a12v);
+  }
+}
+
+void cftmdl_128_SSE2(float* a) {
+  const int l = 8;
+  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
+  int j0;
+
+  __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
+  for (j0 = 0; j0 < l; j0 += 2) {
+    const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
+    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
+    const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
+    const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
+    const __m128 a_00_32 =
+        _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
+                       _MM_SHUFFLE(1, 0, 1, 0));
+    const __m128 a_08_40 =
+        _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
+                       _MM_SHUFFLE(1, 0, 1, 0));
+    __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
+    const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
+
+    const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
+    const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
+    const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
+    const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
+    const __m128 a_16_48 =
+        _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
+                       _MM_SHUFFLE(1, 0, 1, 0));
+    const __m128 a_24_56 =
+        _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
+                       _MM_SHUFFLE(1, 0, 1, 0));
+    const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
+    const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
+
+    const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+
+    const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
+        _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
+    const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
+    const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+    const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+
+    const __m128 yy0 =
+        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
+    const __m128 yy1 =
+        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
+    const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
+    const __m128 yy3 = _mm_add_ps(yy0, yy2);
+    const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
+    _mm_storel_epi64(
+        (__m128i*)&a[j0 + 32],
+        _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
+    _mm_storel_epi64(
+        (__m128i*)&a[j0 + 48],
+        _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
+    a[j0 + 48] = -a[j0 + 48];
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
+    _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
+    _mm_storel_epi64(
+        (__m128i*)&a[j0 + 56],
+        _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
+  }
+
+  {
+    int k = 64;
+    int k1 = 2;
+    int k2 = 2 * k1;
+    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
+    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
+    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
+    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
+    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
+    wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
+    for (j0 = k; j0 < l + k; j0 += 2) {
+      const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
+      const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
+      const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
+      const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
+      const __m128 a_00_32 =
+          _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
+                         _MM_SHUFFLE(1, 0, 1, 0));
+      const __m128 a_08_40 =
+          _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
+                         _MM_SHUFFLE(1, 0, 1, 0));
+      __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
+      const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
+
+      const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
+      const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
+      const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
+      const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
+      const __m128 a_16_48 =
+          _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
+                         _MM_SHUFFLE(1, 0, 1, 0));
+      const __m128 a_24_56 =
+          _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
+                         _MM_SHUFFLE(1, 0, 1, 0));
+      const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
+      const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
+
+      const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
+      const __m128 xx3 = _mm_mul_ps(
+          wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
+                                                    _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx4 = _mm_add_ps(xx2, xx3);
+
+      const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
+          _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
+      const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
+      const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+      const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+
+      const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
+      const __m128 xx11 = _mm_mul_ps(
+          wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
+                                                    _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx12 = _mm_add_ps(xx10, xx11);
+
+      const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
+      const __m128 xx21 = _mm_mul_ps(
+          wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
+                                                    _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx22 = _mm_add_ps(xx20, xx21);
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 32],
+          _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 48],
+          _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 40],
+          _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 56],
+          _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
+    }
+  }
+}
+
+void rftfsub_128_SSE2(float* a) {
+  const float* c = rdft_w + 32;
+  int j1, j2, k1, k2;
+  float wkr, wki, xr, xi, yr, yi;
+
+  static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
+                                                          0.5f};
+  const __m128 mm_half = _mm_load_ps(k_half);
+
+  // Vectorized code (four at once).
+  //    Note: commented number are indexes for the first iteration of the loop.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
+    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
+    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
+    const __m128 wkr_ =
+        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
+    const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
+    // Load and shuffle 'a'.
+    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
+    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
+    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
+    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
+    const __m128 a_j2_p0 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
+    const __m128 a_j2_p1 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
+    const __m128 a_k2_p0 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
+    const __m128 a_k2_p1 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
+    // Calculate 'x'.
+    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
+    // 2-126, 4-124, 6-122, 8-120,
+    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
+    // 3-127, 5-125, 7-123, 9-121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr - wki * xi;
+    //    yi = wkr * xi + wki * xr;
+    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
+    const __m128 b_ = _mm_mul_ps(wki_, xi_);
+    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
+    const __m128 d_ = _mm_mul_ps(wki_, xr_);
+    const __m128 yr_ = _mm_sub_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const __m128 yi_ = _mm_add_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+                                            // Update 'a'.
+                                            //    a[j2 + 0] -= yr;
+                                            //    a[j2 + 1] -= yi;
+                                            //    a[k2 + 0] += yr;
+    //    a[k2 + 1] -= yi;
+    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
+    const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_);  //   3,   5,   7,   9,
+    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
+    const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_);  // 127, 125, 123, 121,
+    // Shuffle in right order and store.
+    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
+    //   2,   3,   4,   5,
+    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
+    //   6,   7,   8,   9,
+    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
+    // 122, 123, 120, 121,
+    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
+    // 126, 127, 124, 125,
+    const __m128 a_k2_0n = _mm_shuffle_ps(
+        a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
+    const __m128 a_k2_4n = _mm_shuffle_ps(
+        a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
+    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
+    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
+    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
+    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
+  }
+  // Scalar code for the remaining items.
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    k2 = 128 - j2;
+    k1 = 32 - j1;
+    wkr = 0.5f - c[k1];
+    wki = c[j1];
+    xr = a[j2 + 0] - a[k2 + 0];
+    xi = a[j2 + 1] + a[k2 + 1];
+    yr = wkr * xr - wki * xi;
+    yi = wkr * xi + wki * xr;
+    a[j2 + 0] -= yr;
+    a[j2 + 1] -= yi;
+    a[k2 + 0] += yr;
+    a[k2 + 1] -= yi;
+  }
+}
+
+void rftbsub_128_SSE2(float* a) {
+  const float* c = rdft_w + 32;
+  int j1, j2, k1, k2;
+  float wkr, wki, xr, xi, yr, yi;
+
+  static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
+                                                          0.5f};
+  const __m128 mm_half = _mm_load_ps(k_half);
+
+  a[1] = -a[1];
+  // Vectorized code (four at once).
+  //    Note: commented number are indexes for the first iteration of the loop.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
+    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
+    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
+    const __m128 wkr_ =
+        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
+    const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
+    // Load and shuffle 'a'.
+    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
+    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
+    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
+    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
+    const __m128 a_j2_p0 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
+    const __m128 a_j2_p1 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
+    const __m128 a_k2_p0 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
+    const __m128 a_k2_p1 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
+    // Calculate 'x'.
+    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
+    // 2-126, 4-124, 6-122, 8-120,
+    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
+    // 3-127, 5-125, 7-123, 9-121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr + wki * xi;
+    //    yi = wkr * xi - wki * xr;
+    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
+    const __m128 b_ = _mm_mul_ps(wki_, xi_);
+    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
+    const __m128 d_ = _mm_mul_ps(wki_, xr_);
+    const __m128 yr_ = _mm_add_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const __m128 yi_ = _mm_sub_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+                                            // Update 'a'.
+                                            //    a[j2 + 0] = a[j2 + 0] - yr;
+                                            //    a[j2 + 1] = yi - a[j2 + 1];
+                                            //    a[k2 + 0] = yr + a[k2 + 0];
+    //    a[k2 + 1] = yi - a[k2 + 1];
+    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
+    const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1);  //   3,   5,   7,   9,
+    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
+    const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121,
+    // Shuffle in right order and store.
+    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
+    //   2,   3,   4,   5,
+    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
+    //   6,   7,   8,   9,
+    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
+    // 122, 123, 120, 121,
+    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
+    // 126, 127, 124, 125,
+    const __m128 a_k2_0n = _mm_shuffle_ps(
+        a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
+    const __m128 a_k2_4n = _mm_shuffle_ps(
+        a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
+    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
+    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
+    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
+    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
+  }
+  // Scalar code for the remaining items.
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    k2 = 128 - j2;
+    k1 = 32 - j1;
+    wkr = 0.5f - c[k1];
+    wki = c[j1];
+    xr = a[j2 + 0] - a[k2 + 0];
+    xi = a[j2 + 1] + a[k2 + 1];
+    yr = wkr * xr + wki * xi;
+    yi = wkr * xi - wki * xr;
+    a[j2 + 0] = a[j2 + 0] - yr;
+    a[j2 + 1] = yi - a[j2 + 1];
+    a[k2 + 0] = yr + a[k2 + 0];
+    a[k2 + 1] = yi - a[k2 + 1];
+  }
+  a[65] = -a[65];
+}
+#endif
+
+}  // namespace webrtc
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h
new file mode 100644
index 0000000000..47d076ea2a
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_
+#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_
+
+#include "modules/audio_processing/utility/ooura_fft.h"
+
+namespace webrtc {
+
+// This tables used to be computed at run-time. For example, refer to:
+// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564
+// to see the initialization code.
+// Constants shared by all paths (C, SSE2, NEON).
+const float rdft_w[64] = {
+    1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f, 0.9238795638f,
+    0.3826834559f, 0.3826834559f, 0.9238795638f, 0.9807852507f, 0.1950903237f,
+    0.5555702448f, 0.8314695954f, 0.8314695954f, 0.5555702448f, 0.1950903237f,
+    0.9807852507f, 0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
+    0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f, 0.9569403529f,
+    0.2902846634f, 0.4713967443f, 0.8819212914f, 0.7730104327f, 0.6343933344f,
+    0.0980171412f, 0.9951847196f, 0.7071067691f, 0.4993977249f, 0.4975923598f,
+    0.4945882559f, 0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
+    0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f, 0.4157347977f,
+    0.4016037583f, 0.3865052164f, 0.3704755902f, 0.3535533845f, 0.3357794881f,
+    0.3171966672f, 0.2978496552f, 0.2777851224f, 0.2570513785f, 0.2356983721f,
+    0.2137775421f, 0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
+    0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
+};
+
+// Constants used by the C and MIPS paths.
+const float rdft_wk3ri_first[16] = {
+    1.000000000f, 0.000000000f, 0.382683456f,  0.923879564f,
+    0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
+    0.956940353f, 0.290284693f, 0.098017156f,  0.995184720f,
+    0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
+};
+const float rdft_wk3ri_second[16] = {
+    -0.707106769f, 0.707106769f,  -0.923879564f, -0.382683456f,
+    -0.980785251f, 0.195090353f,  -0.555570245f, -0.831469536f,
+    -0.881921172f, 0.471396863f,  -0.773010492f, -0.634393334f,
+    -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
+};
+
+}  // namespace webrtc
+
+#endif  // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h
new file mode 100644
index 0000000000..1c44ae7197
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h
@@ -0,0 +1,94 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
+#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
+
+#include "modules/audio_processing/utility/ooura_fft.h"
+
+#ifdef _MSC_VER /* visual c++ */
+#define ALIGN16_BEG __declspec(align(16))
+#define ALIGN16_END
+#else /* gcc or icc */
+#define ALIGN16_BEG
+#define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+namespace webrtc {
+
+// These tables used to be computed at run-time. For example, refer to:
+// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564
+// to see the initialization code.
+#if defined(WEBRTC_ARCH_X86_FAMILY) || defined(WEBRTC_HAS_NEON)
+// Constants used by SSE2 and NEON but initialized in the C path.
+const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
+
+ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
+    1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f, 0.923879564f,
+    0.923879564f, 0.382683456f, 0.382683456f, 0.980785251f, 0.980785251f,
+    0.555570245f, 0.555570245f, 0.831469595f, 0.831469595f, 0.195090324f,
+    0.195090324f, 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
+    0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f, 0.956940353f,
+    0.956940353f, 0.471396744f, 0.471396744f, 0.773010433f, 0.773010433f,
+    0.098017141f, 0.098017141f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
+    1.000000000f,  1.000000000f,  -0.000000000f, -0.000000000f, 0.707106769f,
+    0.707106769f,  -0.707106769f, -0.707106769f, 0.923879564f,  0.923879564f,
+    -0.382683456f, -0.382683456f, 0.382683456f,  0.382683456f,  -0.923879564f,
+    -0.923879564f, 0.980785251f,  0.980785251f,  -0.195090324f, -0.195090324f,
+    0.555570245f,  0.555570245f,  -0.831469595f, -0.831469595f, 0.831469595f,
+    0.831469595f,  -0.555570245f, -0.555570245f, 0.195090324f,  0.195090324f,
+    -0.980785251f, -0.980785251f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
+    1.000000000f,  1.000000000f,  -0.707106769f, -0.707106769f, 0.382683456f,
+    0.382683456f,  -0.923879564f, -0.923879564f, 0.831469536f,  0.831469536f,
+    -0.980785251f, -0.980785251f, -0.195090353f, -0.195090353f, -0.555570245f,
+    -0.555570245f, 0.956940353f,  0.956940353f,  -0.881921172f, -0.881921172f,
+    0.098017156f,  0.098017156f,  -0.773010492f, -0.773010492f, 0.634393334f,
+    0.634393334f,  -0.995184720f, -0.995184720f, -0.471396863f, -0.471396863f,
+    -0.290284693f, -0.290284693f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
+    -0.000000000f, 0.000000000f,  -0.707106769f, 0.707106769f,  -0.382683456f,
+    0.382683456f,  -0.923879564f, 0.923879564f,  -0.195090324f, 0.195090324f,
+    -0.831469595f, 0.831469595f,  -0.555570245f, 0.555570245f,  -0.980785251f,
+    0.980785251f,  -0.098017141f, 0.098017141f,  -0.773010433f, 0.773010433f,
+    -0.471396744f, 0.471396744f,  -0.956940353f, 0.956940353f,  -0.290284663f,
+    0.290284663f,  -0.881921291f, 0.881921291f,  -0.634393334f, 0.634393334f,
+    -0.995184720f, 0.995184720f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
+    -0.000000000f, 0.000000000f,  -1.000000000f, 1.000000000f,  -0.707106769f,
+    0.707106769f,  -0.707106769f, 0.707106769f,  -0.382683456f, 0.382683456f,
+    -0.923879564f, 0.923879564f,  -0.923879564f, 0.923879564f,  -0.382683456f,
+    0.382683456f,  -0.195090324f, 0.195090324f,  -0.980785251f, 0.980785251f,
+    -0.831469595f, 0.831469595f,  -0.555570245f, 0.555570245f,  -0.555570245f,
+    0.555570245f,  -0.831469595f, 0.831469595f,  -0.980785251f, 0.980785251f,
+    -0.195090324f, 0.195090324f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
+    -0.000000000f, 0.000000000f,  -0.707106769f, 0.707106769f,  -0.923879564f,
+    0.923879564f,  0.382683456f,  -0.382683456f, -0.555570245f, 0.555570245f,
+    -0.195090353f, 0.195090353f,  -0.980785251f, 0.980785251f,  0.831469536f,
+    -0.831469536f, -0.290284693f, 0.290284693f,  -0.471396863f, 0.471396863f,
+    -0.995184720f, 0.995184720f,  0.634393334f,  -0.634393334f, -0.773010492f,
+    0.773010492f,  0.098017156f,  -0.098017156f, -0.881921172f, 0.881921172f,
+    0.956940353f,  -0.956940353f,
+};
+ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
+    0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f,
+};
+#endif
+
+}  // namespace webrtc
+
+#endif  // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_