diff options
Diffstat (limited to 'third_party/libwebrtc/webrtc/modules/audio_processing/utility')
16 files changed, 5239 insertions, 0 deletions
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.cc new file mode 100644 index 0000000000..3d766929c6 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.cc @@ -0,0 +1,53 @@ +/* + * Copyright 2016 The WebRTC Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/utility/block_mean_calculator.h" + +#include "rtc_base/checks.h" + +namespace webrtc { + +BlockMeanCalculator::BlockMeanCalculator(size_t block_length) + : block_length_(block_length), + count_(0), + sum_(0.0), + mean_(0.0) { + RTC_DCHECK(block_length_ != 0); +} + +void BlockMeanCalculator::Reset() { + Clear(); + mean_ = 0.0; +} + +void BlockMeanCalculator::AddValue(float value) { + sum_ += value; + ++count_; + if (count_ == block_length_) { + mean_ = sum_ / block_length_; + Clear(); + } +} + +bool BlockMeanCalculator::EndOfBlock() const { + return count_ == 0; +} + +float BlockMeanCalculator::GetLatestMean() const { + return mean_; +} + +// Flush all samples added. +void BlockMeanCalculator::Clear() { + count_ = 0; + sum_ = 0.0; +} + +} // namespace webrtc diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.h new file mode 100644 index 0000000000..cfa7cfbeba --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator.h @@ -0,0 +1,52 @@ +/* + * Copyright 2016 The WebRTC Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_AUDIO_PROCESSING_UTILITY_BLOCK_MEAN_CALCULATOR_H_ +#define MODULES_AUDIO_PROCESSING_UTILITY_BLOCK_MEAN_CALCULATOR_H_ + +#include <stddef.h> + +#include "rtc_base/constructormagic.h" + +namespace webrtc { + +// BlockMeanCalculator calculates the mean of a block of values. Values are +// added one after another, and the mean is updated at the end of every block. +class BlockMeanCalculator { + public: + explicit BlockMeanCalculator(size_t block_length); + + // Reset. + void Reset(); + + // Add one value to the sequence. + void AddValue(float value); + + // Return whether the latest added value was at the end of a block. + bool EndOfBlock() const; + + // Return the latest mean. + float GetLatestMean() const; + + private: + // Clear all values added. + void Clear(); + + const size_t block_length_; + size_t count_; + float sum_; + float mean_; + + RTC_DISALLOW_COPY_AND_ASSIGN(BlockMeanCalculator); +}; + +} // namespace webrtc + +#endif // MODULES_AUDIO_PROCESSING_UTILITY_BLOCK_MEAN_CALCULATOR_H_ diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator_unittest.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator_unittest.cc new file mode 100644 index 0000000000..1f4ebf1b67 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/block_mean_calculator_unittest.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/utility/block_mean_calculator.h" +#include "test/gtest.h" + +namespace webrtc { + +TEST(MeanCalculatorTest, Correctness) { + const size_t kBlockLength = 10; + BlockMeanCalculator mean_calculator(kBlockLength); + size_t i = 0; + float reference = 0.0; + + for (; i < kBlockLength - 1; ++i) { + mean_calculator.AddValue(static_cast<float>(i)); + EXPECT_FALSE(mean_calculator.EndOfBlock()); + } + mean_calculator.AddValue(static_cast<float>(i++)); + EXPECT_TRUE(mean_calculator.EndOfBlock()); + + for (; i < 3 * kBlockLength; ++i) { + const bool end_of_block = i % kBlockLength == 0; + if (end_of_block) { + // Sum of (i - kBlockLength) ... (i - 1) + reference = i - 0.5 * (1 + kBlockLength); + } + EXPECT_EQ(mean_calculator.EndOfBlock(), end_of_block); + EXPECT_EQ(reference, mean_calculator.GetLatestMean()); + mean_calculator.AddValue(static_cast<float>(i)); + } +} + +TEST(MeanCalculatorTest, Reset) { + const size_t kBlockLength = 10; + BlockMeanCalculator mean_calculator(kBlockLength); + for (size_t i = 0; i < kBlockLength - 1; ++i) { + mean_calculator.AddValue(static_cast<float>(i)); + } + mean_calculator.Reset(); + size_t i = 0; + for (; i < kBlockLength - 1; ++i) { + mean_calculator.AddValue(static_cast<float>(i)); + EXPECT_FALSE(mean_calculator.EndOfBlock()); + } + mean_calculator.AddValue(static_cast<float>(i)); + EXPECT_TRUE(mean_calculator.EndOfBlock()); + EXPECT_EQ(mean_calculator.GetLatestMean(), 0.5 * (kBlockLength - 1)); +} + +} // namespace webrtc diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.cc new file mode 100644 index 0000000000..871b541651 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.cc @@ -0,0 +1,703 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/utility/delay_estimator.h" + +#include <stdlib.h> +#include <string.h> +#include <algorithm> + +#include "rtc_base/checks.h" + +// Number of right shifts for scaling is linearly depending on number of bits in +// the far-end binary spectrum. +static const int kShiftsAtZero = 13; // Right shifts at zero binary spectrum. +static const int kShiftsLinearSlope = 3; + +static const int32_t kProbabilityOffset = 1024; // 2 in Q9. +static const int32_t kProbabilityLowerLimit = 8704; // 17 in Q9. +static const int32_t kProbabilityMinSpread = 2816; // 5.5 in Q9. + +// Robust validation settings +static const float kHistogramMax = 3000.f; +static const float kLastHistogramMax = 250.f; +static const float kMinHistogramThreshold = 1.5f; +static const int kMinRequiredHits = 10; +static const int kMaxHitsWhenPossiblyNonCausal = 10; +static const int kMaxHitsWhenPossiblyCausal = 1000; +static const float kQ14Scaling = 1.f / (1 << 14); // Scaling by 2^14 to get Q0. +static const float kFractionSlope = 0.05f; +static const float kMinFractionWhenPossiblyCausal = 0.5f; +static const float kMinFractionWhenPossiblyNonCausal = 0.25f; + +// Counts and returns number of bits of a 32-bit word. +static int BitCount(uint32_t u32) { + uint32_t tmp = u32 - ((u32 >> 1) & 033333333333) - + ((u32 >> 2) & 011111111111); + tmp = ((tmp + (tmp >> 3)) & 030707070707); + tmp = (tmp + (tmp >> 6)); + tmp = (tmp + (tmp >> 12) + (tmp >> 24)) & 077; + + return ((int) tmp); +} + +// Compares the |binary_vector| with all rows of the |binary_matrix| and counts +// per row the number of times they have the same value. +// +// Inputs: +// - binary_vector : binary "vector" stored in a long +// - binary_matrix : binary "matrix" stored as a vector of long +// - matrix_size : size of binary "matrix" +// +// Output: +// - bit_counts : "Vector" stored as a long, containing for each +// row the number of times the matrix row and the +// input vector have the same value +// +static void BitCountComparison(uint32_t binary_vector, + const uint32_t* binary_matrix, + int matrix_size, + int32_t* bit_counts) { + int n = 0; + + // Compare |binary_vector| with all rows of the |binary_matrix| + for (; n < matrix_size; n++) { + bit_counts[n] = (int32_t) BitCount(binary_vector ^ binary_matrix[n]); + } +} + +// Collects necessary statistics for the HistogramBasedValidation(). This +// function has to be called prior to calling HistogramBasedValidation(). The +// statistics updated and used by the HistogramBasedValidation() are: +// 1. the number of |candidate_hits|, which states for how long we have had the +// same |candidate_delay| +// 2. the |histogram| of candidate delays over time. This histogram is +// weighted with respect to a reliability measure and time-varying to cope +// with possible delay shifts. +// For further description see commented code. +// +// Inputs: +// - candidate_delay : The delay to validate. +// - valley_depth_q14 : The cost function has a valley/minimum at the +// |candidate_delay| location. |valley_depth_q14| is the +// cost function difference between the minimum and +// maximum locations. The value is in the Q14 domain. +// - valley_level_q14 : Is the cost function value at the minimum, in Q14. +static void UpdateRobustValidationStatistics(BinaryDelayEstimator* self, + int candidate_delay, + int32_t valley_depth_q14, + int32_t valley_level_q14) { + const float valley_depth = valley_depth_q14 * kQ14Scaling; + float decrease_in_last_set = valley_depth; + const int max_hits_for_slow_change = (candidate_delay < self->last_delay) ? + kMaxHitsWhenPossiblyNonCausal : kMaxHitsWhenPossiblyCausal; + int i = 0; + + RTC_DCHECK_EQ(self->history_size, self->farend->history_size); + // Reset |candidate_hits| if we have a new candidate. + if (candidate_delay != self->last_candidate_delay) { + self->candidate_hits = 0; + self->last_candidate_delay = candidate_delay; + } + self->candidate_hits++; + + // The |histogram| is updated differently across the bins. + // 1. The |candidate_delay| histogram bin is increased with the + // |valley_depth|, which is a simple measure of how reliable the + // |candidate_delay| is. The histogram is not increased above + // |kHistogramMax|. + self->histogram[candidate_delay] += valley_depth; + if (self->histogram[candidate_delay] > kHistogramMax) { + self->histogram[candidate_delay] = kHistogramMax; + } + // 2. The histogram bins in the neighborhood of |candidate_delay| are + // unaffected. The neighborhood is defined as x + {-2, -1, 0, 1}. + // 3. The histogram bins in the neighborhood of |last_delay| are decreased + // with |decrease_in_last_set|. This value equals the difference between + // the cost function values at the locations |candidate_delay| and + // |last_delay| until we reach |max_hits_for_slow_change| consecutive hits + // at the |candidate_delay|. If we exceed this amount of hits the + // |candidate_delay| is a "potential" candidate and we start decreasing + // these histogram bins more rapidly with |valley_depth|. + if (self->candidate_hits < max_hits_for_slow_change) { + decrease_in_last_set = (self->mean_bit_counts[self->compare_delay] - + valley_level_q14) * kQ14Scaling; + } + // 4. All other bins are decreased with |valley_depth|. + // TODO(bjornv): Investigate how to make this loop more efficient. Split up + // the loop? Remove parts that doesn't add too much. + for (i = 0; i < self->history_size; ++i) { + int is_in_last_set = (i >= self->last_delay - 2) && + (i <= self->last_delay + 1) && (i != candidate_delay); + int is_in_candidate_set = (i >= candidate_delay - 2) && + (i <= candidate_delay + 1); + self->histogram[i] -= decrease_in_last_set * is_in_last_set + + valley_depth * (!is_in_last_set && !is_in_candidate_set); + // 5. No histogram bin can go below 0. + if (self->histogram[i] < 0) { + self->histogram[i] = 0; + } + } +} + +// Validates the |candidate_delay|, estimated in WebRtc_ProcessBinarySpectrum(), +// based on a mix of counting concurring hits with a modified histogram +// of recent delay estimates. In brief a candidate is valid (returns 1) if it +// is the most likely according to the histogram. There are a couple of +// exceptions that are worth mentioning: +// 1. If the |candidate_delay| < |last_delay| it can be that we are in a +// non-causal state, breaking a possible echo control algorithm. Hence, we +// open up for a quicker change by allowing the change even if the +// |candidate_delay| is not the most likely one according to the histogram. +// 2. There's a minimum number of hits (kMinRequiredHits) and the histogram +// value has to reached a minimum (kMinHistogramThreshold) to be valid. +// 3. The action is also depending on the filter length used for echo control. +// If the delay difference is larger than what the filter can capture, we +// also move quicker towards a change. +// For further description see commented code. +// +// Input: +// - candidate_delay : The delay to validate. +// +// Return value: +// - is_histogram_valid : 1 - The |candidate_delay| is valid. +// 0 - Otherwise. +static int HistogramBasedValidation(const BinaryDelayEstimator* self, + int candidate_delay) { + float fraction = 1.f; + float histogram_threshold = self->histogram[self->compare_delay]; + const int delay_difference = candidate_delay - self->last_delay; + int is_histogram_valid = 0; + + // The histogram based validation of |candidate_delay| is done by comparing + // the |histogram| at bin |candidate_delay| with a |histogram_threshold|. + // This |histogram_threshold| equals a |fraction| of the |histogram| at bin + // |last_delay|. The |fraction| is a piecewise linear function of the + // |delay_difference| between the |candidate_delay| and the |last_delay| + // allowing for a quicker move if + // i) a potential echo control filter can not handle these large differences. + // ii) keeping |last_delay| instead of updating to |candidate_delay| could + // force an echo control into a non-causal state. + // We further require the histogram to have reached a minimum value of + // |kMinHistogramThreshold|. In addition, we also require the number of + // |candidate_hits| to be more than |kMinRequiredHits| to remove spurious + // values. + + // Calculate a comparison histogram value (|histogram_threshold|) that is + // depending on the distance between the |candidate_delay| and |last_delay|. + // TODO(bjornv): How much can we gain by turning the fraction calculation + // into tables? + if (delay_difference > self->allowed_offset) { + fraction = 1.f - kFractionSlope * (delay_difference - self->allowed_offset); + fraction = (fraction > kMinFractionWhenPossiblyCausal ? fraction : + kMinFractionWhenPossiblyCausal); + } else if (delay_difference < 0) { + fraction = kMinFractionWhenPossiblyNonCausal - + kFractionSlope * delay_difference; + fraction = (fraction > 1.f ? 1.f : fraction); + } + histogram_threshold *= fraction; + histogram_threshold = (histogram_threshold > kMinHistogramThreshold ? + histogram_threshold : kMinHistogramThreshold); + + is_histogram_valid = + (self->histogram[candidate_delay] >= histogram_threshold) && + (self->candidate_hits > kMinRequiredHits); + + return is_histogram_valid; +} + +// Performs a robust validation of the |candidate_delay| estimated in +// WebRtc_ProcessBinarySpectrum(). The algorithm takes the +// |is_instantaneous_valid| and the |is_histogram_valid| and combines them +// into a robust validation. The HistogramBasedValidation() has to be called +// prior to this call. +// For further description on how the combination is done, see commented code. +// +// Inputs: +// - candidate_delay : The delay to validate. +// - is_instantaneous_valid : The instantaneous validation performed in +// WebRtc_ProcessBinarySpectrum(). +// - is_histogram_valid : The histogram based validation. +// +// Return value: +// - is_robust : 1 - The candidate_delay is valid according to a +// combination of the two inputs. +// : 0 - Otherwise. +static int RobustValidation(const BinaryDelayEstimator* self, + int candidate_delay, + int is_instantaneous_valid, + int is_histogram_valid) { + int is_robust = 0; + + // The final robust validation is based on the two algorithms; 1) the + // |is_instantaneous_valid| and 2) the histogram based with result stored in + // |is_histogram_valid|. + // i) Before we actually have a valid estimate (|last_delay| == -2), we say + // a candidate is valid if either algorithm states so + // (|is_instantaneous_valid| OR |is_histogram_valid|). + is_robust = (self->last_delay < 0) && + (is_instantaneous_valid || is_histogram_valid); + // ii) Otherwise, we need both algorithms to be certain + // (|is_instantaneous_valid| AND |is_histogram_valid|) + is_robust |= is_instantaneous_valid && is_histogram_valid; + // iii) With one exception, i.e., the histogram based algorithm can overrule + // the instantaneous one if |is_histogram_valid| = 1 and the histogram + // is significantly strong. + is_robust |= is_histogram_valid && + (self->histogram[candidate_delay] > self->last_delay_histogram); + + return is_robust; +} + +void WebRtc_FreeBinaryDelayEstimatorFarend(BinaryDelayEstimatorFarend* self) { + + if (self == NULL) { + return; + } + + free(self->binary_far_history); + self->binary_far_history = NULL; + + free(self->far_bit_counts); + self->far_bit_counts = NULL; + + free(self); +} + +BinaryDelayEstimatorFarend* WebRtc_CreateBinaryDelayEstimatorFarend( + int history_size) { + BinaryDelayEstimatorFarend* self = NULL; + + if (history_size > 1) { + // Sanity conditions fulfilled. + self = static_cast<BinaryDelayEstimatorFarend*>( + malloc(sizeof(BinaryDelayEstimatorFarend))); + } + if (self == NULL) { + return NULL; + } + + self->history_size = 0; + self->binary_far_history = NULL; + self->far_bit_counts = NULL; + if (WebRtc_AllocateFarendBufferMemory(self, history_size) == 0) { + WebRtc_FreeBinaryDelayEstimatorFarend(self); + self = NULL; + } + return self; +} + +int WebRtc_AllocateFarendBufferMemory(BinaryDelayEstimatorFarend* self, + int history_size) { + RTC_DCHECK(self); + // (Re-)Allocate memory for history buffers. + self->binary_far_history = static_cast<uint32_t*>( + realloc(self->binary_far_history, + history_size * sizeof(*self->binary_far_history))); + self->far_bit_counts = static_cast<int*>( + realloc(self->far_bit_counts, + history_size * sizeof(*self->far_bit_counts))); + if ((self->binary_far_history == NULL) || (self->far_bit_counts == NULL)) { + history_size = 0; + } + // Fill with zeros if we have expanded the buffers. + if (history_size > self->history_size) { + int size_diff = history_size - self->history_size; + memset(&self->binary_far_history[self->history_size], + 0, + sizeof(*self->binary_far_history) * size_diff); + memset(&self->far_bit_counts[self->history_size], + 0, + sizeof(*self->far_bit_counts) * size_diff); + } + self->history_size = history_size; + + return self->history_size; +} + +void WebRtc_InitBinaryDelayEstimatorFarend(BinaryDelayEstimatorFarend* self) { + RTC_DCHECK(self); + memset(self->binary_far_history, 0, sizeof(uint32_t) * self->history_size); + memset(self->far_bit_counts, 0, sizeof(int) * self->history_size); +} + +void WebRtc_SoftResetBinaryDelayEstimatorFarend( + BinaryDelayEstimatorFarend* self, int delay_shift) { + int abs_shift = abs(delay_shift); + int shift_size = 0; + int dest_index = 0; + int src_index = 0; + int padding_index = 0; + + RTC_DCHECK(self); + shift_size = self->history_size - abs_shift; + RTC_DCHECK_GT(shift_size, 0); + if (delay_shift == 0) { + return; + } else if (delay_shift > 0) { + dest_index = abs_shift; + } else if (delay_shift < 0) { + src_index = abs_shift; + padding_index = shift_size; + } + + // Shift and zero pad buffers. + memmove(&self->binary_far_history[dest_index], + &self->binary_far_history[src_index], + sizeof(*self->binary_far_history) * shift_size); + memset(&self->binary_far_history[padding_index], 0, + sizeof(*self->binary_far_history) * abs_shift); + memmove(&self->far_bit_counts[dest_index], + &self->far_bit_counts[src_index], + sizeof(*self->far_bit_counts) * shift_size); + memset(&self->far_bit_counts[padding_index], 0, + sizeof(*self->far_bit_counts) * abs_shift); +} + +void WebRtc_AddBinaryFarSpectrum(BinaryDelayEstimatorFarend* handle, + uint32_t binary_far_spectrum) { + RTC_DCHECK(handle); + // Shift binary spectrum history and insert current |binary_far_spectrum|. + memmove(&(handle->binary_far_history[1]), &(handle->binary_far_history[0]), + (handle->history_size - 1) * sizeof(uint32_t)); + handle->binary_far_history[0] = binary_far_spectrum; + + // Shift history of far-end binary spectrum bit counts and insert bit count + // of current |binary_far_spectrum|. + memmove(&(handle->far_bit_counts[1]), &(handle->far_bit_counts[0]), + (handle->history_size - 1) * sizeof(int)); + handle->far_bit_counts[0] = BitCount(binary_far_spectrum); +} + +void WebRtc_FreeBinaryDelayEstimator(BinaryDelayEstimator* self) { + + if (self == NULL) { + return; + } + + free(self->mean_bit_counts); + self->mean_bit_counts = NULL; + + free(self->bit_counts); + self->bit_counts = NULL; + + free(self->binary_near_history); + self->binary_near_history = NULL; + + free(self->histogram); + self->histogram = NULL; + + // BinaryDelayEstimator does not have ownership of |farend|, hence we do not + // free the memory here. That should be handled separately by the user. + self->farend = NULL; + + free(self); +} + +BinaryDelayEstimator* WebRtc_CreateBinaryDelayEstimator( + BinaryDelayEstimatorFarend* farend, int max_lookahead) { + BinaryDelayEstimator* self = NULL; + + if ((farend != NULL) && (max_lookahead >= 0)) { + // Sanity conditions fulfilled. + self = static_cast<BinaryDelayEstimator*>( + malloc(sizeof(BinaryDelayEstimator))); + } + if (self == NULL) { + return NULL; + } + + self->farend = farend; + self->near_history_size = max_lookahead + 1; + self->history_size = 0; + self->robust_validation_enabled = 0; // Disabled by default. + self->allowed_offset = 0; + + self->lookahead = max_lookahead; + + // Allocate memory for spectrum and history buffers. + self->mean_bit_counts = NULL; + self->bit_counts = NULL; + self->histogram = NULL; + self->binary_near_history = static_cast<uint32_t*>( + malloc((max_lookahead + 1) * sizeof(*self->binary_near_history))); + if (self->binary_near_history == NULL || + WebRtc_AllocateHistoryBufferMemory(self, farend->history_size) == 0) { + WebRtc_FreeBinaryDelayEstimator(self); + self = NULL; + } + + return self; +} + +int WebRtc_AllocateHistoryBufferMemory(BinaryDelayEstimator* self, + int history_size) { + BinaryDelayEstimatorFarend* far = self->farend; + // (Re-)Allocate memory for spectrum and history buffers. + if (history_size != far->history_size) { + // Only update far-end buffers if we need. + history_size = WebRtc_AllocateFarendBufferMemory(far, history_size); + } + // The extra array element in |mean_bit_counts| and |histogram| is a dummy + // element only used while |last_delay| == -2, i.e., before we have a valid + // estimate. + self->mean_bit_counts = static_cast<int32_t*>( + realloc(self->mean_bit_counts, + (history_size + 1) * sizeof(*self->mean_bit_counts))); + self->bit_counts = static_cast<int32_t*>( + realloc(self->bit_counts, history_size * sizeof(*self->bit_counts))); + self->histogram = static_cast<float*>( + realloc(self->histogram, (history_size + 1) * sizeof(*self->histogram))); + + if ((self->mean_bit_counts == NULL) || + (self->bit_counts == NULL) || + (self->histogram == NULL)) { + history_size = 0; + } + // Fill with zeros if we have expanded the buffers. + if (history_size > self->history_size) { + int size_diff = history_size - self->history_size; + memset(&self->mean_bit_counts[self->history_size], + 0, + sizeof(*self->mean_bit_counts) * size_diff); + memset(&self->bit_counts[self->history_size], + 0, + sizeof(*self->bit_counts) * size_diff); + memset(&self->histogram[self->history_size], + 0, + sizeof(*self->histogram) * size_diff); + } + self->history_size = history_size; + + return self->history_size; +} + +void WebRtc_InitBinaryDelayEstimator(BinaryDelayEstimator* self) { + int i = 0; + RTC_DCHECK(self); + + memset(self->bit_counts, 0, sizeof(int32_t) * self->history_size); + memset(self->binary_near_history, + 0, + sizeof(uint32_t) * self->near_history_size); + for (i = 0; i <= self->history_size; ++i) { + self->mean_bit_counts[i] = (20 << 9); // 20 in Q9. + self->histogram[i] = 0.f; + } + self->minimum_probability = kMaxBitCountsQ9; // 32 in Q9. + self->last_delay_probability = (int) kMaxBitCountsQ9; // 32 in Q9. + + // Default return value if we're unable to estimate. -1 is used for errors. + self->last_delay = -2; + + self->last_candidate_delay = -2; + self->compare_delay = self->history_size; + self->candidate_hits = 0; + self->last_delay_histogram = 0.f; +} + +int WebRtc_SoftResetBinaryDelayEstimator(BinaryDelayEstimator* self, + int delay_shift) { + int lookahead = 0; + RTC_DCHECK(self); + lookahead = self->lookahead; + self->lookahead -= delay_shift; + if (self->lookahead < 0) { + self->lookahead = 0; + } + if (self->lookahead > self->near_history_size - 1) { + self->lookahead = self->near_history_size - 1; + } + return lookahead - self->lookahead; +} + +int WebRtc_ProcessBinarySpectrum(BinaryDelayEstimator* self, + uint32_t binary_near_spectrum) { + int i = 0; + int candidate_delay = -1; + int valid_candidate = 0; + + int32_t value_best_candidate = kMaxBitCountsQ9; + int32_t value_worst_candidate = 0; + int32_t valley_depth = 0; + + RTC_DCHECK(self); + if (self->farend->history_size != self->history_size) { + // Non matching history sizes. + return -1; + } + if (self->near_history_size > 1) { + // If we apply lookahead, shift near-end binary spectrum history. Insert + // current |binary_near_spectrum| and pull out the delayed one. + memmove(&(self->binary_near_history[1]), &(self->binary_near_history[0]), + (self->near_history_size - 1) * sizeof(uint32_t)); + self->binary_near_history[0] = binary_near_spectrum; + binary_near_spectrum = self->binary_near_history[self->lookahead]; + } + + // Compare with delayed spectra and store the |bit_counts| for each delay. + BitCountComparison(binary_near_spectrum, self->farend->binary_far_history, + self->history_size, self->bit_counts); + + // Update |mean_bit_counts|, which is the smoothed version of |bit_counts|. + for (i = 0; i < self->history_size; i++) { + // |bit_counts| is constrained to [0, 32], meaning we can smooth with a + // factor up to 2^26. We use Q9. + int32_t bit_count = (self->bit_counts[i] << 9); // Q9. + + // Update |mean_bit_counts| only when far-end signal has something to + // contribute. If |far_bit_counts| is zero the far-end signal is weak and + // we likely have a poor echo condition, hence don't update. + if (self->farend->far_bit_counts[i] > 0) { + // Make number of right shifts piecewise linear w.r.t. |far_bit_counts|. + int shifts = kShiftsAtZero; + shifts -= (kShiftsLinearSlope * self->farend->far_bit_counts[i]) >> 4; + WebRtc_MeanEstimatorFix(bit_count, shifts, &(self->mean_bit_counts[i])); + } + } + + // Find |candidate_delay|, |value_best_candidate| and |value_worst_candidate| + // of |mean_bit_counts|. + for (i = 0; i < self->history_size; i++) { + if (self->mean_bit_counts[i] < value_best_candidate) { + value_best_candidate = self->mean_bit_counts[i]; + candidate_delay = i; + } + if (self->mean_bit_counts[i] > value_worst_candidate) { + value_worst_candidate = self->mean_bit_counts[i]; + } + } + valley_depth = value_worst_candidate - value_best_candidate; + + // The |value_best_candidate| is a good indicator on the probability of + // |candidate_delay| being an accurate delay (a small |value_best_candidate| + // means a good binary match). In the following sections we make a decision + // whether to update |last_delay| or not. + // 1) If the difference bit counts between the best and the worst delay + // candidates is too small we consider the situation to be unreliable and + // don't update |last_delay|. + // 2) If the situation is reliable we update |last_delay| if the value of the + // best candidate delay has a value less than + // i) an adaptive threshold |minimum_probability|, or + // ii) this corresponding value |last_delay_probability|, but updated at + // this time instant. + + // Update |minimum_probability|. + if ((self->minimum_probability > kProbabilityLowerLimit) && + (valley_depth > kProbabilityMinSpread)) { + // The "hard" threshold can't be lower than 17 (in Q9). + // The valley in the curve also has to be distinct, i.e., the + // difference between |value_worst_candidate| and |value_best_candidate| has + // to be large enough. + int32_t threshold = value_best_candidate + kProbabilityOffset; + if (threshold < kProbabilityLowerLimit) { + threshold = kProbabilityLowerLimit; + } + if (self->minimum_probability > threshold) { + self->minimum_probability = threshold; + } + } + // Update |last_delay_probability|. + // We use a Markov type model, i.e., a slowly increasing level over time. + self->last_delay_probability++; + // Validate |candidate_delay|. We have a reliable instantaneous delay + // estimate if + // 1) The valley is distinct enough (|valley_depth| > |kProbabilityOffset|) + // and + // 2) The depth of the valley is deep enough + // (|value_best_candidate| < |minimum_probability|) + // and deeper than the best estimate so far + // (|value_best_candidate| < |last_delay_probability|) + valid_candidate = ((valley_depth > kProbabilityOffset) && + ((value_best_candidate < self->minimum_probability) || + (value_best_candidate < self->last_delay_probability))); + + // Check for nonstationary farend signal. + const bool non_stationary_farend = + std::any_of(self->farend->far_bit_counts, + self->farend->far_bit_counts + self->history_size, + [](int a) { return a > 0; }); + + if (non_stationary_farend) { + // Only update the validation statistics when the farend is nonstationary + // as the underlying estimates are otherwise frozen. + UpdateRobustValidationStatistics(self, candidate_delay, valley_depth, + value_best_candidate); + } + + if (self->robust_validation_enabled) { + int is_histogram_valid = HistogramBasedValidation(self, candidate_delay); + valid_candidate = RobustValidation(self, candidate_delay, valid_candidate, + is_histogram_valid); + + } + + // Only update the delay estimate when the farend is nonstationary and when + // a valid delay candidate is available. + if (non_stationary_farend && valid_candidate) { + if (candidate_delay != self->last_delay) { + self->last_delay_histogram = + (self->histogram[candidate_delay] > kLastHistogramMax ? + kLastHistogramMax : self->histogram[candidate_delay]); + // Adjust the histogram if we made a change to |last_delay|, though it was + // not the most likely one according to the histogram. + if (self->histogram[candidate_delay] < + self->histogram[self->compare_delay]) { + self->histogram[self->compare_delay] = self->histogram[candidate_delay]; + } + } + self->last_delay = candidate_delay; + if (value_best_candidate < self->last_delay_probability) { + self->last_delay_probability = value_best_candidate; + } + self->compare_delay = self->last_delay; + } + + return self->last_delay; +} + +int WebRtc_binary_last_delay(BinaryDelayEstimator* self) { + RTC_DCHECK(self); + return self->last_delay; +} + +float WebRtc_binary_last_delay_quality(BinaryDelayEstimator* self) { + float quality = 0; + RTC_DCHECK(self); + + if (self->robust_validation_enabled) { + // Simply a linear function of the histogram height at delay estimate. + quality = self->histogram[self->compare_delay] / kHistogramMax; + } else { + // Note that |last_delay_probability| states how deep the minimum of the + // cost function is, so it is rather an error probability. + quality = (float) (kMaxBitCountsQ9 - self->last_delay_probability) / + kMaxBitCountsQ9; + if (quality < 0) { + quality = 0; + } + } + return quality; +} + +void WebRtc_MeanEstimatorFix(int32_t new_value, + int factor, + int32_t* mean_value) { + int32_t diff = new_value - *mean_value; + + // mean_new = mean_value + ((new_value - mean_value) >> factor); + if (diff < 0) { + diff = -((-diff) >> factor); + } else { + diff = (diff >> factor); + } + *mean_value += diff; +} diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.h new file mode 100644 index 0000000000..cce6113a53 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator.h @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Performs delay estimation on binary converted spectra. +// The return value is 0 - OK and -1 - Error, unless otherwise stated. + +#ifndef MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_H_ +#define MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_H_ + +#include "typedefs.h" // NOLINT(build/include) + +static const int32_t kMaxBitCountsQ9 = (32 << 9); // 32 matching bits in Q9. + +typedef struct { + // Pointer to bit counts. + int* far_bit_counts; + // Binary history variables. + uint32_t* binary_far_history; + int history_size; +} BinaryDelayEstimatorFarend; + +typedef struct { + // Pointer to bit counts. + int32_t* mean_bit_counts; + // Array only used locally in ProcessBinarySpectrum() but whose size is + // determined at run-time. + int32_t* bit_counts; + + // Binary history variables. + uint32_t* binary_near_history; + int near_history_size; + int history_size; + + // Delay estimation variables. + int32_t minimum_probability; + int last_delay_probability; + + // Delay memory. + int last_delay; + + // Robust validation + int robust_validation_enabled; + int allowed_offset; + int last_candidate_delay; + int compare_delay; + int candidate_hits; + float* histogram; + float last_delay_histogram; + + // For dynamically changing the lookahead when using SoftReset...(). + int lookahead; + + // Far-end binary spectrum history buffer etc. + BinaryDelayEstimatorFarend* farend; +} BinaryDelayEstimator; + +// Releases the memory allocated by +// WebRtc_CreateBinaryDelayEstimatorFarend(...). +// Input: +// - self : Pointer to the binary delay estimation far-end +// instance which is the return value of +// WebRtc_CreateBinaryDelayEstimatorFarend(). +// +void WebRtc_FreeBinaryDelayEstimatorFarend(BinaryDelayEstimatorFarend* self); + +// Allocates the memory needed by the far-end part of the binary delay +// estimation. The memory needs to be initialized separately through +// WebRtc_InitBinaryDelayEstimatorFarend(...). +// +// Inputs: +// - history_size : Size of the far-end binary spectrum history. +// +// Return value: +// - BinaryDelayEstimatorFarend* +// : Created |handle|. If the memory can't be allocated +// or if any of the input parameters are invalid NULL +// is returned. +// +BinaryDelayEstimatorFarend* WebRtc_CreateBinaryDelayEstimatorFarend( + int history_size); + +// Re-allocates the buffers. +// +// Inputs: +// - self : Pointer to the binary estimation far-end instance +// which is the return value of +// WebRtc_CreateBinaryDelayEstimatorFarend(). +// - history_size : Size of the far-end binary spectrum history. +// +// Return value: +// - history_size : The history size allocated. +int WebRtc_AllocateFarendBufferMemory(BinaryDelayEstimatorFarend* self, + int history_size); + +// Initializes the delay estimation far-end instance created with +// WebRtc_CreateBinaryDelayEstimatorFarend(...). +// +// Input: +// - self : Pointer to the delay estimation far-end instance. +// +// Output: +// - self : Initialized far-end instance. +// +void WebRtc_InitBinaryDelayEstimatorFarend(BinaryDelayEstimatorFarend* self); + +// Soft resets the delay estimation far-end instance created with +// WebRtc_CreateBinaryDelayEstimatorFarend(...). +// +// Input: +// - delay_shift : The amount of blocks to shift history buffers. +// +void WebRtc_SoftResetBinaryDelayEstimatorFarend( + BinaryDelayEstimatorFarend* self, int delay_shift); + +// Adds the binary far-end spectrum to the internal far-end history buffer. This +// spectrum is used as reference when calculating the delay using +// WebRtc_ProcessBinarySpectrum(). +// +// Inputs: +// - self : Pointer to the delay estimation far-end +// instance. +// - binary_far_spectrum : Far-end binary spectrum. +// +// Output: +// - self : Updated far-end instance. +// +void WebRtc_AddBinaryFarSpectrum(BinaryDelayEstimatorFarend* self, + uint32_t binary_far_spectrum); + +// Releases the memory allocated by WebRtc_CreateBinaryDelayEstimator(...). +// +// Note that BinaryDelayEstimator utilizes BinaryDelayEstimatorFarend, but does +// not take ownership of it, hence the BinaryDelayEstimator has to be torn down +// before the far-end. +// +// Input: +// - self : Pointer to the binary delay estimation instance +// which is the return value of +// WebRtc_CreateBinaryDelayEstimator(). +// +void WebRtc_FreeBinaryDelayEstimator(BinaryDelayEstimator* self); + +// Allocates the memory needed by the binary delay estimation. The memory needs +// to be initialized separately through WebRtc_InitBinaryDelayEstimator(...). +// +// See WebRtc_CreateDelayEstimator(..) in delay_estimator_wrapper.c for detailed +// description. +BinaryDelayEstimator* WebRtc_CreateBinaryDelayEstimator( + BinaryDelayEstimatorFarend* farend, int max_lookahead); + +// Re-allocates |history_size| dependent buffers. The far-end buffers will be +// updated at the same time if needed. +// +// Input: +// - self : Pointer to the binary estimation instance which is +// the return value of +// WebRtc_CreateBinaryDelayEstimator(). +// - history_size : Size of the history buffers. +// +// Return value: +// - history_size : The history size allocated. +int WebRtc_AllocateHistoryBufferMemory(BinaryDelayEstimator* self, + int history_size); + +// Initializes the delay estimation instance created with +// WebRtc_CreateBinaryDelayEstimator(...). +// +// Input: +// - self : Pointer to the delay estimation instance. +// +// Output: +// - self : Initialized instance. +// +void WebRtc_InitBinaryDelayEstimator(BinaryDelayEstimator* self); + +// Soft resets the delay estimation instance created with +// WebRtc_CreateBinaryDelayEstimator(...). +// +// Input: +// - delay_shift : The amount of blocks to shift history buffers. +// +// Return value: +// - actual_shifts : The actual number of shifts performed. +// +int WebRtc_SoftResetBinaryDelayEstimator(BinaryDelayEstimator* self, + int delay_shift); + +// Estimates and returns the delay between the binary far-end and binary near- +// end spectra. It is assumed the binary far-end spectrum has been added using +// WebRtc_AddBinaryFarSpectrum() prior to this call. The value will be offset by +// the lookahead (i.e. the lookahead should be subtracted from the returned +// value). +// +// Inputs: +// - self : Pointer to the delay estimation instance. +// - binary_near_spectrum : Near-end binary spectrum of the current block. +// +// Output: +// - self : Updated instance. +// +// Return value: +// - delay : >= 0 - Calculated delay value. +// -2 - Insufficient data for estimation. +// +int WebRtc_ProcessBinarySpectrum(BinaryDelayEstimator* self, + uint32_t binary_near_spectrum); + +// Returns the last calculated delay updated by the function +// WebRtc_ProcessBinarySpectrum(...). +// +// Input: +// - self : Pointer to the delay estimation instance. +// +// Return value: +// - delay : >= 0 - Last calculated delay value +// -2 - Insufficient data for estimation. +// +int WebRtc_binary_last_delay(BinaryDelayEstimator* self); + +// Returns the estimation quality of the last calculated delay updated by the +// function WebRtc_ProcessBinarySpectrum(...). The estimation quality is a value +// in the interval [0, 1]. The higher the value, the better the quality. +// +// Return value: +// - delay_quality : >= 0 - Estimation quality of last calculated +// delay value. +float WebRtc_binary_last_delay_quality(BinaryDelayEstimator* self); + +// Updates the |mean_value| recursively with a step size of 2^-|factor|. This +// function is used internally in the Binary Delay Estimator as well as the +// Fixed point wrapper. +// +// Inputs: +// - new_value : The new value the mean should be updated with. +// - factor : The step size, in number of right shifts. +// +// Input/Output: +// - mean_value : Pointer to the mean value. +// +void WebRtc_MeanEstimatorFix(int32_t new_value, + int factor, + int32_t* mean_value); + +#endif // MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_H_ diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_internal.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_internal.h new file mode 100644 index 0000000000..46eea3ec18 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_internal.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Header file including the delay estimator handle used for testing. + +#ifndef MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_INTERNAL_H_ +#define MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_INTERNAL_H_ + +#include "modules/audio_processing/utility/delay_estimator.h" +#include "typedefs.h" // NOLINT(build/include) + +typedef union { + float float_; + int32_t int32_; +} SpectrumType; + +typedef struct { + // Pointers to mean values of spectrum. + SpectrumType* mean_far_spectrum; + // |mean_far_spectrum| initialization indicator. + int far_spectrum_initialized; + + int spectrum_size; + + // Far-end part of binary spectrum based delay estimation. + BinaryDelayEstimatorFarend* binary_farend; +} DelayEstimatorFarend; + +typedef struct { + // Pointers to mean values of spectrum. + SpectrumType* mean_near_spectrum; + // |mean_near_spectrum| initialization indicator. + int near_spectrum_initialized; + + int spectrum_size; + + // Binary spectrum based delay estimator + BinaryDelayEstimator* binary_handle; +} DelayEstimator; + +#endif // MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_INTERNAL_H_ diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_unittest.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_unittest.cc new file mode 100644 index 0000000000..36700e5706 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_unittest.cc @@ -0,0 +1,618 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/utility/delay_estimator.h" +#include "modules/audio_processing/utility/delay_estimator_internal.h" +#include "modules/audio_processing/utility/delay_estimator_wrapper.h" +#include "test/gtest.h" +#include "typedefs.h" // NOLINT(build/include) + +namespace { + +enum { kSpectrumSize = 65 }; +// Delay history sizes. +enum { kMaxDelay = 100 }; +enum { kLookahead = 10 }; +enum { kHistorySize = kMaxDelay + kLookahead }; +// Length of binary spectrum sequence. +enum { kSequenceLength = 400 }; + +const int kDifferentHistorySize = 3; +const int kDifferentLookahead = 1; + +const int kEnable[] = { 0, 1 }; +const size_t kSizeEnable = sizeof(kEnable) / sizeof(*kEnable); + +class DelayEstimatorTest : public ::testing::Test { + protected: + DelayEstimatorTest(); + virtual void SetUp(); + virtual void TearDown(); + + void Init(); + void InitBinary(); + void VerifyDelay(BinaryDelayEstimator* binary_handle, int offset, int delay); + void RunBinarySpectra(BinaryDelayEstimator* binary1, + BinaryDelayEstimator* binary2, + int near_offset, int lookahead_offset, int far_offset); + void RunBinarySpectraTest(int near_offset, int lookahead_offset, + int ref_robust_validation, int robust_validation); + + void* handle_; + DelayEstimator* self_; + void* farend_handle_; + DelayEstimatorFarend* farend_self_; + BinaryDelayEstimator* binary_; + BinaryDelayEstimatorFarend* binary_farend_; + int spectrum_size_; + // Dummy input spectra. + float far_f_[kSpectrumSize]; + float near_f_[kSpectrumSize]; + uint16_t far_u16_[kSpectrumSize]; + uint16_t near_u16_[kSpectrumSize]; + uint32_t binary_spectrum_[kSequenceLength + kHistorySize]; +}; + +DelayEstimatorTest::DelayEstimatorTest() + : handle_(NULL), + self_(NULL), + farend_handle_(NULL), + farend_self_(NULL), + binary_(NULL), + binary_farend_(NULL), + spectrum_size_(kSpectrumSize) { + // Dummy input data are set with more or less arbitrary non-zero values. + memset(far_f_, 1, sizeof(far_f_)); + memset(near_f_, 2, sizeof(near_f_)); + memset(far_u16_, 1, sizeof(far_u16_)); + memset(near_u16_, 2, sizeof(near_u16_)); + // Construct a sequence of binary spectra used to verify delay estimate. The + // |kSequenceLength| has to be long enough for the delay estimation to leave + // the initialized state. + binary_spectrum_[0] = 1; + for (int i = 1; i < (kSequenceLength + kHistorySize); i++) { + binary_spectrum_[i] = 3 * binary_spectrum_[i - 1]; + } +} + +void DelayEstimatorTest::SetUp() { + farend_handle_ = WebRtc_CreateDelayEstimatorFarend(kSpectrumSize, + kHistorySize); + ASSERT_TRUE(farend_handle_ != NULL); + farend_self_ = reinterpret_cast<DelayEstimatorFarend*>(farend_handle_); + handle_ = WebRtc_CreateDelayEstimator(farend_handle_, kLookahead); + ASSERT_TRUE(handle_ != NULL); + self_ = reinterpret_cast<DelayEstimator*>(handle_); + binary_farend_ = WebRtc_CreateBinaryDelayEstimatorFarend(kHistorySize); + ASSERT_TRUE(binary_farend_ != NULL); + binary_ = WebRtc_CreateBinaryDelayEstimator(binary_farend_, kLookahead); + ASSERT_TRUE(binary_ != NULL); +} + +void DelayEstimatorTest::TearDown() { + WebRtc_FreeDelayEstimator(handle_); + handle_ = NULL; + self_ = NULL; + WebRtc_FreeDelayEstimatorFarend(farend_handle_); + farend_handle_ = NULL; + farend_self_ = NULL; + WebRtc_FreeBinaryDelayEstimator(binary_); + binary_ = NULL; + WebRtc_FreeBinaryDelayEstimatorFarend(binary_farend_); + binary_farend_ = NULL; +} + +void DelayEstimatorTest::Init() { + // Initialize Delay Estimator + EXPECT_EQ(0, WebRtc_InitDelayEstimatorFarend(farend_handle_)); + EXPECT_EQ(0, WebRtc_InitDelayEstimator(handle_)); + // Verify initialization. + EXPECT_EQ(0, farend_self_->far_spectrum_initialized); + EXPECT_EQ(0, self_->near_spectrum_initialized); + EXPECT_EQ(-2, WebRtc_last_delay(handle_)); // Delay in initial state. + EXPECT_FLOAT_EQ(0, WebRtc_last_delay_quality(handle_)); // Zero quality. +} + +void DelayEstimatorTest::InitBinary() { + // Initialize Binary Delay Estimator (far-end part). + WebRtc_InitBinaryDelayEstimatorFarend(binary_farend_); + // Initialize Binary Delay Estimator + WebRtc_InitBinaryDelayEstimator(binary_); + // Verify initialization. This does not guarantee a complete check, since + // |last_delay| may be equal to -2 before initialization if done on the fly. + EXPECT_EQ(-2, binary_->last_delay); +} + +void DelayEstimatorTest::VerifyDelay(BinaryDelayEstimator* binary_handle, + int offset, int delay) { + // Verify that we WebRtc_binary_last_delay() returns correct delay. + EXPECT_EQ(delay, WebRtc_binary_last_delay(binary_handle)); + + if (delay != -2) { + // Verify correct delay estimate. In the non-causal case the true delay + // is equivalent with the |offset|. + EXPECT_EQ(offset, delay); + } +} + +void DelayEstimatorTest::RunBinarySpectra(BinaryDelayEstimator* binary1, + BinaryDelayEstimator* binary2, + int near_offset, + int lookahead_offset, + int far_offset) { + int different_validations = binary1->robust_validation_enabled ^ + binary2->robust_validation_enabled; + WebRtc_InitBinaryDelayEstimatorFarend(binary_farend_); + WebRtc_InitBinaryDelayEstimator(binary1); + WebRtc_InitBinaryDelayEstimator(binary2); + // Verify initialization. This does not guarantee a complete check, since + // |last_delay| may be equal to -2 before initialization if done on the fly. + EXPECT_EQ(-2, binary1->last_delay); + EXPECT_EQ(-2, binary2->last_delay); + for (int i = kLookahead; i < (kSequenceLength + kLookahead); i++) { + WebRtc_AddBinaryFarSpectrum(binary_farend_, + binary_spectrum_[i + far_offset]); + int delay_1 = WebRtc_ProcessBinarySpectrum(binary1, binary_spectrum_[i]); + int delay_2 = + WebRtc_ProcessBinarySpectrum(binary2, + binary_spectrum_[i - near_offset]); + + VerifyDelay(binary1, far_offset + kLookahead, delay_1); + VerifyDelay(binary2, + far_offset + kLookahead + lookahead_offset + near_offset, + delay_2); + // Expect the two delay estimates to be offset by |lookahead_offset| + + // |near_offset| when we have left the initial state. + if ((delay_1 != -2) && (delay_2 != -2)) { + EXPECT_EQ(delay_1, delay_2 - lookahead_offset - near_offset); + } + // For the case of identical signals |delay_1| and |delay_2| should match + // all the time, unless one of them has robust validation turned on. In + // that case the robust validation leaves the initial state faster. + if ((near_offset == 0) && (lookahead_offset == 0)) { + if (!different_validations) { + EXPECT_EQ(delay_1, delay_2); + } else { + if (binary1->robust_validation_enabled) { + EXPECT_GE(delay_1, delay_2); + } else { + EXPECT_GE(delay_2, delay_1); + } + } + } + } + // Verify that we have left the initialized state. + EXPECT_NE(-2, WebRtc_binary_last_delay(binary1)); + EXPECT_LT(0, WebRtc_binary_last_delay_quality(binary1)); + EXPECT_NE(-2, WebRtc_binary_last_delay(binary2)); + EXPECT_LT(0, WebRtc_binary_last_delay_quality(binary2)); +} + +void DelayEstimatorTest::RunBinarySpectraTest(int near_offset, + int lookahead_offset, + int ref_robust_validation, + int robust_validation) { + BinaryDelayEstimator* binary2 = + WebRtc_CreateBinaryDelayEstimator(binary_farend_, + kLookahead + lookahead_offset); + // Verify the delay for both causal and non-causal systems. For causal systems + // the delay is equivalent with a positive |offset| of the far-end sequence. + // For non-causal systems the delay is equivalent with a negative |offset| of + // the far-end sequence. + binary_->robust_validation_enabled = ref_robust_validation; + binary2->robust_validation_enabled = robust_validation; + for (int offset = -kLookahead; + offset < kMaxDelay - lookahead_offset - near_offset; + offset++) { + RunBinarySpectra(binary_, binary2, near_offset, lookahead_offset, offset); + } + WebRtc_FreeBinaryDelayEstimator(binary2); + binary2 = NULL; + binary_->robust_validation_enabled = 0; // Reset reference. +} + +TEST_F(DelayEstimatorTest, CorrectErrorReturnsOfWrapper) { + // In this test we verify correct error returns on invalid API calls. + + // WebRtc_CreateDelayEstimatorFarend() and WebRtc_CreateDelayEstimator() + // should return a NULL pointer on invalid input values. + // Make sure we have a non-NULL value at start, so we can detect NULL after + // create failure. + void* handle = farend_handle_; + handle = WebRtc_CreateDelayEstimatorFarend(33, kHistorySize); + EXPECT_TRUE(handle == NULL); + handle = WebRtc_CreateDelayEstimatorFarend(kSpectrumSize, 1); + EXPECT_TRUE(handle == NULL); + + handle = handle_; + handle = WebRtc_CreateDelayEstimator(NULL, kLookahead); + EXPECT_TRUE(handle == NULL); + handle = WebRtc_CreateDelayEstimator(farend_handle_, -1); + EXPECT_TRUE(handle == NULL); + + // WebRtc_InitDelayEstimatorFarend() and WebRtc_InitDelayEstimator() should + // return -1 if we have a NULL pointer as |handle|. + EXPECT_EQ(-1, WebRtc_InitDelayEstimatorFarend(NULL)); + EXPECT_EQ(-1, WebRtc_InitDelayEstimator(NULL)); + + // WebRtc_AddFarSpectrumFloat() should return -1 if we have: + // 1) NULL pointer as |handle|. + // 2) NULL pointer as far-end spectrum. + // 3) Incorrect spectrum size. + EXPECT_EQ(-1, WebRtc_AddFarSpectrumFloat(NULL, far_f_, spectrum_size_)); + // Use |farend_handle_| which is properly created at SetUp(). + EXPECT_EQ(-1, WebRtc_AddFarSpectrumFloat(farend_handle_, NULL, + spectrum_size_)); + EXPECT_EQ(-1, WebRtc_AddFarSpectrumFloat(farend_handle_, far_f_, + spectrum_size_ + 1)); + + // WebRtc_AddFarSpectrumFix() should return -1 if we have: + // 1) NULL pointer as |handle|. + // 2) NULL pointer as far-end spectrum. + // 3) Incorrect spectrum size. + // 4) Too high precision in far-end spectrum (Q-domain > 15). + EXPECT_EQ(-1, WebRtc_AddFarSpectrumFix(NULL, far_u16_, spectrum_size_, 0)); + EXPECT_EQ(-1, WebRtc_AddFarSpectrumFix(farend_handle_, NULL, spectrum_size_, + 0)); + EXPECT_EQ(-1, WebRtc_AddFarSpectrumFix(farend_handle_, far_u16_, + spectrum_size_ + 1, 0)); + EXPECT_EQ(-1, WebRtc_AddFarSpectrumFix(farend_handle_, far_u16_, + spectrum_size_, 16)); + + // WebRtc_set_history_size() should return -1 if: + // 1) |handle| is a NULL. + // 2) |history_size| <= 1. + EXPECT_EQ(-1, WebRtc_set_history_size(NULL, 1)); + EXPECT_EQ(-1, WebRtc_set_history_size(handle_, 1)); + // WebRtc_history_size() should return -1 if: + // 1) NULL pointer input. + EXPECT_EQ(-1, WebRtc_history_size(NULL)); + // 2) there is a mismatch between history size. + void* tmp_handle = WebRtc_CreateDelayEstimator(farend_handle_, kHistorySize); + EXPECT_EQ(0, WebRtc_InitDelayEstimator(tmp_handle)); + EXPECT_EQ(kDifferentHistorySize, + WebRtc_set_history_size(tmp_handle, kDifferentHistorySize)); + EXPECT_EQ(kDifferentHistorySize, WebRtc_history_size(tmp_handle)); + EXPECT_EQ(kHistorySize, WebRtc_set_history_size(handle_, kHistorySize)); + EXPECT_EQ(-1, WebRtc_history_size(tmp_handle)); + + // WebRtc_set_lookahead() should return -1 if we try a value outside the + /// buffer. + EXPECT_EQ(-1, WebRtc_set_lookahead(handle_, kLookahead + 1)); + EXPECT_EQ(-1, WebRtc_set_lookahead(handle_, -1)); + + // WebRtc_set_allowed_offset() should return -1 if we have: + // 1) NULL pointer as |handle|. + // 2) |allowed_offset| < 0. + EXPECT_EQ(-1, WebRtc_set_allowed_offset(NULL, 0)); + EXPECT_EQ(-1, WebRtc_set_allowed_offset(handle_, -1)); + + EXPECT_EQ(-1, WebRtc_get_allowed_offset(NULL)); + + // WebRtc_enable_robust_validation() should return -1 if we have: + // 1) NULL pointer as |handle|. + // 2) Incorrect |enable| value (not 0 or 1). + EXPECT_EQ(-1, WebRtc_enable_robust_validation(NULL, kEnable[0])); + EXPECT_EQ(-1, WebRtc_enable_robust_validation(handle_, -1)); + EXPECT_EQ(-1, WebRtc_enable_robust_validation(handle_, 2)); + + // WebRtc_is_robust_validation_enabled() should return -1 if we have NULL + // pointer as |handle|. + EXPECT_EQ(-1, WebRtc_is_robust_validation_enabled(NULL)); + + // WebRtc_DelayEstimatorProcessFloat() should return -1 if we have: + // 1) NULL pointer as |handle|. + // 2) NULL pointer as near-end spectrum. + // 3) Incorrect spectrum size. + // 4) Non matching history sizes if multiple delay estimators using the same + // far-end reference. + EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFloat(NULL, near_f_, + spectrum_size_)); + // Use |handle_| which is properly created at SetUp(). + EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFloat(handle_, NULL, + spectrum_size_)); + EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFloat(handle_, near_f_, + spectrum_size_ + 1)); + // |tmp_handle| is already in a non-matching state. + EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFloat(tmp_handle, + near_f_, + spectrum_size_)); + + // WebRtc_DelayEstimatorProcessFix() should return -1 if we have: + // 1) NULL pointer as |handle|. + // 2) NULL pointer as near-end spectrum. + // 3) Incorrect spectrum size. + // 4) Too high precision in near-end spectrum (Q-domain > 15). + // 5) Non matching history sizes if multiple delay estimators using the same + // far-end reference. + EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFix(NULL, near_u16_, spectrum_size_, + 0)); + EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFix(handle_, NULL, spectrum_size_, + 0)); + EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFix(handle_, near_u16_, + spectrum_size_ + 1, 0)); + EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFix(handle_, near_u16_, + spectrum_size_, 16)); + // |tmp_handle| is already in a non-matching state. + EXPECT_EQ(-1, WebRtc_DelayEstimatorProcessFix(tmp_handle, + near_u16_, + spectrum_size_, + 0)); + WebRtc_FreeDelayEstimator(tmp_handle); + + // WebRtc_last_delay() should return -1 if we have a NULL pointer as |handle|. + EXPECT_EQ(-1, WebRtc_last_delay(NULL)); + + // Free any local memory if needed. + WebRtc_FreeDelayEstimator(handle); +} + +TEST_F(DelayEstimatorTest, VerifyAllowedOffset) { + // Is set to zero by default. + EXPECT_EQ(0, WebRtc_get_allowed_offset(handle_)); + for (int i = 1; i >= 0; i--) { + EXPECT_EQ(0, WebRtc_set_allowed_offset(handle_, i)); + EXPECT_EQ(i, WebRtc_get_allowed_offset(handle_)); + Init(); + // Unaffected over a reset. + EXPECT_EQ(i, WebRtc_get_allowed_offset(handle_)); + } +} + +TEST_F(DelayEstimatorTest, VerifyEnableRobustValidation) { + // Disabled by default. + EXPECT_EQ(0, WebRtc_is_robust_validation_enabled(handle_)); + for (size_t i = 0; i < kSizeEnable; ++i) { + EXPECT_EQ(0, WebRtc_enable_robust_validation(handle_, kEnable[i])); + EXPECT_EQ(kEnable[i], WebRtc_is_robust_validation_enabled(handle_)); + Init(); + // Unaffected over a reset. + EXPECT_EQ(kEnable[i], WebRtc_is_robust_validation_enabled(handle_)); + } +} + +TEST_F(DelayEstimatorTest, InitializedSpectrumAfterProcess) { + // In this test we verify that the mean spectra are initialized after first + // time we call WebRtc_AddFarSpectrum() and Process() respectively. The test + // also verifies the state is not left for zero spectra. + const float kZerosFloat[kSpectrumSize] = { 0.0 }; + const uint16_t kZerosU16[kSpectrumSize] = { 0 }; + + // For floating point operations, process one frame and verify initialization + // flag. + Init(); + EXPECT_EQ(0, WebRtc_AddFarSpectrumFloat(farend_handle_, kZerosFloat, + spectrum_size_)); + EXPECT_EQ(0, farend_self_->far_spectrum_initialized); + EXPECT_EQ(0, WebRtc_AddFarSpectrumFloat(farend_handle_, far_f_, + spectrum_size_)); + EXPECT_EQ(1, farend_self_->far_spectrum_initialized); + EXPECT_EQ(-2, WebRtc_DelayEstimatorProcessFloat(handle_, kZerosFloat, + spectrum_size_)); + EXPECT_EQ(0, self_->near_spectrum_initialized); + EXPECT_EQ(-2, WebRtc_DelayEstimatorProcessFloat(handle_, near_f_, + spectrum_size_)); + EXPECT_EQ(1, self_->near_spectrum_initialized); + + // For fixed point operations, process one frame and verify initialization + // flag. + Init(); + EXPECT_EQ(0, WebRtc_AddFarSpectrumFix(farend_handle_, kZerosU16, + spectrum_size_, 0)); + EXPECT_EQ(0, farend_self_->far_spectrum_initialized); + EXPECT_EQ(0, WebRtc_AddFarSpectrumFix(farend_handle_, far_u16_, + spectrum_size_, 0)); + EXPECT_EQ(1, farend_self_->far_spectrum_initialized); + EXPECT_EQ(-2, WebRtc_DelayEstimatorProcessFix(handle_, kZerosU16, + spectrum_size_, 0)); + EXPECT_EQ(0, self_->near_spectrum_initialized); + EXPECT_EQ(-2, WebRtc_DelayEstimatorProcessFix(handle_, near_u16_, + spectrum_size_, 0)); + EXPECT_EQ(1, self_->near_spectrum_initialized); +} + +TEST_F(DelayEstimatorTest, CorrectLastDelay) { + // In this test we verify that we get the correct last delay upon valid call. + // We simply process the same data until we leave the initialized state + // (|last_delay| = -2). Then we compare the Process() output with the + // last_delay() call. + + // TODO(bjornv): Update quality values for robust validation. + int last_delay = 0; + // Floating point operations. + Init(); + for (int i = 0; i < 200; i++) { + EXPECT_EQ(0, WebRtc_AddFarSpectrumFloat(farend_handle_, far_f_, + spectrum_size_)); + last_delay = WebRtc_DelayEstimatorProcessFloat(handle_, near_f_, + spectrum_size_); + if (last_delay != -2) { + EXPECT_EQ(last_delay, WebRtc_last_delay(handle_)); + if (!WebRtc_is_robust_validation_enabled(handle_)) { + EXPECT_FLOAT_EQ(7203.f / kMaxBitCountsQ9, + WebRtc_last_delay_quality(handle_)); + } + break; + } + } + // Verify that we have left the initialized state. + EXPECT_NE(-2, WebRtc_last_delay(handle_)); + EXPECT_LT(0, WebRtc_last_delay_quality(handle_)); + + // Fixed point operations. + Init(); + for (int i = 0; i < 200; i++) { + EXPECT_EQ(0, WebRtc_AddFarSpectrumFix(farend_handle_, far_u16_, + spectrum_size_, 0)); + last_delay = WebRtc_DelayEstimatorProcessFix(handle_, near_u16_, + spectrum_size_, 0); + if (last_delay != -2) { + EXPECT_EQ(last_delay, WebRtc_last_delay(handle_)); + if (!WebRtc_is_robust_validation_enabled(handle_)) { + EXPECT_FLOAT_EQ(7203.f / kMaxBitCountsQ9, + WebRtc_last_delay_quality(handle_)); + } + break; + } + } + // Verify that we have left the initialized state. + EXPECT_NE(-2, WebRtc_last_delay(handle_)); + EXPECT_LT(0, WebRtc_last_delay_quality(handle_)); +} + +TEST_F(DelayEstimatorTest, CorrectErrorReturnsOfBinaryEstimatorFarend) { + // In this test we verify correct output on invalid API calls to the Binary + // Delay Estimator (far-end part). + + BinaryDelayEstimatorFarend* binary = binary_farend_; + // WebRtc_CreateBinaryDelayEstimatorFarend() should return -1 if the input + // history size is less than 2. This is to make sure the buffer shifting + // applies properly. + // Make sure we have a non-NULL value at start, so we can detect NULL after + // create failure. + binary = WebRtc_CreateBinaryDelayEstimatorFarend(1); + EXPECT_TRUE(binary == NULL); +} + +TEST_F(DelayEstimatorTest, CorrectErrorReturnsOfBinaryEstimator) { + // In this test we verify correct output on invalid API calls to the Binary + // Delay Estimator. + + BinaryDelayEstimator* binary_handle = binary_; + // WebRtc_CreateBinaryDelayEstimator() should return -1 if we have a NULL + // pointer as |binary_farend| or invalid input values. Upon failure, the + // |binary_handle| should be NULL. + // Make sure we have a non-NULL value at start, so we can detect NULL after + // create failure. + binary_handle = WebRtc_CreateBinaryDelayEstimator(NULL, kLookahead); + EXPECT_TRUE(binary_handle == NULL); + binary_handle = WebRtc_CreateBinaryDelayEstimator(binary_farend_, -1); + EXPECT_TRUE(binary_handle == NULL); +} + +TEST_F(DelayEstimatorTest, MeanEstimatorFix) { + // In this test we verify that we update the mean value in correct direction + // only. With "direction" we mean increase or decrease. + + int32_t mean_value = 4000; + int32_t mean_value_before = mean_value; + int32_t new_mean_value = mean_value * 2; + + // Increasing |mean_value|. + WebRtc_MeanEstimatorFix(new_mean_value, 10, &mean_value); + EXPECT_LT(mean_value_before, mean_value); + EXPECT_GT(new_mean_value, mean_value); + + // Decreasing |mean_value|. + new_mean_value = mean_value / 2; + mean_value_before = mean_value; + WebRtc_MeanEstimatorFix(new_mean_value, 10, &mean_value); + EXPECT_GT(mean_value_before, mean_value); + EXPECT_LT(new_mean_value, mean_value); +} + +TEST_F(DelayEstimatorTest, ExactDelayEstimateMultipleNearSameSpectrum) { + // In this test we verify that we get the correct delay estimates if we shift + // the signal accordingly. We create two Binary Delay Estimators and feed them + // with the same signals, so they should output the same results. + // We verify both causal and non-causal delays. + // For these noise free signals, the robust validation should not have an + // impact, hence we turn robust validation on/off for both reference and + // delayed near end. + + for (size_t i = 0; i < kSizeEnable; ++i) { + for (size_t j = 0; j < kSizeEnable; ++j) { + RunBinarySpectraTest(0, 0, kEnable[i], kEnable[j]); + } + } +} + +TEST_F(DelayEstimatorTest, ExactDelayEstimateMultipleNearDifferentSpectrum) { + // In this test we use the same setup as above, but we now feed the two Binary + // Delay Estimators with different signals, so they should output different + // results. + // For these noise free signals, the robust validation should not have an + // impact, hence we turn robust validation on/off for both reference and + // delayed near end. + + const int kNearOffset = 1; + for (size_t i = 0; i < kSizeEnable; ++i) { + for (size_t j = 0; j < kSizeEnable; ++j) { + RunBinarySpectraTest(kNearOffset, 0, kEnable[i], kEnable[j]); + } + } +} + +TEST_F(DelayEstimatorTest, ExactDelayEstimateMultipleNearDifferentLookahead) { + // In this test we use the same setup as above, feeding the two Binary + // Delay Estimators with the same signals. The difference is that we create + // them with different lookahead. + // For these noise free signals, the robust validation should not have an + // impact, hence we turn robust validation on/off for both reference and + // delayed near end. + + const int kLookaheadOffset = 1; + for (size_t i = 0; i < kSizeEnable; ++i) { + for (size_t j = 0; j < kSizeEnable; ++j) { + RunBinarySpectraTest(0, kLookaheadOffset, kEnable[i], kEnable[j]); + } + } +} + +TEST_F(DelayEstimatorTest, AllowedOffsetNoImpactWhenRobustValidationDisabled) { + // The same setup as in ExactDelayEstimateMultipleNearSameSpectrum with the + // difference that |allowed_offset| is set for the reference binary delay + // estimator. + + binary_->allowed_offset = 10; + RunBinarySpectraTest(0, 0, 0, 0); + binary_->allowed_offset = 0; // Reset reference. +} + +TEST_F(DelayEstimatorTest, VerifyLookaheadAtCreate) { + void* farend_handle = WebRtc_CreateDelayEstimatorFarend(kSpectrumSize, + kMaxDelay); + ASSERT_TRUE(farend_handle != NULL); + void* handle = WebRtc_CreateDelayEstimator(farend_handle, kLookahead); + ASSERT_TRUE(handle != NULL); + EXPECT_EQ(kLookahead, WebRtc_lookahead(handle)); + WebRtc_FreeDelayEstimator(handle); + WebRtc_FreeDelayEstimatorFarend(farend_handle); +} + +TEST_F(DelayEstimatorTest, VerifyLookaheadIsSetAndKeptAfterInit) { + EXPECT_EQ(kLookahead, WebRtc_lookahead(handle_)); + EXPECT_EQ(kDifferentLookahead, + WebRtc_set_lookahead(handle_, kDifferentLookahead)); + EXPECT_EQ(kDifferentLookahead, WebRtc_lookahead(handle_)); + EXPECT_EQ(0, WebRtc_InitDelayEstimatorFarend(farend_handle_)); + EXPECT_EQ(kDifferentLookahead, WebRtc_lookahead(handle_)); + EXPECT_EQ(0, WebRtc_InitDelayEstimator(handle_)); + EXPECT_EQ(kDifferentLookahead, WebRtc_lookahead(handle_)); +} + +TEST_F(DelayEstimatorTest, VerifyHistorySizeAtCreate) { + EXPECT_EQ(kHistorySize, WebRtc_history_size(handle_)); +} + +TEST_F(DelayEstimatorTest, VerifyHistorySizeIsSetAndKeptAfterInit) { + EXPECT_EQ(kHistorySize, WebRtc_history_size(handle_)); + EXPECT_EQ(kDifferentHistorySize, + WebRtc_set_history_size(handle_, kDifferentHistorySize)); + EXPECT_EQ(kDifferentHistorySize, WebRtc_history_size(handle_)); + EXPECT_EQ(0, WebRtc_InitDelayEstimator(handle_)); + EXPECT_EQ(kDifferentHistorySize, WebRtc_history_size(handle_)); + EXPECT_EQ(0, WebRtc_InitDelayEstimatorFarend(farend_handle_)); + EXPECT_EQ(kDifferentHistorySize, WebRtc_history_size(handle_)); +} + +// TODO(bjornv): Add tests for SoftReset...(...). + +} // namespace diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.cc new file mode 100644 index 0000000000..f907c80a35 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.cc @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/utility/delay_estimator_wrapper.h" + +#include <stdlib.h> +#include <string.h> + +#include "modules/audio_processing/utility/delay_estimator.h" +#include "modules/audio_processing/utility/delay_estimator_internal.h" +#include "rtc_base/checks.h" + +// Only bit |kBandFirst| through bit |kBandLast| are processed and +// |kBandFirst| - |kBandLast| must be < 32. +enum { kBandFirst = 12 }; +enum { kBandLast = 43 }; + +static __inline uint32_t SetBit(uint32_t in, int pos) { + uint32_t mask = (1 << pos); + uint32_t out = (in | mask); + + return out; +} + +// Calculates the mean recursively. Same version as WebRtc_MeanEstimatorFix(), +// but for float. +// +// Inputs: +// - new_value : New additional value. +// - scale : Scale for smoothing (should be less than 1.0). +// +// Input/Output: +// - mean_value : Pointer to the mean value for updating. +// +static void MeanEstimatorFloat(float new_value, + float scale, + float* mean_value) { + RTC_DCHECK_LT(scale, 1.0f); + *mean_value += (new_value - *mean_value) * scale; +} + +// Computes the binary spectrum by comparing the input |spectrum| with a +// |threshold_spectrum|. Float and fixed point versions. +// +// Inputs: +// - spectrum : Spectrum of which the binary spectrum should be +// calculated. +// - threshold_spectrum : Threshold spectrum with which the input +// spectrum is compared. +// Return: +// - out : Binary spectrum. +// +static uint32_t BinarySpectrumFix(const uint16_t* spectrum, + SpectrumType* threshold_spectrum, + int q_domain, + int* threshold_initialized) { + int i = kBandFirst; + uint32_t out = 0; + + RTC_DCHECK_LT(q_domain, 16); + + if (!(*threshold_initialized)) { + // Set the |threshold_spectrum| to half the input |spectrum| as starting + // value. This speeds up the convergence. + for (i = kBandFirst; i <= kBandLast; i++) { + if (spectrum[i] > 0) { + // Convert input spectrum from Q(|q_domain|) to Q15. + int32_t spectrum_q15 = ((int32_t) spectrum[i]) << (15 - q_domain); + threshold_spectrum[i].int32_ = (spectrum_q15 >> 1); + *threshold_initialized = 1; + } + } + } + for (i = kBandFirst; i <= kBandLast; i++) { + // Convert input spectrum from Q(|q_domain|) to Q15. + int32_t spectrum_q15 = ((int32_t) spectrum[i]) << (15 - q_domain); + // Update the |threshold_spectrum|. + WebRtc_MeanEstimatorFix(spectrum_q15, 6, &(threshold_spectrum[i].int32_)); + // Convert |spectrum| at current frequency bin to a binary value. + if (spectrum_q15 > threshold_spectrum[i].int32_) { + out = SetBit(out, i - kBandFirst); + } + } + + return out; +} + +static uint32_t BinarySpectrumFloat(const float* spectrum, + SpectrumType* threshold_spectrum, + int* threshold_initialized) { + int i = kBandFirst; + uint32_t out = 0; + const float kScale = 1 / 64.0; + + if (!(*threshold_initialized)) { + // Set the |threshold_spectrum| to half the input |spectrum| as starting + // value. This speeds up the convergence. + for (i = kBandFirst; i <= kBandLast; i++) { + if (spectrum[i] > 0.0f) { + threshold_spectrum[i].float_ = (spectrum[i] / 2); + *threshold_initialized = 1; + } + } + } + + for (i = kBandFirst; i <= kBandLast; i++) { + // Update the |threshold_spectrum|. + MeanEstimatorFloat(spectrum[i], kScale, &(threshold_spectrum[i].float_)); + // Convert |spectrum| at current frequency bin to a binary value. + if (spectrum[i] > threshold_spectrum[i].float_) { + out = SetBit(out, i - kBandFirst); + } + } + + return out; +} + +void WebRtc_FreeDelayEstimatorFarend(void* handle) { + DelayEstimatorFarend* self = (DelayEstimatorFarend*) handle; + + if (handle == NULL) { + return; + } + + free(self->mean_far_spectrum); + self->mean_far_spectrum = NULL; + + WebRtc_FreeBinaryDelayEstimatorFarend(self->binary_farend); + self->binary_farend = NULL; + + free(self); +} + +void* WebRtc_CreateDelayEstimatorFarend(int spectrum_size, int history_size) { + DelayEstimatorFarend* self = NULL; + + // Check if the sub band used in the delay estimation is small enough to fit + // the binary spectra in a uint32_t. + static_assert(kBandLast - kBandFirst < 32, ""); + + if (spectrum_size >= kBandLast) { + self = static_cast<DelayEstimatorFarend*>( + malloc(sizeof(DelayEstimatorFarend))); + } + + if (self != NULL) { + int memory_fail = 0; + + // Allocate memory for the binary far-end spectrum handling. + self->binary_farend = WebRtc_CreateBinaryDelayEstimatorFarend(history_size); + memory_fail |= (self->binary_farend == NULL); + + // Allocate memory for spectrum buffers. + self->mean_far_spectrum = + static_cast<SpectrumType*>(malloc(spectrum_size * sizeof(SpectrumType))); + memory_fail |= (self->mean_far_spectrum == NULL); + + self->spectrum_size = spectrum_size; + + if (memory_fail) { + WebRtc_FreeDelayEstimatorFarend(self); + self = NULL; + } + } + + return self; +} + +int WebRtc_InitDelayEstimatorFarend(void* handle) { + DelayEstimatorFarend* self = (DelayEstimatorFarend*) handle; + + if (self == NULL) { + return -1; + } + + // Initialize far-end part of binary delay estimator. + WebRtc_InitBinaryDelayEstimatorFarend(self->binary_farend); + + // Set averaged far and near end spectra to zero. + memset(self->mean_far_spectrum, 0, + sizeof(SpectrumType) * self->spectrum_size); + // Reset initialization indicators. + self->far_spectrum_initialized = 0; + + return 0; +} + +void WebRtc_SoftResetDelayEstimatorFarend(void* handle, int delay_shift) { + DelayEstimatorFarend* self = (DelayEstimatorFarend*) handle; + RTC_DCHECK(self); + WebRtc_SoftResetBinaryDelayEstimatorFarend(self->binary_farend, delay_shift); +} + +int WebRtc_AddFarSpectrumFix(void* handle, + const uint16_t* far_spectrum, + int spectrum_size, + int far_q) { + DelayEstimatorFarend* self = (DelayEstimatorFarend*) handle; + uint32_t binary_spectrum = 0; + + if (self == NULL) { + return -1; + } + if (far_spectrum == NULL) { + // Empty far end spectrum. + return -1; + } + if (spectrum_size != self->spectrum_size) { + // Data sizes don't match. + return -1; + } + if (far_q > 15) { + // If |far_q| is larger than 15 we cannot guarantee no wrap around. + return -1; + } + + // Get binary spectrum. + binary_spectrum = BinarySpectrumFix(far_spectrum, self->mean_far_spectrum, + far_q, &(self->far_spectrum_initialized)); + WebRtc_AddBinaryFarSpectrum(self->binary_farend, binary_spectrum); + + return 0; +} + +int WebRtc_AddFarSpectrumFloat(void* handle, + const float* far_spectrum, + int spectrum_size) { + DelayEstimatorFarend* self = (DelayEstimatorFarend*) handle; + uint32_t binary_spectrum = 0; + + if (self == NULL) { + return -1; + } + if (far_spectrum == NULL) { + // Empty far end spectrum. + return -1; + } + if (spectrum_size != self->spectrum_size) { + // Data sizes don't match. + return -1; + } + + // Get binary spectrum. + binary_spectrum = BinarySpectrumFloat(far_spectrum, self->mean_far_spectrum, + &(self->far_spectrum_initialized)); + WebRtc_AddBinaryFarSpectrum(self->binary_farend, binary_spectrum); + + return 0; +} + +void WebRtc_FreeDelayEstimator(void* handle) { + DelayEstimator* self = (DelayEstimator*) handle; + + if (handle == NULL) { + return; + } + + free(self->mean_near_spectrum); + self->mean_near_spectrum = NULL; + + WebRtc_FreeBinaryDelayEstimator(self->binary_handle); + self->binary_handle = NULL; + + free(self); +} + +void* WebRtc_CreateDelayEstimator(void* farend_handle, int max_lookahead) { + DelayEstimator* self = NULL; + DelayEstimatorFarend* farend = (DelayEstimatorFarend*) farend_handle; + + if (farend_handle != NULL) { + self = static_cast<DelayEstimator*>(malloc(sizeof(DelayEstimator))); + } + + if (self != NULL) { + int memory_fail = 0; + + // Allocate memory for the farend spectrum handling. + self->binary_handle = + WebRtc_CreateBinaryDelayEstimator(farend->binary_farend, max_lookahead); + memory_fail |= (self->binary_handle == NULL); + + // Allocate memory for spectrum buffers. + self->mean_near_spectrum = static_cast<SpectrumType*>( + malloc(farend->spectrum_size * sizeof(SpectrumType))); + memory_fail |= (self->mean_near_spectrum == NULL); + + self->spectrum_size = farend->spectrum_size; + + if (memory_fail) { + WebRtc_FreeDelayEstimator(self); + self = NULL; + } + } + + return self; +} + +int WebRtc_InitDelayEstimator(void* handle) { + DelayEstimator* self = (DelayEstimator*) handle; + + if (self == NULL) { + return -1; + } + + // Initialize binary delay estimator. + WebRtc_InitBinaryDelayEstimator(self->binary_handle); + + // Set averaged far and near end spectra to zero. + memset(self->mean_near_spectrum, 0, + sizeof(SpectrumType) * self->spectrum_size); + // Reset initialization indicators. + self->near_spectrum_initialized = 0; + + return 0; +} + +int WebRtc_SoftResetDelayEstimator(void* handle, int delay_shift) { + DelayEstimator* self = (DelayEstimator*) handle; + RTC_DCHECK(self); + return WebRtc_SoftResetBinaryDelayEstimator(self->binary_handle, delay_shift); +} + +int WebRtc_set_history_size(void* handle, int history_size) { + DelayEstimator* self = static_cast<DelayEstimator*>(handle); + + if ((self == NULL) || (history_size <= 1)) { + return -1; + } + return WebRtc_AllocateHistoryBufferMemory(self->binary_handle, history_size); +} + +int WebRtc_history_size(const void* handle) { + const DelayEstimator* self = static_cast<const DelayEstimator*>(handle); + + if (self == NULL) { + return -1; + } + if (self->binary_handle->farend->history_size != + self->binary_handle->history_size) { + // Non matching history sizes. + return -1; + } + return self->binary_handle->history_size; +} + +int WebRtc_set_lookahead(void* handle, int lookahead) { + DelayEstimator* self = (DelayEstimator*) handle; + RTC_DCHECK(self); + RTC_DCHECK(self->binary_handle); + if ((lookahead > self->binary_handle->near_history_size - 1) || + (lookahead < 0)) { + return -1; + } + self->binary_handle->lookahead = lookahead; + return self->binary_handle->lookahead; +} + +int WebRtc_lookahead(void* handle) { + DelayEstimator* self = (DelayEstimator*) handle; + RTC_DCHECK(self); + RTC_DCHECK(self->binary_handle); + return self->binary_handle->lookahead; +} + +int WebRtc_set_allowed_offset(void* handle, int allowed_offset) { + DelayEstimator* self = (DelayEstimator*) handle; + + if ((self == NULL) || (allowed_offset < 0)) { + return -1; + } + self->binary_handle->allowed_offset = allowed_offset; + return 0; +} + +int WebRtc_get_allowed_offset(const void* handle) { + const DelayEstimator* self = (const DelayEstimator*) handle; + + if (self == NULL) { + return -1; + } + return self->binary_handle->allowed_offset; +} + +int WebRtc_enable_robust_validation(void* handle, int enable) { + DelayEstimator* self = (DelayEstimator*) handle; + + if (self == NULL) { + return -1; + } + if ((enable < 0) || (enable > 1)) { + return -1; + } + RTC_DCHECK(self->binary_handle); + self->binary_handle->robust_validation_enabled = enable; + return 0; +} + +int WebRtc_is_robust_validation_enabled(const void* handle) { + const DelayEstimator* self = (const DelayEstimator*) handle; + + if (self == NULL) { + return -1; + } + return self->binary_handle->robust_validation_enabled; +} + +int WebRtc_DelayEstimatorProcessFix(void* handle, + const uint16_t* near_spectrum, + int spectrum_size, + int near_q) { + DelayEstimator* self = (DelayEstimator*) handle; + uint32_t binary_spectrum = 0; + + if (self == NULL) { + return -1; + } + if (near_spectrum == NULL) { + // Empty near end spectrum. + return -1; + } + if (spectrum_size != self->spectrum_size) { + // Data sizes don't match. + return -1; + } + if (near_q > 15) { + // If |near_q| is larger than 15 we cannot guarantee no wrap around. + return -1; + } + + // Get binary spectra. + binary_spectrum = BinarySpectrumFix(near_spectrum, + self->mean_near_spectrum, + near_q, + &(self->near_spectrum_initialized)); + + return WebRtc_ProcessBinarySpectrum(self->binary_handle, binary_spectrum); +} + +int WebRtc_DelayEstimatorProcessFloat(void* handle, + const float* near_spectrum, + int spectrum_size) { + DelayEstimator* self = (DelayEstimator*) handle; + uint32_t binary_spectrum = 0; + + if (self == NULL) { + return -1; + } + if (near_spectrum == NULL) { + // Empty near end spectrum. + return -1; + } + if (spectrum_size != self->spectrum_size) { + // Data sizes don't match. + return -1; + } + + // Get binary spectrum. + binary_spectrum = BinarySpectrumFloat(near_spectrum, self->mean_near_spectrum, + &(self->near_spectrum_initialized)); + + return WebRtc_ProcessBinarySpectrum(self->binary_handle, binary_spectrum); +} + +int WebRtc_last_delay(void* handle) { + DelayEstimator* self = (DelayEstimator*) handle; + + if (self == NULL) { + return -1; + } + + return WebRtc_binary_last_delay(self->binary_handle); +} + +float WebRtc_last_delay_quality(void* handle) { + DelayEstimator* self = (DelayEstimator*) handle; + RTC_DCHECK(self); + return WebRtc_binary_last_delay_quality(self->binary_handle); +} diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h new file mode 100644 index 0000000000..6b6e51f82c --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Performs delay estimation on block by block basis. +// The return value is 0 - OK and -1 - Error, unless otherwise stated. + +#ifndef MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_WRAPPER_H_ +#define MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_WRAPPER_H_ + +#include "typedefs.h" // NOLINT(build/include) + +// Releases the memory allocated by WebRtc_CreateDelayEstimatorFarend(...) +void WebRtc_FreeDelayEstimatorFarend(void* handle); + +// Allocates the memory needed by the far-end part of the delay estimation. The +// memory needs to be initialized separately through +// WebRtc_InitDelayEstimatorFarend(...). +// +// Inputs: +// - spectrum_size : Size of the spectrum used both in far-end and +// near-end. Used to allocate memory for spectrum +// specific buffers. +// - history_size : The far-end history buffer size. A change in buffer +// size can be forced with WebRtc_set_history_size(). +// Note that the maximum delay which can be estimated is +// determined together with WebRtc_set_lookahead(). +// +// Return value: +// - void* : Created |handle|. If the memory can't be allocated or +// if any of the input parameters are invalid NULL is +// returned. +void* WebRtc_CreateDelayEstimatorFarend(int spectrum_size, int history_size); + +// Initializes the far-end part of the delay estimation instance returned by +// WebRtc_CreateDelayEstimatorFarend(...) +int WebRtc_InitDelayEstimatorFarend(void* handle); + +// Soft resets the far-end part of the delay estimation instance returned by +// WebRtc_CreateDelayEstimatorFarend(...). +// Input: +// - delay_shift : The amount of blocks to shift history buffers. +void WebRtc_SoftResetDelayEstimatorFarend(void* handle, int delay_shift); + +// Adds the far-end spectrum to the far-end history buffer. This spectrum is +// used as reference when calculating the delay using +// WebRtc_ProcessSpectrum(). +// +// Inputs: +// - far_spectrum : Far-end spectrum. +// - spectrum_size : The size of the data arrays (same for both far- and +// near-end). +// - far_q : The Q-domain of the far-end data. +// +// Output: +// - handle : Updated far-end instance. +// +int WebRtc_AddFarSpectrumFix(void* handle, + const uint16_t* far_spectrum, + int spectrum_size, + int far_q); + +// See WebRtc_AddFarSpectrumFix() for description. +int WebRtc_AddFarSpectrumFloat(void* handle, + const float* far_spectrum, + int spectrum_size); + +// Releases the memory allocated by WebRtc_CreateDelayEstimator(...) +void WebRtc_FreeDelayEstimator(void* handle); + +// Allocates the memory needed by the delay estimation. The memory needs to be +// initialized separately through WebRtc_InitDelayEstimator(...). +// +// Inputs: +// - farend_handle : Pointer to the far-end part of the delay estimation +// instance created prior to this call using +// WebRtc_CreateDelayEstimatorFarend(). +// +// Note that WebRtc_CreateDelayEstimator does not take +// ownership of |farend_handle|, which has to be torn +// down properly after this instance. +// +// - max_lookahead : Maximum amount of non-causal lookahead allowed. The +// actual amount of lookahead used can be controlled by +// WebRtc_set_lookahead(...). The default |lookahead| is +// set to |max_lookahead| at create time. Use +// WebRtc_set_lookahead(...) before start if a different +// value is desired. +// +// Using lookahead can detect cases in which a near-end +// signal occurs before the corresponding far-end signal. +// It will delay the estimate for the current block by an +// equal amount, and the returned values will be offset +// by it. +// +// A value of zero is the typical no-lookahead case. +// This also represents the minimum delay which can be +// estimated. +// +// Note that the effective range of delay estimates is +// [-|lookahead|,... ,|history_size|-|lookahead|) +// where |history_size| is set through +// WebRtc_set_history_size(). +// +// Return value: +// - void* : Created |handle|. If the memory can't be allocated or +// if any of the input parameters are invalid NULL is +// returned. +void* WebRtc_CreateDelayEstimator(void* farend_handle, int max_lookahead); + +// Initializes the delay estimation instance returned by +// WebRtc_CreateDelayEstimator(...) +int WebRtc_InitDelayEstimator(void* handle); + +// Soft resets the delay estimation instance returned by +// WebRtc_CreateDelayEstimator(...) +// Input: +// - delay_shift : The amount of blocks to shift history buffers. +// +// Return value: +// - actual_shifts : The actual number of shifts performed. +int WebRtc_SoftResetDelayEstimator(void* handle, int delay_shift); + +// Sets the effective |history_size| used. Valid values from 2. We simply need +// at least two delays to compare to perform an estimate. If |history_size| is +// changed, buffers are reallocated filling in with zeros if necessary. +// Note that changing the |history_size| affects both buffers in far-end and +// near-end. Hence it is important to change all DelayEstimators that use the +// same reference far-end, to the same |history_size| value. +// Inputs: +// - handle : Pointer to the delay estimation instance. +// - history_size : Effective history size to be used. +// Return value: +// - new_history_size : The new history size used. If the memory was not able +// to be allocated 0 is returned. +int WebRtc_set_history_size(void* handle, int history_size); + +// Returns the history_size currently used. +// Input: +// - handle : Pointer to the delay estimation instance. +int WebRtc_history_size(const void* handle); + +// Sets the amount of |lookahead| to use. Valid values are [0, max_lookahead] +// where |max_lookahead| was set at create time through +// WebRtc_CreateDelayEstimator(...). +// +// Input: +// - handle : Pointer to the delay estimation instance. +// - lookahead : The amount of lookahead to be used. +// +// Return value: +// - new_lookahead : The actual amount of lookahead set, unless |handle| is +// a NULL pointer or |lookahead| is invalid, for which an +// error is returned. +int WebRtc_set_lookahead(void* handle, int lookahead); + +// Returns the amount of lookahead we currently use. +// Input: +// - handle : Pointer to the delay estimation instance. +int WebRtc_lookahead(void* handle); + +// Sets the |allowed_offset| used in the robust validation scheme. If the +// delay estimator is used in an echo control component, this parameter is +// related to the filter length. In principle |allowed_offset| should be set to +// the echo control filter length minus the expected echo duration, i.e., the +// delay offset the echo control can handle without quality regression. The +// default value, used if not set manually, is zero. Note that |allowed_offset| +// has to be non-negative. +// Inputs: +// - handle : Pointer to the delay estimation instance. +// - allowed_offset : The amount of delay offset, measured in partitions, +// the echo control filter can handle. +int WebRtc_set_allowed_offset(void* handle, int allowed_offset); + +// Returns the |allowed_offset| in number of partitions. +int WebRtc_get_allowed_offset(const void* handle); + +// Enables/Disables a robust validation functionality in the delay estimation. +// This is by default set to disabled at create time. The state is preserved +// over a reset. +// Inputs: +// - handle : Pointer to the delay estimation instance. +// - enable : Enable (1) or disable (0) this feature. +int WebRtc_enable_robust_validation(void* handle, int enable); + +// Returns 1 if robust validation is enabled and 0 if disabled. +int WebRtc_is_robust_validation_enabled(const void* handle); + +// Estimates and returns the delay between the far-end and near-end blocks. The +// value will be offset by the lookahead (i.e. the lookahead should be +// subtracted from the returned value). +// Inputs: +// - handle : Pointer to the delay estimation instance. +// - near_spectrum : Pointer to the near-end spectrum data of the current +// block. +// - spectrum_size : The size of the data arrays (same for both far- and +// near-end). +// - near_q : The Q-domain of the near-end data. +// +// Output: +// - handle : Updated instance. +// +// Return value: +// - delay : >= 0 - Calculated delay value. +// -1 - Error. +// -2 - Insufficient data for estimation. +int WebRtc_DelayEstimatorProcessFix(void* handle, + const uint16_t* near_spectrum, + int spectrum_size, + int near_q); + +// See WebRtc_DelayEstimatorProcessFix() for description. +int WebRtc_DelayEstimatorProcessFloat(void* handle, + const float* near_spectrum, + int spectrum_size); + +// Returns the last calculated delay updated by the function +// WebRtc_DelayEstimatorProcess(...). +// +// Input: +// - handle : Pointer to the delay estimation instance. +// +// Return value: +// - delay : >= 0 - Last calculated delay value. +// -1 - Error. +// -2 - Insufficient data for estimation. +int WebRtc_last_delay(void* handle); + +// Returns the estimation quality/probability of the last calculated delay +// updated by the function WebRtc_DelayEstimatorProcess(...). The estimation +// quality is a value in the interval [0, 1]. The higher the value, the better +// the quality. +// +// Return value: +// - delay_quality : >= 0 - Estimation quality of last calculated delay. +float WebRtc_last_delay_quality(void* handle); + +#endif // MODULES_AUDIO_PROCESSING_UTILITY_DELAY_ESTIMATOR_WRAPPER_H_ diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.cc new file mode 100644 index 0000000000..59631117a2 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.cc @@ -0,0 +1,543 @@ +/* + * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html + * Copyright Takuya OOURA, 1996-2001 + * + * You may use, copy, modify and distribute this code for any purpose (include + * commercial use) and without fee. Please refer to this package when you modify + * this code. + * + * Changes by the WebRTC authors: + * - Trivial type modifications. + * - Minimal code subset to do rdft of length 128. + * - Optimizations because of known length. + * - Removed the global variables by moving the code in to a class in order + * to make it thread safe. + * + * All changes are covered by the WebRTC license and IP grant: + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing//utility/ooura_fft.h" + +#include <math.h> + +#include "modules/audio_processing/utility/ooura_fft_tables_common.h" +#include "system_wrappers/include/cpu_features_wrapper.h" +#include "typedefs.h" // NOLINT(build/include) + +namespace webrtc { + +namespace { + +#if !(defined(MIPS_FPU_LE) || defined(WEBRTC_HAS_NEON)) +static void cft1st_128_C(float* a) { + const int n = 128; + int j, k1, k2; + float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; + float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + // The processing of the first set of elements was simplified in C to avoid + // some operations (multiplication by zero or one, addition of two elements + // multiplied by the same weight, ...). + x0r = a[0] + a[2]; + x0i = a[1] + a[3]; + x1r = a[0] - a[2]; + x1i = a[1] - a[3]; + x2r = a[4] + a[6]; + x2i = a[5] + a[7]; + x3r = a[4] - a[6]; + x3i = a[5] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[2] = x1r - x3i; + a[3] = x1i + x3r; + a[6] = x1r + x3i; + a[7] = x1i - x3r; + wk1r = rdft_w[2]; + x0r = a[8] + a[10]; + x0i = a[9] + a[11]; + x1r = a[8] - a[10]; + x1i = a[9] - a[11]; + x2r = a[12] + a[14]; + x2i = a[13] + a[15]; + x3r = a[12] - a[14]; + x3i = a[13] - a[15]; + a[8] = x0r + x2r; + a[9] = x0i + x2i; + a[12] = x2i - x0i; + a[13] = x0r - x2r; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[10] = wk1r * (x0r - x0i); + a[11] = wk1r * (x0r + x0i); + x0r = x3i + x1r; + x0i = x3r - x1i; + a[14] = wk1r * (x0i - x0r); + a[15] = wk1r * (x0i + x0r); + k1 = 0; + for (j = 16; j < n; j += 16) { + k1 += 2; + k2 = 2 * k1; + wk2r = rdft_w[k1 + 0]; + wk2i = rdft_w[k1 + 1]; + wk1r = rdft_w[k2 + 0]; + wk1i = rdft_w[k2 + 1]; + wk3r = rdft_wk3ri_first[k1 + 0]; + wk3i = rdft_wk3ri_first[k1 + 1]; + x0r = a[j + 0] + a[j + 2]; + x0i = a[j + 1] + a[j + 3]; + x1r = a[j + 0] - a[j + 2]; + x1i = a[j + 1] - a[j + 3]; + x2r = a[j + 4] + a[j + 6]; + x2i = a[j + 5] + a[j + 7]; + x3r = a[j + 4] - a[j + 6]; + x3i = a[j + 5] - a[j + 7]; + a[j + 0] = x0r + x2r; + a[j + 1] = x0i + x2i; + x0r -= x2r; + x0i -= x2i; + a[j + 4] = wk2r * x0r - wk2i * x0i; + a[j + 5] = wk2r * x0i + wk2i * x0r; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j + 2] = wk1r * x0r - wk1i * x0i; + a[j + 3] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j + 6] = wk3r * x0r - wk3i * x0i; + a[j + 7] = wk3r * x0i + wk3i * x0r; + wk1r = rdft_w[k2 + 2]; + wk1i = rdft_w[k2 + 3]; + wk3r = rdft_wk3ri_second[k1 + 0]; + wk3i = rdft_wk3ri_second[k1 + 1]; + x0r = a[j + 8] + a[j + 10]; + x0i = a[j + 9] + a[j + 11]; + x1r = a[j + 8] - a[j + 10]; + x1i = a[j + 9] - a[j + 11]; + x2r = a[j + 12] + a[j + 14]; + x2i = a[j + 13] + a[j + 15]; + x3r = a[j + 12] - a[j + 14]; + x3i = a[j + 13] - a[j + 15]; + a[j + 8] = x0r + x2r; + a[j + 9] = x0i + x2i; + x0r -= x2r; + x0i -= x2i; + a[j + 12] = -wk2i * x0r - wk2r * x0i; + a[j + 13] = -wk2i * x0i + wk2r * x0r; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j + 10] = wk1r * x0r - wk1i * x0i; + a[j + 11] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j + 14] = wk3r * x0r - wk3i * x0i; + a[j + 15] = wk3r * x0i + wk3i * x0r; + } +} + +static void cftmdl_128_C(float* a) { + const int l = 8; + const int n = 128; + const int m = 32; + int j0, j1, j2, j3, k, k1, k2, m2; + float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; + float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + for (j0 = 0; j0 < l; j0 += 2) { + j1 = j0 + 8; + j2 = j0 + 16; + j3 = j0 + 24; + x0r = a[j0 + 0] + a[j1 + 0]; + x0i = a[j0 + 1] + a[j1 + 1]; + x1r = a[j0 + 0] - a[j1 + 0]; + x1i = a[j0 + 1] - a[j1 + 1]; + x2r = a[j2 + 0] + a[j3 + 0]; + x2i = a[j2 + 1] + a[j3 + 1]; + x3r = a[j2 + 0] - a[j3 + 0]; + x3i = a[j2 + 1] - a[j3 + 1]; + a[j0 + 0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j2 + 0] = x0r - x2r; + a[j2 + 1] = x0i - x2i; + a[j1 + 0] = x1r - x3i; + a[j1 + 1] = x1i + x3r; + a[j3 + 0] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + } + wk1r = rdft_w[2]; + for (j0 = m; j0 < l + m; j0 += 2) { + j1 = j0 + 8; + j2 = j0 + 16; + j3 = j0 + 24; + x0r = a[j0 + 0] + a[j1 + 0]; + x0i = a[j0 + 1] + a[j1 + 1]; + x1r = a[j0 + 0] - a[j1 + 0]; + x1i = a[j0 + 1] - a[j1 + 1]; + x2r = a[j2 + 0] + a[j3 + 0]; + x2i = a[j2 + 1] + a[j3 + 1]; + x3r = a[j2 + 0] - a[j3 + 0]; + x3i = a[j2 + 1] - a[j3 + 1]; + a[j0 + 0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j2 + 0] = x2i - x0i; + a[j2 + 1] = x0r - x2r; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j1 + 0] = wk1r * (x0r - x0i); + a[j1 + 1] = wk1r * (x0r + x0i); + x0r = x3i + x1r; + x0i = x3r - x1i; + a[j3 + 0] = wk1r * (x0i - x0r); + a[j3 + 1] = wk1r * (x0i + x0r); + } + k1 = 0; + m2 = 2 * m; + for (k = m2; k < n; k += m2) { + k1 += 2; + k2 = 2 * k1; + wk2r = rdft_w[k1 + 0]; + wk2i = rdft_w[k1 + 1]; + wk1r = rdft_w[k2 + 0]; + wk1i = rdft_w[k2 + 1]; + wk3r = rdft_wk3ri_first[k1 + 0]; + wk3i = rdft_wk3ri_first[k1 + 1]; + for (j0 = k; j0 < l + k; j0 += 2) { + j1 = j0 + 8; + j2 = j0 + 16; + j3 = j0 + 24; + x0r = a[j0 + 0] + a[j1 + 0]; + x0i = a[j0 + 1] + a[j1 + 1]; + x1r = a[j0 + 0] - a[j1 + 0]; + x1i = a[j0 + 1] - a[j1 + 1]; + x2r = a[j2 + 0] + a[j3 + 0]; + x2i = a[j2 + 1] + a[j3 + 1]; + x3r = a[j2 + 0] - a[j3 + 0]; + x3i = a[j2 + 1] - a[j3 + 1]; + a[j0 + 0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + x0r -= x2r; + x0i -= x2i; + a[j2 + 0] = wk2r * x0r - wk2i * x0i; + a[j2 + 1] = wk2r * x0i + wk2i * x0r; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j1 + 0] = wk1r * x0r - wk1i * x0i; + a[j1 + 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 + 0] = wk3r * x0r - wk3i * x0i; + a[j3 + 1] = wk3r * x0i + wk3i * x0r; + } + wk1r = rdft_w[k2 + 2]; + wk1i = rdft_w[k2 + 3]; + wk3r = rdft_wk3ri_second[k1 + 0]; + wk3i = rdft_wk3ri_second[k1 + 1]; + for (j0 = k + m; j0 < l + (k + m); j0 += 2) { + j1 = j0 + 8; + j2 = j0 + 16; + j3 = j0 + 24; + x0r = a[j0 + 0] + a[j1 + 0]; + x0i = a[j0 + 1] + a[j1 + 1]; + x1r = a[j0 + 0] - a[j1 + 0]; + x1i = a[j0 + 1] - a[j1 + 1]; + x2r = a[j2 + 0] + a[j3 + 0]; + x2i = a[j2 + 1] + a[j3 + 1]; + x3r = a[j2 + 0] - a[j3 + 0]; + x3i = a[j2 + 1] - a[j3 + 1]; + a[j0 + 0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + x0r -= x2r; + x0i -= x2i; + a[j2 + 0] = -wk2i * x0r - wk2r * x0i; + a[j2 + 1] = -wk2i * x0i + wk2r * x0r; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j1 + 0] = wk1r * x0r - wk1i * x0i; + a[j1 + 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 + 0] = wk3r * x0r - wk3i * x0i; + a[j3 + 1] = wk3r * x0i + wk3i * x0r; + } + } +} + +static void rftfsub_128_C(float* a) { + const float* c = rdft_w + 32; + int j1, j2, k1, k2; + float wkr, wki, xr, xi, yr, yi; + + for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) { + k2 = 128 - j2; + k1 = 32 - j1; + wkr = 0.5f - c[k1]; + wki = c[j1]; + xr = a[j2 + 0] - a[k2 + 0]; + xi = a[j2 + 1] + a[k2 + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + a[j2 + 0] -= yr; + a[j2 + 1] -= yi; + a[k2 + 0] += yr; + a[k2 + 1] -= yi; + } +} + +static void rftbsub_128_C(float* a) { + const float* c = rdft_w + 32; + int j1, j2, k1, k2; + float wkr, wki, xr, xi, yr, yi; + + a[1] = -a[1]; + for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) { + k2 = 128 - j2; + k1 = 32 - j1; + wkr = 0.5f - c[k1]; + wki = c[j1]; + xr = a[j2 + 0] - a[k2 + 0]; + xi = a[j2 + 1] + a[k2 + 1]; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j2 + 0] = a[j2 + 0] - yr; + a[j2 + 1] = yi - a[j2 + 1]; + a[k2 + 0] = yr + a[k2 + 0]; + a[k2 + 1] = yi - a[k2 + 1]; + } + a[65] = -a[65]; +} +#endif + + +} // namespace + +OouraFft::OouraFft() { +#if defined(WEBRTC_ARCH_X86_FAMILY) + use_sse2_ = (WebRtc_GetCPUInfo(kSSE2) != 0); +#else + use_sse2_ = false; +#endif +} + +OouraFft::~OouraFft() = default; + +void OouraFft::Fft(float* a) const { + float xi; + bitrv2_128(a); + cftfsub_128(a); + rftfsub_128(a); + xi = a[0] - a[1]; + a[0] += a[1]; + a[1] = xi; +} +void OouraFft::InverseFft(float* a) const { + a[1] = 0.5f * (a[0] - a[1]); + a[0] -= a[1]; + rftbsub_128(a); + bitrv2_128(a); + cftbsub_128(a); +} + +void OouraFft::cft1st_128(float* a) const { +#if defined(MIPS_FPU_LE) + cft1st_128_mips(a); +#elif defined(WEBRTC_HAS_NEON) + cft1st_128_neon(a); +#elif defined(WEBRTC_ARCH_X86_FAMILY) + if (use_sse2_) { + cft1st_128_SSE2(a); + } else { + cft1st_128_C(a); + } +#else + cft1st_128_C(a); +#endif +} +void OouraFft::cftmdl_128(float* a) const { +#if defined(MIPS_FPU_LE) + cftmdl_128_mips(a); +#elif defined(WEBRTC_HAS_NEON) + cftmdl_128_neon(a); +#elif defined(WEBRTC_ARCH_X86_FAMILY) + if (use_sse2_) { + cftmdl_128_SSE2(a); + } else { + cftmdl_128_C(a); + } +#else + cftmdl_128_C(a); +#endif +} +void OouraFft::rftfsub_128(float* a) const { +#if defined(MIPS_FPU_LE) + rftfsub_128_mips(a); +#elif defined(WEBRTC_HAS_NEON) + rftfsub_128_neon(a); +#elif defined(WEBRTC_ARCH_X86_FAMILY) + if (use_sse2_) { + rftfsub_128_SSE2(a); + } else { + rftfsub_128_C(a); + } +#else + rftfsub_128_C(a); +#endif +} + +void OouraFft::rftbsub_128(float* a) const { +#if defined(MIPS_FPU_LE) + rftbsub_128_mips(a); +#elif defined(WEBRTC_HAS_NEON) + rftbsub_128_neon(a); +#elif defined(WEBRTC_ARCH_X86_FAMILY) + if (use_sse2_) { + rftbsub_128_SSE2(a); + } else { + rftbsub_128_C(a); + } +#else + rftbsub_128_C(a); +#endif +} + +void OouraFft::cftbsub_128(float* a) const { + int j, j1, j2, j3, l; + float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + cft1st_128(a); + cftmdl_128(a); + l = 32; + + for (j = 0; j < l; j += 2) { + j1 = j + l; + j2 = j1 + l; + j3 = j2 + l; + x0r = a[j] + a[j1]; + x0i = -a[j + 1] - a[j1 + 1]; + x1r = a[j] - a[j1]; + x1i = -a[j + 1] + a[j1 + 1]; + x2r = a[j2] + a[j3]; + x2i = a[j2 + 1] + a[j3 + 1]; + x3r = a[j2] - a[j3]; + x3i = a[j2 + 1] - a[j3 + 1]; + a[j] = x0r + x2r; + a[j + 1] = x0i - x2i; + a[j2] = x0r - x2r; + a[j2 + 1] = x0i + x2i; + a[j1] = x1r - x3i; + a[j1 + 1] = x1i - x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i + x3r; + } +} + +void OouraFft::cftfsub_128(float* a) const { + int j, j1, j2, j3, l; + float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + cft1st_128(a); + cftmdl_128(a); + l = 32; + for (j = 0; j < l; j += 2) { + j1 = j + l; + j2 = j1 + l; + j3 = j2 + l; + x0r = a[j] + a[j1]; + x0i = a[j + 1] + a[j1 + 1]; + x1r = a[j] - a[j1]; + x1i = a[j + 1] - a[j1 + 1]; + x2r = a[j2] + a[j3]; + x2i = a[j2 + 1] + a[j3 + 1]; + x3r = a[j2] - a[j3]; + x3i = a[j2 + 1] - a[j3 + 1]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j2] = x0r - x2r; + a[j2 + 1] = x0i - x2i; + a[j1] = x1r - x3i; + a[j1 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + } +} + +void OouraFft::bitrv2_128(float* a) const { + /* + Following things have been attempted but are no faster: + (a) Storing the swap indexes in a LUT (index calculations are done + for 'free' while waiting on memory/L1). + (b) Consolidate the load/store of two consecutive floats by a 64 bit + integer (execution is memory/L1 bound). + (c) Do a mix of floats and 64 bit integer to maximize register + utilization (execution is memory/L1 bound). + (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5). + (e) Hard-coding of the offsets to completely eliminates index + calculations. + */ + + unsigned int j, j1, k, k1; + float xr, xi, yr, yi; + + const int ip[4] = {0, 64, 32, 96}; + for (k = 0; k < 4; k++) { + for (j = 0; j < k; j++) { + j1 = 2 * j + ip[k]; + k1 = 2 * k + ip[j]; + xr = a[j1 + 0]; + xi = a[j1 + 1]; + yr = a[k1 + 0]; + yi = a[k1 + 1]; + a[j1 + 0] = yr; + a[j1 + 1] = yi; + a[k1 + 0] = xr; + a[k1 + 1] = xi; + j1 += 8; + k1 += 16; + xr = a[j1 + 0]; + xi = a[j1 + 1]; + yr = a[k1 + 0]; + yi = a[k1 + 1]; + a[j1 + 0] = yr; + a[j1 + 1] = yi; + a[k1 + 0] = xr; + a[k1 + 1] = xi; + j1 += 8; + k1 -= 8; + xr = a[j1 + 0]; + xi = a[j1 + 1]; + yr = a[k1 + 0]; + yi = a[k1 + 1]; + a[j1 + 0] = yr; + a[j1 + 1] = yi; + a[k1 + 0] = xr; + a[k1 + 1] = xi; + j1 += 8; + k1 += 16; + xr = a[j1 + 0]; + xi = a[j1 + 1]; + yr = a[k1 + 0]; + yi = a[k1 + 1]; + a[j1 + 0] = yr; + a[j1 + 1] = yi; + a[k1 + 0] = xr; + a[k1 + 1] = xi; + } + j1 = 2 * k + 8 + ip[k]; + k1 = j1 + 8; + xr = a[j1 + 0]; + xi = a[j1 + 1]; + yr = a[k1 + 0]; + yi = a[k1 + 1]; + a[j1 + 0] = yr; + a[j1 + 1] = yi; + a[k1 + 0] = xr; + a[k1 + 1] = xi; + } +} + +} // namespace webrtc diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.h new file mode 100644 index 0000000000..96d57dc908 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_ +#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_ + +#include "typedefs.h" // NOLINT(build/include) + +namespace webrtc { + +#if defined(WEBRTC_ARCH_X86_FAMILY) +void cft1st_128_SSE2(float* a); +void cftmdl_128_SSE2(float* a); +void rftfsub_128_SSE2(float* a); +void rftbsub_128_SSE2(float* a); +#endif + +#if defined(MIPS_FPU_LE) +void cft1st_128_mips(float* a); +void cftmdl_128_mips(float* a); +void rftfsub_128_mips(float* a); +void rftbsub_128_mips(float* a); +#endif + +#if defined(WEBRTC_HAS_NEON) +void cft1st_128_neon(float* a); +void cftmdl_128_neon(float* a); +void rftfsub_128_neon(float* a); +void rftbsub_128_neon(float* a); +#endif + +class OouraFft { + public: + OouraFft(); + ~OouraFft(); + void Fft(float* a) const; + void InverseFft(float* a) const; + + private: + void cft1st_128(float* a) const; + void cftmdl_128(float* a) const; + void rftfsub_128(float* a) const; + void rftbsub_128(float* a) const; + + void cftfsub_128(float* a) const; + void cftbsub_128(float* a) const; + void bitrv2_128(float* a) const; + bool use_sse2_; +}; + +} // namespace webrtc + +#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_H_ diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_mips.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_mips.cc new file mode 100644 index 0000000000..569e1d7e82 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_mips.cc @@ -0,0 +1,1185 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/utility/ooura_fft.h" + +#include "modules/audio_processing/utility/ooura_fft_tables_common.h" +#include "typedefs.h" // NOLINT(build/include) + +namespace webrtc { + +#if defined(MIPS_FPU_LE) +void bitrv2_128_mips(float* a) { + // n is 128 + float xr, xi, yr, yi; + + xr = a[8]; + xi = a[9]; + yr = a[16]; + yi = a[17]; + a[8] = yr; + a[9] = yi; + a[16] = xr; + a[17] = xi; + + xr = a[64]; + xi = a[65]; + yr = a[2]; + yi = a[3]; + a[64] = yr; + a[65] = yi; + a[2] = xr; + a[3] = xi; + + xr = a[72]; + xi = a[73]; + yr = a[18]; + yi = a[19]; + a[72] = yr; + a[73] = yi; + a[18] = xr; + a[19] = xi; + + xr = a[80]; + xi = a[81]; + yr = a[10]; + yi = a[11]; + a[80] = yr; + a[81] = yi; + a[10] = xr; + a[11] = xi; + + xr = a[88]; + xi = a[89]; + yr = a[26]; + yi = a[27]; + a[88] = yr; + a[89] = yi; + a[26] = xr; + a[27] = xi; + + xr = a[74]; + xi = a[75]; + yr = a[82]; + yi = a[83]; + a[74] = yr; + a[75] = yi; + a[82] = xr; + a[83] = xi; + + xr = a[32]; + xi = a[33]; + yr = a[4]; + yi = a[5]; + a[32] = yr; + a[33] = yi; + a[4] = xr; + a[5] = xi; + + xr = a[40]; + xi = a[41]; + yr = a[20]; + yi = a[21]; + a[40] = yr; + a[41] = yi; + a[20] = xr; + a[21] = xi; + + xr = a[48]; + xi = a[49]; + yr = a[12]; + yi = a[13]; + a[48] = yr; + a[49] = yi; + a[12] = xr; + a[13] = xi; + + xr = a[56]; + xi = a[57]; + yr = a[28]; + yi = a[29]; + a[56] = yr; + a[57] = yi; + a[28] = xr; + a[29] = xi; + + xr = a[34]; + xi = a[35]; + yr = a[68]; + yi = a[69]; + a[34] = yr; + a[35] = yi; + a[68] = xr; + a[69] = xi; + + xr = a[42]; + xi = a[43]; + yr = a[84]; + yi = a[85]; + a[42] = yr; + a[43] = yi; + a[84] = xr; + a[85] = xi; + + xr = a[50]; + xi = a[51]; + yr = a[76]; + yi = a[77]; + a[50] = yr; + a[51] = yi; + a[76] = xr; + a[77] = xi; + + xr = a[58]; + xi = a[59]; + yr = a[92]; + yi = a[93]; + a[58] = yr; + a[59] = yi; + a[92] = xr; + a[93] = xi; + + xr = a[44]; + xi = a[45]; + yr = a[52]; + yi = a[53]; + a[44] = yr; + a[45] = yi; + a[52] = xr; + a[53] = xi; + + xr = a[96]; + xi = a[97]; + yr = a[6]; + yi = a[7]; + a[96] = yr; + a[97] = yi; + a[6] = xr; + a[7] = xi; + + xr = a[104]; + xi = a[105]; + yr = a[22]; + yi = a[23]; + a[104] = yr; + a[105] = yi; + a[22] = xr; + a[23] = xi; + + xr = a[112]; + xi = a[113]; + yr = a[14]; + yi = a[15]; + a[112] = yr; + a[113] = yi; + a[14] = xr; + a[15] = xi; + + xr = a[120]; + xi = a[121]; + yr = a[30]; + yi = a[31]; + a[120] = yr; + a[121] = yi; + a[30] = xr; + a[31] = xi; + + xr = a[98]; + xi = a[99]; + yr = a[70]; + yi = a[71]; + a[98] = yr; + a[99] = yi; + a[70] = xr; + a[71] = xi; + + xr = a[106]; + xi = a[107]; + yr = a[86]; + yi = a[87]; + a[106] = yr; + a[107] = yi; + a[86] = xr; + a[87] = xi; + + xr = a[114]; + xi = a[115]; + yr = a[78]; + yi = a[79]; + a[114] = yr; + a[115] = yi; + a[78] = xr; + a[79] = xi; + + xr = a[122]; + xi = a[123]; + yr = a[94]; + yi = a[95]; + a[122] = yr; + a[123] = yi; + a[94] = xr; + a[95] = xi; + + xr = a[100]; + xi = a[101]; + yr = a[38]; + yi = a[39]; + a[100] = yr; + a[101] = yi; + a[38] = xr; + a[39] = xi; + + xr = a[108]; + xi = a[109]; + yr = a[54]; + yi = a[55]; + a[108] = yr; + a[109] = yi; + a[54] = xr; + a[55] = xi; + + xr = a[116]; + xi = a[117]; + yr = a[46]; + yi = a[47]; + a[116] = yr; + a[117] = yi; + a[46] = xr; + a[47] = xi; + + xr = a[124]; + xi = a[125]; + yr = a[62]; + yi = a[63]; + a[124] = yr; + a[125] = yi; + a[62] = xr; + a[63] = xi; + + xr = a[110]; + xi = a[111]; + yr = a[118]; + yi = a[119]; + a[110] = yr; + a[111] = yi; + a[118] = xr; + a[119] = xi; +} + +void cft1st_128_mips(float* a) { + float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14; + int a_ptr, p1_rdft, p2_rdft, count; + const float* first = rdft_wk3ri_first; + const float* second = rdft_wk3ri_second; + + __asm __volatile ( + ".set push \n\t" + ".set noreorder \n\t" + // first 8 + "lwc1 %[f0], 0(%[a]) \n\t" + "lwc1 %[f1], 4(%[a]) \n\t" + "lwc1 %[f2], 8(%[a]) \n\t" + "lwc1 %[f3], 12(%[a]) \n\t" + "lwc1 %[f4], 16(%[a]) \n\t" + "lwc1 %[f5], 20(%[a]) \n\t" + "lwc1 %[f6], 24(%[a]) \n\t" + "lwc1 %[f7], 28(%[a]) \n\t" + "add.s %[f8], %[f0], %[f2] \n\t" + "sub.s %[f0], %[f0], %[f2] \n\t" + "add.s %[f2], %[f4], %[f6] \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "add.s %[f6], %[f1], %[f3] \n\t" + "sub.s %[f1], %[f1], %[f3] \n\t" + "add.s %[f3], %[f5], %[f7] \n\t" + "sub.s %[f5], %[f5], %[f7] \n\t" + "add.s %[f7], %[f8], %[f2] \n\t" + "sub.s %[f8], %[f8], %[f2] \n\t" + "sub.s %[f2], %[f1], %[f4] \n\t" + "add.s %[f1], %[f1], %[f4] \n\t" + "add.s %[f4], %[f6], %[f3] \n\t" + "sub.s %[f6], %[f6], %[f3] \n\t" + "sub.s %[f3], %[f0], %[f5] \n\t" + "add.s %[f0], %[f0], %[f5] \n\t" + "swc1 %[f7], 0(%[a]) \n\t" + "swc1 %[f8], 16(%[a]) \n\t" + "swc1 %[f2], 28(%[a]) \n\t" + "swc1 %[f1], 12(%[a]) \n\t" + "swc1 %[f4], 4(%[a]) \n\t" + "swc1 %[f6], 20(%[a]) \n\t" + "swc1 %[f3], 8(%[a]) \n\t" + "swc1 %[f0], 24(%[a]) \n\t" + // second 8 + "lwc1 %[f0], 32(%[a]) \n\t" + "lwc1 %[f1], 36(%[a]) \n\t" + "lwc1 %[f2], 40(%[a]) \n\t" + "lwc1 %[f3], 44(%[a]) \n\t" + "lwc1 %[f4], 48(%[a]) \n\t" + "lwc1 %[f5], 52(%[a]) \n\t" + "lwc1 %[f6], 56(%[a]) \n\t" + "lwc1 %[f7], 60(%[a]) \n\t" + "add.s %[f8], %[f4], %[f6] \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "add.s %[f6], %[f1], %[f3] \n\t" + "sub.s %[f1], %[f1], %[f3] \n\t" + "add.s %[f3], %[f0], %[f2] \n\t" + "sub.s %[f0], %[f0], %[f2] \n\t" + "add.s %[f2], %[f5], %[f7] \n\t" + "sub.s %[f5], %[f5], %[f7] \n\t" + "add.s %[f7], %[f4], %[f1] \n\t" + "sub.s %[f4], %[f4], %[f1] \n\t" + "add.s %[f1], %[f3], %[f8] \n\t" + "sub.s %[f3], %[f3], %[f8] \n\t" + "sub.s %[f8], %[f0], %[f5] \n\t" + "add.s %[f0], %[f0], %[f5] \n\t" + "add.s %[f5], %[f6], %[f2] \n\t" + "sub.s %[f6], %[f2], %[f6] \n\t" + "lwc1 %[f9], 8(%[rdft_w]) \n\t" + "sub.s %[f2], %[f8], %[f7] \n\t" + "add.s %[f8], %[f8], %[f7] \n\t" + "sub.s %[f7], %[f4], %[f0] \n\t" + "add.s %[f4], %[f4], %[f0] \n\t" + // prepare for loop + "addiu %[a_ptr], %[a], 64 \n\t" + "addiu %[p1_rdft], %[rdft_w], 8 \n\t" + "addiu %[p2_rdft], %[rdft_w], 16 \n\t" + "addiu %[count], $zero, 7 \n\t" + // finish second 8 + "mul.s %[f2], %[f9], %[f2] \n\t" + "mul.s %[f8], %[f9], %[f8] \n\t" + "mul.s %[f7], %[f9], %[f7] \n\t" + "mul.s %[f4], %[f9], %[f4] \n\t" + "swc1 %[f1], 32(%[a]) \n\t" + "swc1 %[f3], 52(%[a]) \n\t" + "swc1 %[f5], 36(%[a]) \n\t" + "swc1 %[f6], 48(%[a]) \n\t" + "swc1 %[f2], 40(%[a]) \n\t" + "swc1 %[f8], 44(%[a]) \n\t" + "swc1 %[f7], 56(%[a]) \n\t" + "swc1 %[f4], 60(%[a]) \n\t" + // loop + "1: \n\t" + "lwc1 %[f0], 0(%[a_ptr]) \n\t" + "lwc1 %[f1], 4(%[a_ptr]) \n\t" + "lwc1 %[f2], 8(%[a_ptr]) \n\t" + "lwc1 %[f3], 12(%[a_ptr]) \n\t" + "lwc1 %[f4], 16(%[a_ptr]) \n\t" + "lwc1 %[f5], 20(%[a_ptr]) \n\t" + "lwc1 %[f6], 24(%[a_ptr]) \n\t" + "lwc1 %[f7], 28(%[a_ptr]) \n\t" + "add.s %[f8], %[f0], %[f2] \n\t" + "sub.s %[f0], %[f0], %[f2] \n\t" + "add.s %[f2], %[f4], %[f6] \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "add.s %[f6], %[f1], %[f3] \n\t" + "sub.s %[f1], %[f1], %[f3] \n\t" + "add.s %[f3], %[f5], %[f7] \n\t" + "sub.s %[f5], %[f5], %[f7] \n\t" + "lwc1 %[f10], 4(%[p1_rdft]) \n\t" + "lwc1 %[f11], 0(%[p2_rdft]) \n\t" + "lwc1 %[f12], 4(%[p2_rdft]) \n\t" + "lwc1 %[f13], 8(%[first]) \n\t" + "lwc1 %[f14], 12(%[first]) \n\t" + "add.s %[f7], %[f8], %[f2] \n\t" + "sub.s %[f8], %[f8], %[f2] \n\t" + "add.s %[f2], %[f6], %[f3] \n\t" + "sub.s %[f6], %[f6], %[f3] \n\t" + "add.s %[f3], %[f0], %[f5] \n\t" + "sub.s %[f0], %[f0], %[f5] \n\t" + "add.s %[f5], %[f1], %[f4] \n\t" + "sub.s %[f1], %[f1], %[f4] \n\t" + "swc1 %[f7], 0(%[a_ptr]) \n\t" + "swc1 %[f2], 4(%[a_ptr]) \n\t" + "mul.s %[f4], %[f9], %[f8] \n\t" +#if defined(MIPS32_R2_LE) + "mul.s %[f8], %[f10], %[f8] \n\t" + "mul.s %[f7], %[f11], %[f0] \n\t" + "mul.s %[f0], %[f12], %[f0] \n\t" + "mul.s %[f2], %[f13], %[f3] \n\t" + "mul.s %[f3], %[f14], %[f3] \n\t" + "nmsub.s %[f4], %[f4], %[f10], %[f6] \n\t" + "madd.s %[f8], %[f8], %[f9], %[f6] \n\t" + "nmsub.s %[f7], %[f7], %[f12], %[f5] \n\t" + "madd.s %[f0], %[f0], %[f11], %[f5] \n\t" + "nmsub.s %[f2], %[f2], %[f14], %[f1] \n\t" + "madd.s %[f3], %[f3], %[f13], %[f1] \n\t" +#else + "mul.s %[f7], %[f10], %[f6] \n\t" + "mul.s %[f6], %[f9], %[f6] \n\t" + "mul.s %[f8], %[f10], %[f8] \n\t" + "mul.s %[f2], %[f11], %[f0] \n\t" + "mul.s %[f11], %[f11], %[f5] \n\t" + "mul.s %[f5], %[f12], %[f5] \n\t" + "mul.s %[f0], %[f12], %[f0] \n\t" + "mul.s %[f12], %[f13], %[f3] \n\t" + "mul.s %[f13], %[f13], %[f1] \n\t" + "mul.s %[f1], %[f14], %[f1] \n\t" + "mul.s %[f3], %[f14], %[f3] \n\t" + "sub.s %[f4], %[f4], %[f7] \n\t" + "add.s %[f8], %[f6], %[f8] \n\t" + "sub.s %[f7], %[f2], %[f5] \n\t" + "add.s %[f0], %[f11], %[f0] \n\t" + "sub.s %[f2], %[f12], %[f1] \n\t" + "add.s %[f3], %[f13], %[f3] \n\t" +#endif + "swc1 %[f4], 16(%[a_ptr]) \n\t" + "swc1 %[f8], 20(%[a_ptr]) \n\t" + "swc1 %[f7], 8(%[a_ptr]) \n\t" + "swc1 %[f0], 12(%[a_ptr]) \n\t" + "swc1 %[f2], 24(%[a_ptr]) \n\t" + "swc1 %[f3], 28(%[a_ptr]) \n\t" + "lwc1 %[f0], 32(%[a_ptr]) \n\t" + "lwc1 %[f1], 36(%[a_ptr]) \n\t" + "lwc1 %[f2], 40(%[a_ptr]) \n\t" + "lwc1 %[f3], 44(%[a_ptr]) \n\t" + "lwc1 %[f4], 48(%[a_ptr]) \n\t" + "lwc1 %[f5], 52(%[a_ptr]) \n\t" + "lwc1 %[f6], 56(%[a_ptr]) \n\t" + "lwc1 %[f7], 60(%[a_ptr]) \n\t" + "add.s %[f8], %[f0], %[f2] \n\t" + "sub.s %[f0], %[f0], %[f2] \n\t" + "add.s %[f2], %[f4], %[f6] \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "add.s %[f6], %[f1], %[f3] \n\t" + "sub.s %[f1], %[f1], %[f3] \n\t" + "add.s %[f3], %[f5], %[f7] \n\t" + "sub.s %[f5], %[f5], %[f7] \n\t" + "lwc1 %[f11], 8(%[p2_rdft]) \n\t" + "lwc1 %[f12], 12(%[p2_rdft]) \n\t" + "lwc1 %[f13], 8(%[second]) \n\t" + "lwc1 %[f14], 12(%[second]) \n\t" + "add.s %[f7], %[f8], %[f2] \n\t" + "sub.s %[f8], %[f2], %[f8] \n\t" + "add.s %[f2], %[f6], %[f3] \n\t" + "sub.s %[f6], %[f3], %[f6] \n\t" + "add.s %[f3], %[f0], %[f5] \n\t" + "sub.s %[f0], %[f0], %[f5] \n\t" + "add.s %[f5], %[f1], %[f4] \n\t" + "sub.s %[f1], %[f1], %[f4] \n\t" + "swc1 %[f7], 32(%[a_ptr]) \n\t" + "swc1 %[f2], 36(%[a_ptr]) \n\t" + "mul.s %[f4], %[f10], %[f8] \n\t" +#if defined(MIPS32_R2_LE) + "mul.s %[f10], %[f10], %[f6] \n\t" + "mul.s %[f7], %[f11], %[f0] \n\t" + "mul.s %[f11], %[f11], %[f5] \n\t" + "mul.s %[f2], %[f13], %[f3] \n\t" + "mul.s %[f13], %[f13], %[f1] \n\t" + "madd.s %[f4], %[f4], %[f9], %[f6] \n\t" + "nmsub.s %[f10], %[f10], %[f9], %[f8] \n\t" + "nmsub.s %[f7], %[f7], %[f12], %[f5] \n\t" + "madd.s %[f11], %[f11], %[f12], %[f0] \n\t" + "nmsub.s %[f2], %[f2], %[f14], %[f1] \n\t" + "madd.s %[f13], %[f13], %[f14], %[f3] \n\t" +#else + "mul.s %[f2], %[f9], %[f6] \n\t" + "mul.s %[f10], %[f10], %[f6] \n\t" + "mul.s %[f9], %[f9], %[f8] \n\t" + "mul.s %[f7], %[f11], %[f0] \n\t" + "mul.s %[f8], %[f12], %[f5] \n\t" + "mul.s %[f11], %[f11], %[f5] \n\t" + "mul.s %[f12], %[f12], %[f0] \n\t" + "mul.s %[f5], %[f13], %[f3] \n\t" + "mul.s %[f0], %[f14], %[f1] \n\t" + "mul.s %[f13], %[f13], %[f1] \n\t" + "mul.s %[f14], %[f14], %[f3] \n\t" + "add.s %[f4], %[f4], %[f2] \n\t" + "sub.s %[f10], %[f10], %[f9] \n\t" + "sub.s %[f7], %[f7], %[f8] \n\t" + "add.s %[f11], %[f11], %[f12] \n\t" + "sub.s %[f2], %[f5], %[f0] \n\t" + "add.s %[f13], %[f13], %[f14] \n\t" +#endif + "swc1 %[f4], 48(%[a_ptr]) \n\t" + "swc1 %[f10], 52(%[a_ptr]) \n\t" + "swc1 %[f7], 40(%[a_ptr]) \n\t" + "swc1 %[f11], 44(%[a_ptr]) \n\t" + "swc1 %[f2], 56(%[a_ptr]) \n\t" + "swc1 %[f13], 60(%[a_ptr]) \n\t" + "addiu %[count], %[count], -1 \n\t" + "lwc1 %[f9], 8(%[p1_rdft]) \n\t" + "addiu %[a_ptr], %[a_ptr], 64 \n\t" + "addiu %[p1_rdft], %[p1_rdft], 8 \n\t" + "addiu %[p2_rdft], %[p2_rdft], 16 \n\t" + "addiu %[first], %[first], 8 \n\t" + "bgtz %[count], 1b \n\t" + " addiu %[second], %[second], 8 \n\t" + ".set pop \n\t" + : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), + [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), + [f8] "=&f" (f8), [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11), + [f12] "=&f" (f12), [f13] "=&f" (f13), [f14] "=&f" (f14), + [a_ptr] "=&r" (a_ptr), [p1_rdft] "=&r" (p1_rdft), [first] "+r" (first), + [p2_rdft] "=&r" (p2_rdft), [count] "=&r" (count), [second] "+r" (second) + : [a] "r" (a), [rdft_w] "r" (rdft_w) + : "memory" + ); +} + +void cftmdl_128_mips(float* a) { + float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14; + int tmp_a, count; + __asm __volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "addiu %[tmp_a], %[a], 0 \n\t" + "addiu %[count], $zero, 4 \n\t" + "1: \n\t" + "addiu %[count], %[count], -1 \n\t" + "lwc1 %[f0], 0(%[tmp_a]) \n\t" + "lwc1 %[f2], 32(%[tmp_a]) \n\t" + "lwc1 %[f4], 64(%[tmp_a]) \n\t" + "lwc1 %[f6], 96(%[tmp_a]) \n\t" + "lwc1 %[f1], 4(%[tmp_a]) \n\t" + "lwc1 %[f3], 36(%[tmp_a]) \n\t" + "lwc1 %[f5], 68(%[tmp_a]) \n\t" + "lwc1 %[f7], 100(%[tmp_a]) \n\t" + "add.s %[f8], %[f0], %[f2] \n\t" + "sub.s %[f0], %[f0], %[f2] \n\t" + "add.s %[f2], %[f4], %[f6] \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "add.s %[f6], %[f1], %[f3] \n\t" + "sub.s %[f1], %[f1], %[f3] \n\t" + "add.s %[f3], %[f5], %[f7] \n\t" + "sub.s %[f5], %[f5], %[f7] \n\t" + "add.s %[f7], %[f8], %[f2] \n\t" + "sub.s %[f8], %[f8], %[f2] \n\t" + "add.s %[f2], %[f1], %[f4] \n\t" + "sub.s %[f1], %[f1], %[f4] \n\t" + "add.s %[f4], %[f6], %[f3] \n\t" + "sub.s %[f6], %[f6], %[f3] \n\t" + "sub.s %[f3], %[f0], %[f5] \n\t" + "add.s %[f0], %[f0], %[f5] \n\t" + "swc1 %[f7], 0(%[tmp_a]) \n\t" + "swc1 %[f8], 64(%[tmp_a]) \n\t" + "swc1 %[f2], 36(%[tmp_a]) \n\t" + "swc1 %[f1], 100(%[tmp_a]) \n\t" + "swc1 %[f4], 4(%[tmp_a]) \n\t" + "swc1 %[f6], 68(%[tmp_a]) \n\t" + "swc1 %[f3], 32(%[tmp_a]) \n\t" + "swc1 %[f0], 96(%[tmp_a]) \n\t" + "bgtz %[count], 1b \n\t" + " addiu %[tmp_a], %[tmp_a], 8 \n\t" + ".set pop \n\t" + : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), + [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), + [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count) + : [a] "r" (a) + : "memory" + ); + f9 = rdft_w[2]; + __asm __volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "addiu %[tmp_a], %[a], 128 \n\t" + "addiu %[count], $zero, 4 \n\t" + "1: \n\t" + "addiu %[count], %[count], -1 \n\t" + "lwc1 %[f0], 0(%[tmp_a]) \n\t" + "lwc1 %[f2], 32(%[tmp_a]) \n\t" + "lwc1 %[f5], 68(%[tmp_a]) \n\t" + "lwc1 %[f7], 100(%[tmp_a]) \n\t" + "lwc1 %[f1], 4(%[tmp_a]) \n\t" + "lwc1 %[f3], 36(%[tmp_a]) \n\t" + "lwc1 %[f4], 64(%[tmp_a]) \n\t" + "lwc1 %[f6], 96(%[tmp_a]) \n\t" + "sub.s %[f8], %[f0], %[f2] \n\t" + "add.s %[f0], %[f0], %[f2] \n\t" + "sub.s %[f2], %[f5], %[f7] \n\t" + "add.s %[f5], %[f5], %[f7] \n\t" + "sub.s %[f7], %[f1], %[f3] \n\t" + "add.s %[f1], %[f1], %[f3] \n\t" + "sub.s %[f3], %[f4], %[f6] \n\t" + "add.s %[f4], %[f4], %[f6] \n\t" + "sub.s %[f6], %[f8], %[f2] \n\t" + "add.s %[f8], %[f8], %[f2] \n\t" + "add.s %[f2], %[f5], %[f1] \n\t" + "sub.s %[f5], %[f5], %[f1] \n\t" + "add.s %[f1], %[f3], %[f7] \n\t" + "sub.s %[f3], %[f3], %[f7] \n\t" + "add.s %[f7], %[f0], %[f4] \n\t" + "sub.s %[f0], %[f0], %[f4] \n\t" + "sub.s %[f4], %[f6], %[f1] \n\t" + "add.s %[f6], %[f6], %[f1] \n\t" + "sub.s %[f1], %[f3], %[f8] \n\t" + "add.s %[f3], %[f3], %[f8] \n\t" + "mul.s %[f4], %[f4], %[f9] \n\t" + "mul.s %[f6], %[f6], %[f9] \n\t" + "mul.s %[f1], %[f1], %[f9] \n\t" + "mul.s %[f3], %[f3], %[f9] \n\t" + "swc1 %[f7], 0(%[tmp_a]) \n\t" + "swc1 %[f2], 4(%[tmp_a]) \n\t" + "swc1 %[f5], 64(%[tmp_a]) \n\t" + "swc1 %[f0], 68(%[tmp_a]) \n\t" + "swc1 %[f4], 32(%[tmp_a]) \n\t" + "swc1 %[f6], 36(%[tmp_a]) \n\t" + "swc1 %[f1], 96(%[tmp_a]) \n\t" + "swc1 %[f3], 100(%[tmp_a]) \n\t" + "bgtz %[count], 1b \n\t" + " addiu %[tmp_a], %[tmp_a], 8 \n\t" + ".set pop \n\t" + : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), + [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), + [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count) + : [a] "r" (a), [f9] "f" (f9) + : "memory" + ); + f10 = rdft_w[3]; + f11 = rdft_w[4]; + f12 = rdft_w[5]; + f13 = rdft_wk3ri_first[2]; + f14 = rdft_wk3ri_first[3]; + + __asm __volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "addiu %[tmp_a], %[a], 256 \n\t" + "addiu %[count], $zero, 4 \n\t" + "1: \n\t" + "addiu %[count], %[count], -1 \n\t" + "lwc1 %[f0], 0(%[tmp_a]) \n\t" + "lwc1 %[f2], 32(%[tmp_a]) \n\t" + "lwc1 %[f4], 64(%[tmp_a]) \n\t" + "lwc1 %[f6], 96(%[tmp_a]) \n\t" + "lwc1 %[f1], 4(%[tmp_a]) \n\t" + "lwc1 %[f3], 36(%[tmp_a]) \n\t" + "lwc1 %[f5], 68(%[tmp_a]) \n\t" + "lwc1 %[f7], 100(%[tmp_a]) \n\t" + "add.s %[f8], %[f0], %[f2] \n\t" + "sub.s %[f0], %[f0], %[f2] \n\t" + "add.s %[f2], %[f4], %[f6] \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "add.s %[f6], %[f1], %[f3] \n\t" + "sub.s %[f1], %[f1], %[f3] \n\t" + "add.s %[f3], %[f5], %[f7] \n\t" + "sub.s %[f5], %[f5], %[f7] \n\t" + "sub.s %[f7], %[f8], %[f2] \n\t" + "add.s %[f8], %[f8], %[f2] \n\t" + "add.s %[f2], %[f1], %[f4] \n\t" + "sub.s %[f1], %[f1], %[f4] \n\t" + "sub.s %[f4], %[f6], %[f3] \n\t" + "add.s %[f6], %[f6], %[f3] \n\t" + "sub.s %[f3], %[f0], %[f5] \n\t" + "add.s %[f0], %[f0], %[f5] \n\t" + "swc1 %[f8], 0(%[tmp_a]) \n\t" + "swc1 %[f6], 4(%[tmp_a]) \n\t" + "mul.s %[f5], %[f9], %[f7] \n\t" +#if defined(MIPS32_R2_LE) + "mul.s %[f7], %[f10], %[f7] \n\t" + "mul.s %[f8], %[f11], %[f3] \n\t" + "mul.s %[f3], %[f12], %[f3] \n\t" + "mul.s %[f6], %[f13], %[f0] \n\t" + "mul.s %[f0], %[f14], %[f0] \n\t" + "nmsub.s %[f5], %[f5], %[f10], %[f4] \n\t" + "madd.s %[f7], %[f7], %[f9], %[f4] \n\t" + "nmsub.s %[f8], %[f8], %[f12], %[f2] \n\t" + "madd.s %[f3], %[f3], %[f11], %[f2] \n\t" + "nmsub.s %[f6], %[f6], %[f14], %[f1] \n\t" + "madd.s %[f0], %[f0], %[f13], %[f1] \n\t" + "swc1 %[f5], 64(%[tmp_a]) \n\t" + "swc1 %[f7], 68(%[tmp_a]) \n\t" +#else + "mul.s %[f8], %[f10], %[f4] \n\t" + "mul.s %[f4], %[f9], %[f4] \n\t" + "mul.s %[f7], %[f10], %[f7] \n\t" + "mul.s %[f6], %[f11], %[f3] \n\t" + "mul.s %[f3], %[f12], %[f3] \n\t" + "sub.s %[f5], %[f5], %[f8] \n\t" + "mul.s %[f8], %[f12], %[f2] \n\t" + "mul.s %[f2], %[f11], %[f2] \n\t" + "add.s %[f7], %[f4], %[f7] \n\t" + "mul.s %[f4], %[f13], %[f0] \n\t" + "mul.s %[f0], %[f14], %[f0] \n\t" + "sub.s %[f8], %[f6], %[f8] \n\t" + "mul.s %[f6], %[f14], %[f1] \n\t" + "mul.s %[f1], %[f13], %[f1] \n\t" + "add.s %[f3], %[f2], %[f3] \n\t" + "swc1 %[f5], 64(%[tmp_a]) \n\t" + "swc1 %[f7], 68(%[tmp_a]) \n\t" + "sub.s %[f6], %[f4], %[f6] \n\t" + "add.s %[f0], %[f1], %[f0] \n\t" +#endif + "swc1 %[f8], 32(%[tmp_a]) \n\t" + "swc1 %[f3], 36(%[tmp_a]) \n\t" + "swc1 %[f6], 96(%[tmp_a]) \n\t" + "swc1 %[f0], 100(%[tmp_a]) \n\t" + "bgtz %[count], 1b \n\t" + " addiu %[tmp_a], %[tmp_a], 8 \n\t" + ".set pop \n\t" + : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), + [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), + [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count) + : [a] "r" (a), [f9] "f" (f9), [f10] "f" (f10), [f11] "f" (f11), + [f12] "f" (f12), [f13] "f" (f13), [f14] "f" (f14) + : "memory" + ); + f11 = rdft_w[6]; + f12 = rdft_w[7]; + f13 = rdft_wk3ri_second[2]; + f14 = rdft_wk3ri_second[3]; + __asm __volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "addiu %[tmp_a], %[a], 384 \n\t" + "addiu %[count], $zero, 4 \n\t" + "1: \n\t" + "addiu %[count], %[count], -1 \n\t" + "lwc1 %[f0], 0(%[tmp_a]) \n\t" + "lwc1 %[f1], 4(%[tmp_a]) \n\t" + "lwc1 %[f2], 32(%[tmp_a]) \n\t" + "lwc1 %[f3], 36(%[tmp_a]) \n\t" + "lwc1 %[f4], 64(%[tmp_a]) \n\t" + "lwc1 %[f5], 68(%[tmp_a]) \n\t" + "lwc1 %[f6], 96(%[tmp_a]) \n\t" + "lwc1 %[f7], 100(%[tmp_a]) \n\t" + "add.s %[f8], %[f0], %[f2] \n\t" + "sub.s %[f0], %[f0], %[f2] \n\t" + "add.s %[f2], %[f4], %[f6] \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "add.s %[f6], %[f1], %[f3] \n\t" + "sub.s %[f1], %[f1], %[f3] \n\t" + "add.s %[f3], %[f5], %[f7] \n\t" + "sub.s %[f5], %[f5], %[f7] \n\t" + "sub.s %[f7], %[f2], %[f8] \n\t" + "add.s %[f2], %[f2], %[f8] \n\t" + "add.s %[f8], %[f1], %[f4] \n\t" + "sub.s %[f1], %[f1], %[f4] \n\t" + "sub.s %[f4], %[f3], %[f6] \n\t" + "add.s %[f3], %[f3], %[f6] \n\t" + "sub.s %[f6], %[f0], %[f5] \n\t" + "add.s %[f0], %[f0], %[f5] \n\t" + "swc1 %[f2], 0(%[tmp_a]) \n\t" + "swc1 %[f3], 4(%[tmp_a]) \n\t" + "mul.s %[f5], %[f10], %[f7] \n\t" +#if defined(MIPS32_R2_LE) + "mul.s %[f7], %[f9], %[f7] \n\t" + "mul.s %[f2], %[f12], %[f8] \n\t" + "mul.s %[f8], %[f11], %[f8] \n\t" + "mul.s %[f3], %[f14], %[f1] \n\t" + "mul.s %[f1], %[f13], %[f1] \n\t" + "madd.s %[f5], %[f5], %[f9], %[f4] \n\t" + "msub.s %[f7], %[f7], %[f10], %[f4] \n\t" + "msub.s %[f2], %[f2], %[f11], %[f6] \n\t" + "madd.s %[f8], %[f8], %[f12], %[f6] \n\t" + "msub.s %[f3], %[f3], %[f13], %[f0] \n\t" + "madd.s %[f1], %[f1], %[f14], %[f0] \n\t" + "swc1 %[f5], 64(%[tmp_a]) \n\t" + "swc1 %[f7], 68(%[tmp_a]) \n\t" +#else + "mul.s %[f2], %[f9], %[f4] \n\t" + "mul.s %[f4], %[f10], %[f4] \n\t" + "mul.s %[f7], %[f9], %[f7] \n\t" + "mul.s %[f3], %[f11], %[f6] \n\t" + "mul.s %[f6], %[f12], %[f6] \n\t" + "add.s %[f5], %[f5], %[f2] \n\t" + "sub.s %[f7], %[f4], %[f7] \n\t" + "mul.s %[f2], %[f12], %[f8] \n\t" + "mul.s %[f8], %[f11], %[f8] \n\t" + "mul.s %[f4], %[f14], %[f1] \n\t" + "mul.s %[f1], %[f13], %[f1] \n\t" + "sub.s %[f2], %[f3], %[f2] \n\t" + "mul.s %[f3], %[f13], %[f0] \n\t" + "mul.s %[f0], %[f14], %[f0] \n\t" + "add.s %[f8], %[f8], %[f6] \n\t" + "swc1 %[f5], 64(%[tmp_a]) \n\t" + "swc1 %[f7], 68(%[tmp_a]) \n\t" + "sub.s %[f3], %[f3], %[f4] \n\t" + "add.s %[f1], %[f1], %[f0] \n\t" +#endif + "swc1 %[f2], 32(%[tmp_a]) \n\t" + "swc1 %[f8], 36(%[tmp_a]) \n\t" + "swc1 %[f3], 96(%[tmp_a]) \n\t" + "swc1 %[f1], 100(%[tmp_a]) \n\t" + "bgtz %[count], 1b \n\t" + " addiu %[tmp_a], %[tmp_a], 8 \n\t" + ".set pop \n\t" + : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), + [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), + [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count) + : [a] "r" (a), [f9] "f" (f9), [f10] "f" (f10), [f11] "f" (f11), + [f12] "f" (f12), [f13] "f" (f13), [f14] "f" (f14) + : "memory" + ); +} + +void cftfsub_128_mips(float* a) { + float f0, f1, f2, f3, f4, f5, f6, f7, f8; + int tmp_a, count; + + cft1st_128_mips(a); + cftmdl_128_mips(a); + + __asm __volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "addiu %[tmp_a], %[a], 0 \n\t" + "addiu %[count], $zero, 16 \n\t" + "1: \n\t" + "addiu %[count], %[count], -1 \n\t" + "lwc1 %[f0], 0(%[tmp_a]) \n\t" + "lwc1 %[f2], 128(%[tmp_a]) \n\t" + "lwc1 %[f4], 256(%[tmp_a]) \n\t" + "lwc1 %[f6], 384(%[tmp_a]) \n\t" + "lwc1 %[f1], 4(%[tmp_a]) \n\t" + "lwc1 %[f3], 132(%[tmp_a]) \n\t" + "lwc1 %[f5], 260(%[tmp_a]) \n\t" + "lwc1 %[f7], 388(%[tmp_a]) \n\t" + "add.s %[f8], %[f0], %[f2] \n\t" + "sub.s %[f0], %[f0], %[f2] \n\t" + "add.s %[f2], %[f4], %[f6] \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "add.s %[f6], %[f1], %[f3] \n\t" + "sub.s %[f1], %[f1], %[f3] \n\t" + "add.s %[f3], %[f5], %[f7] \n\t" + "sub.s %[f5], %[f5], %[f7] \n\t" + "add.s %[f7], %[f8], %[f2] \n\t" + "sub.s %[f8], %[f8], %[f2] \n\t" + "add.s %[f2], %[f1], %[f4] \n\t" + "sub.s %[f1], %[f1], %[f4] \n\t" + "add.s %[f4], %[f6], %[f3] \n\t" + "sub.s %[f6], %[f6], %[f3] \n\t" + "sub.s %[f3], %[f0], %[f5] \n\t" + "add.s %[f0], %[f0], %[f5] \n\t" + "swc1 %[f7], 0(%[tmp_a]) \n\t" + "swc1 %[f8], 256(%[tmp_a]) \n\t" + "swc1 %[f2], 132(%[tmp_a]) \n\t" + "swc1 %[f1], 388(%[tmp_a]) \n\t" + "swc1 %[f4], 4(%[tmp_a]) \n\t" + "swc1 %[f6], 260(%[tmp_a]) \n\t" + "swc1 %[f3], 128(%[tmp_a]) \n\t" + "swc1 %[f0], 384(%[tmp_a]) \n\t" + "bgtz %[count], 1b \n\t" + " addiu %[tmp_a], %[tmp_a], 8 \n\t" + ".set pop \n\t" + : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), + [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), + [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), + [count] "=&r" (count) + : [a] "r" (a) + : "memory" + ); +} + +void cftbsub_128_mips(float* a) { + float f0, f1, f2, f3, f4, f5, f6, f7, f8; + int tmp_a, count; + + cft1st_128_mips(a); + cftmdl_128_mips(a); + + __asm __volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "addiu %[tmp_a], %[a], 0 \n\t" + "addiu %[count], $zero, 16 \n\t" + "1: \n\t" + "addiu %[count], %[count], -1 \n\t" + "lwc1 %[f0], 0(%[tmp_a]) \n\t" + "lwc1 %[f2], 128(%[tmp_a]) \n\t" + "lwc1 %[f4], 256(%[tmp_a]) \n\t" + "lwc1 %[f6], 384(%[tmp_a]) \n\t" + "lwc1 %[f1], 4(%[tmp_a]) \n\t" + "lwc1 %[f3], 132(%[tmp_a]) \n\t" + "lwc1 %[f5], 260(%[tmp_a]) \n\t" + "lwc1 %[f7], 388(%[tmp_a]) \n\t" + "add.s %[f8], %[f0], %[f2] \n\t" + "sub.s %[f0], %[f0], %[f2] \n\t" + "add.s %[f2], %[f4], %[f6] \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "add.s %[f6], %[f1], %[f3] \n\t" + "sub.s %[f1], %[f3], %[f1] \n\t" + "add.s %[f3], %[f5], %[f7] \n\t" + "sub.s %[f5], %[f5], %[f7] \n\t" + "add.s %[f7], %[f8], %[f2] \n\t" + "sub.s %[f8], %[f8], %[f2] \n\t" + "sub.s %[f2], %[f1], %[f4] \n\t" + "add.s %[f1], %[f1], %[f4] \n\t" + "add.s %[f4], %[f3], %[f6] \n\t" + "sub.s %[f6], %[f3], %[f6] \n\t" + "sub.s %[f3], %[f0], %[f5] \n\t" + "add.s %[f0], %[f0], %[f5] \n\t" + "neg.s %[f4], %[f4] \n\t" + "swc1 %[f7], 0(%[tmp_a]) \n\t" + "swc1 %[f8], 256(%[tmp_a]) \n\t" + "swc1 %[f2], 132(%[tmp_a]) \n\t" + "swc1 %[f1], 388(%[tmp_a]) \n\t" + "swc1 %[f6], 260(%[tmp_a]) \n\t" + "swc1 %[f3], 128(%[tmp_a]) \n\t" + "swc1 %[f0], 384(%[tmp_a]) \n\t" + "swc1 %[f4], 4(%[tmp_a]) \n\t" + "bgtz %[count], 1b \n\t" + " addiu %[tmp_a], %[tmp_a], 8 \n\t" + ".set pop \n\t" + : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), + [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), + [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count) + : [a] "r" (a) + : "memory" + ); +} + +void rftfsub_128_mips(float* a) { + const float* c = rdft_w + 32; + const float f0 = 0.5f; + float* a1 = &a[2]; + float* a2 = &a[126]; + const float* c1 = &c[1]; + const float* c2 = &c[31]; + float f1, f2, f3 ,f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15; + int count; + + __asm __volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "lwc1 %[f6], 0(%[c2]) \n\t" + "lwc1 %[f1], 0(%[a1]) \n\t" + "lwc1 %[f2], 0(%[a2]) \n\t" + "lwc1 %[f3], 4(%[a1]) \n\t" + "lwc1 %[f4], 4(%[a2]) \n\t" + "lwc1 %[f5], 0(%[c1]) \n\t" + "sub.s %[f6], %[f0], %[f6] \n\t" + "sub.s %[f7], %[f1], %[f2] \n\t" + "add.s %[f8], %[f3], %[f4] \n\t" + "addiu %[count], $zero, 15 \n\t" + "mul.s %[f9], %[f6], %[f7] \n\t" + "mul.s %[f6], %[f6], %[f8] \n\t" +#if !defined(MIPS32_R2_LE) + "mul.s %[f8], %[f5], %[f8] \n\t" + "mul.s %[f5], %[f5], %[f7] \n\t" + "sub.s %[f9], %[f9], %[f8] \n\t" + "add.s %[f6], %[f6], %[f5] \n\t" +#else + "nmsub.s %[f9], %[f9], %[f5], %[f8] \n\t" + "madd.s %[f6], %[f6], %[f5], %[f7] \n\t" +#endif + "sub.s %[f1], %[f1], %[f9] \n\t" + "add.s %[f2], %[f2], %[f9] \n\t" + "sub.s %[f3], %[f3], %[f6] \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "swc1 %[f1], 0(%[a1]) \n\t" + "swc1 %[f2], 0(%[a2]) \n\t" + "swc1 %[f3], 4(%[a1]) \n\t" + "swc1 %[f4], 4(%[a2]) \n\t" + "addiu %[a1], %[a1], 8 \n\t" + "addiu %[a2], %[a2], -8 \n\t" + "addiu %[c1], %[c1], 4 \n\t" + "addiu %[c2], %[c2], -4 \n\t" + "1: \n\t" + "lwc1 %[f6], 0(%[c2]) \n\t" + "lwc1 %[f1], 0(%[a1]) \n\t" + "lwc1 %[f2], 0(%[a2]) \n\t" + "lwc1 %[f3], 4(%[a1]) \n\t" + "lwc1 %[f4], 4(%[a2]) \n\t" + "lwc1 %[f5], 0(%[c1]) \n\t" + "sub.s %[f6], %[f0], %[f6] \n\t" + "sub.s %[f7], %[f1], %[f2] \n\t" + "add.s %[f8], %[f3], %[f4] \n\t" + "lwc1 %[f10], -4(%[c2]) \n\t" + "lwc1 %[f11], 8(%[a1]) \n\t" + "lwc1 %[f12], -8(%[a2]) \n\t" + "mul.s %[f9], %[f6], %[f7] \n\t" + "mul.s %[f6], %[f6], %[f8] \n\t" +#if !defined(MIPS32_R2_LE) + "mul.s %[f8], %[f5], %[f8] \n\t" + "mul.s %[f5], %[f5], %[f7] \n\t" + "lwc1 %[f13], 12(%[a1]) \n\t" + "lwc1 %[f14], -4(%[a2]) \n\t" + "lwc1 %[f15], 4(%[c1]) \n\t" + "sub.s %[f9], %[f9], %[f8] \n\t" + "add.s %[f6], %[f6], %[f5] \n\t" +#else + "lwc1 %[f13], 12(%[a1]) \n\t" + "lwc1 %[f14], -4(%[a2]) \n\t" + "lwc1 %[f15], 4(%[c1]) \n\t" + "nmsub.s %[f9], %[f9], %[f5], %[f8] \n\t" + "madd.s %[f6], %[f6], %[f5], %[f7] \n\t" +#endif + "sub.s %[f10], %[f0], %[f10] \n\t" + "sub.s %[f5], %[f11], %[f12] \n\t" + "add.s %[f7], %[f13], %[f14] \n\t" + "sub.s %[f1], %[f1], %[f9] \n\t" + "add.s %[f2], %[f2], %[f9] \n\t" + "sub.s %[f3], %[f3], %[f6] \n\t" + "mul.s %[f8], %[f10], %[f5] \n\t" + "mul.s %[f10], %[f10], %[f7] \n\t" +#if !defined(MIPS32_R2_LE) + "mul.s %[f9], %[f15], %[f7] \n\t" + "mul.s %[f15], %[f15], %[f5] \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "swc1 %[f1], 0(%[a1]) \n\t" + "swc1 %[f2], 0(%[a2]) \n\t" + "sub.s %[f8], %[f8], %[f9] \n\t" + "add.s %[f10], %[f10], %[f15] \n\t" +#else + "swc1 %[f1], 0(%[a1]) \n\t" + "swc1 %[f2], 0(%[a2]) \n\t" + "sub.s %[f4], %[f4], %[f6] \n\t" + "nmsub.s %[f8], %[f8], %[f15], %[f7] \n\t" + "madd.s %[f10], %[f10], %[f15], %[f5] \n\t" +#endif + "swc1 %[f3], 4(%[a1]) \n\t" + "swc1 %[f4], 4(%[a2]) \n\t" + "sub.s %[f11], %[f11], %[f8] \n\t" + "add.s %[f12], %[f12], %[f8] \n\t" + "sub.s %[f13], %[f13], %[f10] \n\t" + "sub.s %[f14], %[f14], %[f10] \n\t" + "addiu %[c2], %[c2], -8 \n\t" + "addiu %[c1], %[c1], 8 \n\t" + "swc1 %[f11], 8(%[a1]) \n\t" + "swc1 %[f12], -8(%[a2]) \n\t" + "swc1 %[f13], 12(%[a1]) \n\t" + "swc1 %[f14], -4(%[a2]) \n\t" + "addiu %[a1], %[a1], 16 \n\t" + "addiu %[count], %[count], -1 \n\t" + "bgtz %[count], 1b \n\t" + " addiu %[a2], %[a2], -16 \n\t" + ".set pop \n\t" + : [a1] "+r" (a1), [a2] "+r" (a2), [c1] "+r" (c1), [c2] "+r" (c2), + [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), [f4] "=&f" (f4), + [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8), + [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11), [f12] "=&f" (f12), + [f13] "=&f" (f13), [f14] "=&f" (f14), [f15] "=&f" (f15), + [count] "=&r" (count) + : [f0] "f" (f0) + : "memory" + ); +} + +void rftbsub_128_mips(float* a) { + const float *c = rdft_w + 32; + const float f0 = 0.5f; + float* a1 = &a[2]; + float* a2 = &a[126]; + const float* c1 = &c[1]; + const float* c2 = &c[31]; + float f1, f2, f3 ,f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15; + int count; + + a[1] = -a[1]; + a[65] = -a[65]; + + __asm __volatile ( + ".set push \n\t" + ".set noreorder \n\t" + "lwc1 %[f6], 0(%[c2]) \n\t" + "lwc1 %[f1], 0(%[a1]) \n\t" + "lwc1 %[f2], 0(%[a2]) \n\t" + "lwc1 %[f3], 4(%[a1]) \n\t" + "lwc1 %[f4], 4(%[a2]) \n\t" + "lwc1 %[f5], 0(%[c1]) \n\t" + "sub.s %[f6], %[f0], %[f6] \n\t" + "sub.s %[f7], %[f1], %[f2] \n\t" + "add.s %[f8], %[f3], %[f4] \n\t" + "addiu %[count], $zero, 15 \n\t" + "mul.s %[f9], %[f6], %[f7] \n\t" + "mul.s %[f6], %[f6], %[f8] \n\t" +#if !defined(MIPS32_R2_LE) + "mul.s %[f8], %[f5], %[f8] \n\t" + "mul.s %[f5], %[f5], %[f7] \n\t" + "add.s %[f9], %[f9], %[f8] \n\t" + "sub.s %[f6], %[f6], %[f5] \n\t" +#else + "madd.s %[f9], %[f9], %[f5], %[f8] \n\t" + "nmsub.s %[f6], %[f6], %[f5], %[f7] \n\t" +#endif + "sub.s %[f1], %[f1], %[f9] \n\t" + "add.s %[f2], %[f2], %[f9] \n\t" + "sub.s %[f3], %[f6], %[f3] \n\t" + "sub.s %[f4], %[f6], %[f4] \n\t" + "swc1 %[f1], 0(%[a1]) \n\t" + "swc1 %[f2], 0(%[a2]) \n\t" + "swc1 %[f3], 4(%[a1]) \n\t" + "swc1 %[f4], 4(%[a2]) \n\t" + "addiu %[a1], %[a1], 8 \n\t" + "addiu %[a2], %[a2], -8 \n\t" + "addiu %[c1], %[c1], 4 \n\t" + "addiu %[c2], %[c2], -4 \n\t" + "1: \n\t" + "lwc1 %[f6], 0(%[c2]) \n\t" + "lwc1 %[f1], 0(%[a1]) \n\t" + "lwc1 %[f2], 0(%[a2]) \n\t" + "lwc1 %[f3], 4(%[a1]) \n\t" + "lwc1 %[f4], 4(%[a2]) \n\t" + "lwc1 %[f5], 0(%[c1]) \n\t" + "sub.s %[f6], %[f0], %[f6] \n\t" + "sub.s %[f7], %[f1], %[f2] \n\t" + "add.s %[f8], %[f3], %[f4] \n\t" + "lwc1 %[f10], -4(%[c2]) \n\t" + "lwc1 %[f11], 8(%[a1]) \n\t" + "lwc1 %[f12], -8(%[a2]) \n\t" + "mul.s %[f9], %[f6], %[f7] \n\t" + "mul.s %[f6], %[f6], %[f8] \n\t" +#if !defined(MIPS32_R2_LE) + "mul.s %[f8], %[f5], %[f8] \n\t" + "mul.s %[f5], %[f5], %[f7] \n\t" + "lwc1 %[f13], 12(%[a1]) \n\t" + "lwc1 %[f14], -4(%[a2]) \n\t" + "lwc1 %[f15], 4(%[c1]) \n\t" + "add.s %[f9], %[f9], %[f8] \n\t" + "sub.s %[f6], %[f6], %[f5] \n\t" +#else + "lwc1 %[f13], 12(%[a1]) \n\t" + "lwc1 %[f14], -4(%[a2]) \n\t" + "lwc1 %[f15], 4(%[c1]) \n\t" + "madd.s %[f9], %[f9], %[f5], %[f8] \n\t" + "nmsub.s %[f6], %[f6], %[f5], %[f7] \n\t" +#endif + "sub.s %[f10], %[f0], %[f10] \n\t" + "sub.s %[f5], %[f11], %[f12] \n\t" + "add.s %[f7], %[f13], %[f14] \n\t" + "sub.s %[f1], %[f1], %[f9] \n\t" + "add.s %[f2], %[f2], %[f9] \n\t" + "sub.s %[f3], %[f6], %[f3] \n\t" + "mul.s %[f8], %[f10], %[f5] \n\t" + "mul.s %[f10], %[f10], %[f7] \n\t" +#if !defined(MIPS32_R2_LE) + "mul.s %[f9], %[f15], %[f7] \n\t" + "mul.s %[f15], %[f15], %[f5] \n\t" + "sub.s %[f4], %[f6], %[f4] \n\t" + "swc1 %[f1], 0(%[a1]) \n\t" + "swc1 %[f2], 0(%[a2]) \n\t" + "add.s %[f8], %[f8], %[f9] \n\t" + "sub.s %[f10], %[f10], %[f15] \n\t" +#else + "swc1 %[f1], 0(%[a1]) \n\t" + "swc1 %[f2], 0(%[a2]) \n\t" + "sub.s %[f4], %[f6], %[f4] \n\t" + "madd.s %[f8], %[f8], %[f15], %[f7] \n\t" + "nmsub.s %[f10], %[f10], %[f15], %[f5] \n\t" +#endif + "swc1 %[f3], 4(%[a1]) \n\t" + "swc1 %[f4], 4(%[a2]) \n\t" + "sub.s %[f11], %[f11], %[f8] \n\t" + "add.s %[f12], %[f12], %[f8] \n\t" + "sub.s %[f13], %[f10], %[f13] \n\t" + "sub.s %[f14], %[f10], %[f14] \n\t" + "addiu %[c2], %[c2], -8 \n\t" + "addiu %[c1], %[c1], 8 \n\t" + "swc1 %[f11], 8(%[a1]) \n\t" + "swc1 %[f12], -8(%[a2]) \n\t" + "swc1 %[f13], 12(%[a1]) \n\t" + "swc1 %[f14], -4(%[a2]) \n\t" + "addiu %[a1], %[a1], 16 \n\t" + "addiu %[count], %[count], -1 \n\t" + "bgtz %[count], 1b \n\t" + " addiu %[a2], %[a2], -16 \n\t" + ".set pop \n\t" + : [a1] "+r" (a1), [a2] "+r" (a2), [c1] "+r" (c1), [c2] "+r" (c2), + [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), [f4] "=&f" (f4), + [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8), + [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11), [f12] "=&f" (f12), + [f13] "=&f" (f13), [f14] "=&f" (f14), [f15] "=&f" (f15), + [count] "=&r" (count) + : [f0] "f" (f0) + : "memory" + ); +} +#endif + +} // namespace webrtc diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_neon.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_neon.cc new file mode 100644 index 0000000000..401387a643 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_neon.cc @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * The rdft AEC algorithm, neon version of speed-critical functions. + * + * Based on the sse2 version. + */ + +#include "modules/audio_processing/utility/ooura_fft.h" + +#include <arm_neon.h> + +#include "modules/audio_processing/utility/ooura_fft_tables_common.h" +#include "modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h" + +namespace webrtc { + +#if defined(WEBRTC_HAS_NEON) +void cft1st_128_neon(float* a) { + const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); + int j, k2; + + for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { + float32x4_t a00v = vld1q_f32(&a[j + 0]); + float32x4_t a04v = vld1q_f32(&a[j + 4]); + float32x4_t a08v = vld1q_f32(&a[j + 8]); + float32x4_t a12v = vld1q_f32(&a[j + 12]); + float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v)); + float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v)); + float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v)); + float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v)); + const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]); + const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]); + const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]); + const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]); + const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]); + const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]); + float32x4_t x0v = vaddq_f32(a01v, a23v); + const float32x4_t x1v = vsubq_f32(a01v, a23v); + const float32x4_t x2v = vaddq_f32(a45v, a67v); + const float32x4_t x3v = vsubq_f32(a45v, a67v); + const float32x4_t x3w = vrev64q_f32(x3v); + float32x4_t x0w; + a01v = vaddq_f32(x0v, x2v); + x0v = vsubq_f32(x0v, x2v); + x0w = vrev64q_f32(x0v); + a45v = vmulq_f32(wk2rv, x0v); + a45v = vmlaq_f32(a45v, wk2iv, x0w); + x0v = vmlaq_f32(x1v, x3w, vec_swap_sign); + x0w = vrev64q_f32(x0v); + a23v = vmulq_f32(wk1rv, x0v); + a23v = vmlaq_f32(a23v, wk1iv, x0w); + x0v = vmlsq_f32(x1v, x3w, vec_swap_sign); + x0w = vrev64q_f32(x0v); + a67v = vmulq_f32(wk3rv, x0v); + a67v = vmlaq_f32(a67v, wk3iv, x0w); + a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v)); + a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v)); + a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v)); + a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v)); + vst1q_f32(&a[j + 0], a00v); + vst1q_f32(&a[j + 4], a04v); + vst1q_f32(&a[j + 8], a08v); + vst1q_f32(&a[j + 12], a12v); + } +} + +void cftmdl_128_neon(float* a) { + int j; + const int l = 8; + const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); + float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r); + + for (j = 0; j < l; j += 2) { + const float32x2_t a_00 = vld1_f32(&a[j + 0]); + const float32x2_t a_08 = vld1_f32(&a[j + 8]); + const float32x2_t a_32 = vld1_f32(&a[j + 32]); + const float32x2_t a_40 = vld1_f32(&a[j + 40]); + const float32x4_t a_00_32 = vcombine_f32(a_00, a_32); + const float32x4_t a_08_40 = vcombine_f32(a_08, a_40); + const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40); + const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40); + const float32x2_t a_16 = vld1_f32(&a[j + 16]); + const float32x2_t a_24 = vld1_f32(&a[j + 24]); + const float32x2_t a_48 = vld1_f32(&a[j + 48]); + const float32x2_t a_56 = vld1_f32(&a[j + 56]); + const float32x4_t a_16_48 = vcombine_f32(a_16, a_48); + const float32x4_t a_24_56 = vcombine_f32(a_24, a_56); + const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56); + const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56); + const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1); + const float32x4_t x1_x3_add = + vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); + const float32x4_t x1_x3_sub = + vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); + const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0); + const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0); + const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s); + const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1); + const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1); + const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s); + const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as); + const float32x4_t yy4 = vmulq_f32(wk1rv, yy0); + const float32x4_t xx1_rev = vrev64q_f32(xx1); + const float32x4_t yy4_rev = vrev64q_f32(yy4); + + vst1_f32(&a[j + 0], vget_low_f32(xx0)); + vst1_f32(&a[j + 32], vget_high_f32(xx0)); + vst1_f32(&a[j + 16], vget_low_f32(xx1)); + vst1_f32(&a[j + 48], vget_high_f32(xx1_rev)); + + a[j + 48] = -a[j + 48]; + + vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add)); + vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub)); + vst1_f32(&a[j + 40], vget_low_f32(yy4)); + vst1_f32(&a[j + 56], vget_high_f32(yy4_rev)); + } + + { + const int k = 64; + const int k1 = 2; + const int k2 = 2 * k1; + const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]); + const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]); + const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]); + const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]); + const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]); + wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]); + for (j = k; j < l + k; j += 2) { + const float32x2_t a_00 = vld1_f32(&a[j + 0]); + const float32x2_t a_08 = vld1_f32(&a[j + 8]); + const float32x2_t a_32 = vld1_f32(&a[j + 32]); + const float32x2_t a_40 = vld1_f32(&a[j + 40]); + const float32x4_t a_00_32 = vcombine_f32(a_00, a_32); + const float32x4_t a_08_40 = vcombine_f32(a_08, a_40); + const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40); + const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40); + const float32x2_t a_16 = vld1_f32(&a[j + 16]); + const float32x2_t a_24 = vld1_f32(&a[j + 24]); + const float32x2_t a_48 = vld1_f32(&a[j + 48]); + const float32x2_t a_56 = vld1_f32(&a[j + 56]); + const float32x4_t a_16_48 = vcombine_f32(a_16, a_48); + const float32x4_t a_24_56 = vcombine_f32(a_24, a_56); + const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56); + const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56); + const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1); + const float32x4_t x1_x3_add = + vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); + const float32x4_t x1_x3_sub = + vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); + float32x4_t xx4 = vmulq_f32(wk2rv, xx1); + float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add); + float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub); + xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1)); + xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add)); + xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub)); + + vst1_f32(&a[j + 0], vget_low_f32(xx)); + vst1_f32(&a[j + 32], vget_high_f32(xx)); + vst1_f32(&a[j + 16], vget_low_f32(xx4)); + vst1_f32(&a[j + 48], vget_high_f32(xx4)); + vst1_f32(&a[j + 8], vget_low_f32(xx12)); + vst1_f32(&a[j + 40], vget_high_f32(xx12)); + vst1_f32(&a[j + 24], vget_low_f32(xx22)); + vst1_f32(&a[j + 56], vget_high_f32(xx22)); + } + } +} + +__inline static float32x4_t reverse_order_f32x4(float32x4_t in) { + // A B C D -> C D A B + const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in)); + // C D A B -> D C B A + return vrev64q_f32(rev); +} + +void rftfsub_128_neon(float* a) { + const float* c = rdft_w + 32; + int j1, j2; + const float32x4_t mm_half = vdupq_n_f32(0.5f); + + // Vectorized code (four at once). + // Note: commented number are indexes for the first iteration of the loop. + for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { + // Load 'wk'. + const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4, + const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31, + const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31, + const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28, + const float32x4_t wki_ = c_j1; // 1, 2, 3, 4, + // Load and shuffle 'a'. + // 2, 4, 6, 8, 3, 5, 7, 9 + float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]); + // 120, 122, 124, 126, 121, 123, 125, 127, + const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]); + // 126, 124, 122, 120 + const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]); + // 127, 125, 123, 121 + const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]); + // Calculate 'x'. + const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0); + // 2-126, 4-124, 6-122, 8-120, + const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1); + // 3-127, 5-125, 7-123, 9-121, + // Calculate product into 'y'. + // yr = wkr * xr - wki * xi; + // yi = wkr * xi + wki * xr; + const float32x4_t a_ = vmulq_f32(wkr_, xr_); + const float32x4_t b_ = vmulq_f32(wki_, xi_); + const float32x4_t c_ = vmulq_f32(wkr_, xi_); + const float32x4_t d_ = vmulq_f32(wki_, xr_); + const float32x4_t yr_ = vsubq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120, + const float32x4_t yi_ = vaddq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121, + // Update 'a'. + // a[j2 + 0] -= yr; + // a[j2 + 1] -= yi; + // a[k2 + 0] += yr; + // a[k2 + 1] -= yi; + // 126, 124, 122, 120, + const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_); + // 127, 125, 123, 121, + const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_); + // Shuffle in right order and store. + const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n); + const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n); + // 124, 125, 126, 127, 120, 121, 122, 123 + const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr); + // 2, 4, 6, 8, + a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_); + // 3, 5, 7, 9, + a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_); + // 2, 3, 4, 5, 6, 7, 8, 9, + vst2q_f32(&a[0 + j2], a_j2_p); + + vst1q_f32(&a[122 - j2], a_k2_n.val[1]); + vst1q_f32(&a[126 - j2], a_k2_n.val[0]); + } + + // Scalar code for the remaining items. + for (; j2 < 64; j1 += 1, j2 += 2) { + const int k2 = 128 - j2; + const int k1 = 32 - j1; + const float wkr = 0.5f - c[k1]; + const float wki = c[j1]; + const float xr = a[j2 + 0] - a[k2 + 0]; + const float xi = a[j2 + 1] + a[k2 + 1]; + const float yr = wkr * xr - wki * xi; + const float yi = wkr * xi + wki * xr; + a[j2 + 0] -= yr; + a[j2 + 1] -= yi; + a[k2 + 0] += yr; + a[k2 + 1] -= yi; + } +} + +void rftbsub_128_neon(float* a) { + const float* c = rdft_w + 32; + int j1, j2; + const float32x4_t mm_half = vdupq_n_f32(0.5f); + + a[1] = -a[1]; + // Vectorized code (four at once). + // Note: commented number are indexes for the first iteration of the loop. + for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { + // Load 'wk'. + const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4, + const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31, + const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31, + const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28, + const float32x4_t wki_ = c_j1; // 1, 2, 3, 4, + // Load and shuffle 'a'. + // 2, 4, 6, 8, 3, 5, 7, 9 + float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]); + // 120, 122, 124, 126, 121, 123, 125, 127, + const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]); + // 126, 124, 122, 120 + const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]); + // 127, 125, 123, 121 + const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]); + // Calculate 'x'. + const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0); + // 2-126, 4-124, 6-122, 8-120, + const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1); + // 3-127, 5-125, 7-123, 9-121, + // Calculate product into 'y'. + // yr = wkr * xr - wki * xi; + // yi = wkr * xi + wki * xr; + const float32x4_t a_ = vmulq_f32(wkr_, xr_); + const float32x4_t b_ = vmulq_f32(wki_, xi_); + const float32x4_t c_ = vmulq_f32(wkr_, xi_); + const float32x4_t d_ = vmulq_f32(wki_, xr_); + const float32x4_t yr_ = vaddq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120, + const float32x4_t yi_ = vsubq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121, + // Update 'a'. + // a[j2 + 0] -= yr; + // a[j2 + 1] -= yi; + // a[k2 + 0] += yr; + // a[k2 + 1] -= yi; + // 126, 124, 122, 120, + const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_); + // 127, 125, 123, 121, + const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1); + // Shuffle in right order and store. + // 2, 3, 4, 5, 6, 7, 8, 9, + const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n); + const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n); + // 124, 125, 126, 127, 120, 121, 122, 123 + const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr); + // 2, 4, 6, 8, + a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_); + // 3, 5, 7, 9, + a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]); + // 2, 3, 4, 5, 6, 7, 8, 9, + vst2q_f32(&a[0 + j2], a_j2_p); + + vst1q_f32(&a[122 - j2], a_k2_n.val[1]); + vst1q_f32(&a[126 - j2], a_k2_n.val[0]); + } + + // Scalar code for the remaining items. + for (; j2 < 64; j1 += 1, j2 += 2) { + const int k2 = 128 - j2; + const int k1 = 32 - j1; + const float wkr = 0.5f - c[k1]; + const float wki = c[j1]; + const float xr = a[j2 + 0] - a[k2 + 0]; + const float xi = a[j2 + 1] + a[k2 + 1]; + const float yr = wkr * xr + wki * xi; + const float yi = wkr * xi - wki * xr; + a[j2 + 0] = a[j2 + 0] - yr; + a[j2 + 1] = yi - a[j2 + 1]; + a[k2 + 0] = yr + a[k2 + 0]; + a[k2 + 1] = yi - a[k2 + 1]; + } + a[65] = -a[65]; +} +#endif + +} // namespace webrtc diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_sse2.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_sse2.cc new file mode 100644 index 0000000000..48a05c3bc2 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_sse2.cc @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing//utility/ooura_fft.h" + +#include <emmintrin.h> + +#include "modules/audio_processing/utility/ooura_fft_tables_common.h" +#include "modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h" + +namespace webrtc { + +#if defined(WEBRTC_ARCH_X86_FAMILY) + +namespace { +// These intrinsics were unavailable before VS 2008. +// TODO(andrew): move to a common file. +#if defined(_MSC_VER) && _MSC_VER < 1500 +static __inline __m128 _mm_castsi128_ps(__m128i a) { + return *(__m128*)&a; +} +static __inline __m128i _mm_castps_si128(__m128 a) { + return *(__m128i*)&a; +} +#endif + +} // namespace + +void cft1st_128_SSE2(float* a) { + const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); + int j, k2; + + for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { + __m128 a00v = _mm_loadu_ps(&a[j + 0]); + __m128 a04v = _mm_loadu_ps(&a[j + 4]); + __m128 a08v = _mm_loadu_ps(&a[j + 8]); + __m128 a12v = _mm_loadu_ps(&a[j + 12]); + __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0)); + __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2)); + __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0)); + __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2)); + + const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]); + const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]); + const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]); + const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]); + const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]); + const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]); + __m128 x0v = _mm_add_ps(a01v, a23v); + const __m128 x1v = _mm_sub_ps(a01v, a23v); + const __m128 x2v = _mm_add_ps(a45v, a67v); + const __m128 x3v = _mm_sub_ps(a45v, a67v); + __m128 x0w; + a01v = _mm_add_ps(x0v, x2v); + x0v = _mm_sub_ps(x0v, x2v); + x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1)); + { + const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v); + const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w); + a45v = _mm_add_ps(a45_0v, a45_1v); + } + { + __m128 a23_0v, a23_1v; + const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1)); + const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w); + x0v = _mm_add_ps(x1v, x3s); + x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1)); + a23_0v = _mm_mul_ps(wk1rv, x0v); + a23_1v = _mm_mul_ps(wk1iv, x0w); + a23v = _mm_add_ps(a23_0v, a23_1v); + + x0v = _mm_sub_ps(x1v, x3s); + x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1)); + } + { + const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v); + const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w); + a67v = _mm_add_ps(a67_0v, a67_1v); + } + + a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0)); + a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0)); + a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2)); + a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2)); + _mm_storeu_ps(&a[j + 0], a00v); + _mm_storeu_ps(&a[j + 4], a04v); + _mm_storeu_ps(&a[j + 8], a08v); + _mm_storeu_ps(&a[j + 12], a12v); + } +} + +void cftmdl_128_SSE2(float* a) { + const int l = 8; + const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); + int j0; + + __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); + for (j0 = 0; j0 < l; j0 += 2) { + const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); + const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); + const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); + const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); + const __m128 a_00_32 = + _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 a_08_40 = + _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), + _MM_SHUFFLE(1, 0, 1, 0)); + __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); + const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); + + const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); + const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); + const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); + const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); + const __m128 a_16_48 = + _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 a_24_56 = + _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); + const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); + + const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + + const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( + _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); + const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); + const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + + const __m128 yy0 = + _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2)); + const __m128 yy1 = + _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3)); + const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1); + const __m128 yy3 = _mm_add_ps(yy0, yy2); + const __m128 yy4 = _mm_mul_ps(wk1rv, yy3); + + _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 32], + _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2))); + + _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 48], + _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3))); + a[j0 + 48] = -a[j0 + 48]; + + _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add)); + _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub)); + + _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 56], + _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3))); + } + + { + int k = 64; + int k1 = 2; + int k2 = 2 * k1; + const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]); + const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); + const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); + const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); + const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); + wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); + for (j0 = k; j0 < l + k; j0 += 2) { + const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); + const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); + const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); + const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); + const __m128 a_00_32 = + _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 a_08_40 = + _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), + _MM_SHUFFLE(1, 0, 1, 0)); + __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); + const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); + + const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); + const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); + const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); + const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); + const __m128 a_16_48 = + _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 a_24_56 = + _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), + _MM_SHUFFLE(1, 0, 1, 0)); + const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); + const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); + + const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); + const __m128 xx3 = _mm_mul_ps( + wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1), + _MM_SHUFFLE(2, 3, 0, 1)))); + const __m128 xx4 = _mm_add_ps(xx2, xx3); + + const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( + _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); + const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); + const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); + + const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); + const __m128 xx11 = _mm_mul_ps( + wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), + _MM_SHUFFLE(2, 3, 0, 1)))); + const __m128 xx12 = _mm_add_ps(xx10, xx11); + + const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); + const __m128 xx21 = _mm_mul_ps( + wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), + _MM_SHUFFLE(2, 3, 0, 1)))); + const __m128 xx22 = _mm_add_ps(xx20, xx21); + + _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 32], + _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); + + _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 48], + _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); + + _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 40], + _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); + + _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); + _mm_storel_epi64( + (__m128i*)&a[j0 + 56], + _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); + } + } +} + +void rftfsub_128_SSE2(float* a) { + const float* c = rdft_w + 32; + int j1, j2, k1, k2; + float wkr, wki, xr, xi, yr, yi; + + static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, + 0.5f}; + const __m128 mm_half = _mm_load_ps(k_half); + + // Vectorized code (four at once). + // Note: commented number are indexes for the first iteration of the loop. + for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { + // Load 'wk'. + const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, + const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, + const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, + const __m128 wkr_ = + _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28, + const __m128 wki_ = c_j1; // 1, 2, 3, 4, + // Load and shuffle 'a'. + const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5, + const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9, + const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123, + const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127, + const __m128 a_j2_p0 = _mm_shuffle_ps( + a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8, + const __m128 a_j2_p1 = _mm_shuffle_ps( + a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9, + const __m128 a_k2_p0 = _mm_shuffle_ps( + a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120, + const __m128 a_k2_p1 = _mm_shuffle_ps( + a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121, + // Calculate 'x'. + const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0); + // 2-126, 4-124, 6-122, 8-120, + const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1); + // 3-127, 5-125, 7-123, 9-121, + // Calculate product into 'y'. + // yr = wkr * xr - wki * xi; + // yi = wkr * xi + wki * xr; + const __m128 a_ = _mm_mul_ps(wkr_, xr_); + const __m128 b_ = _mm_mul_ps(wki_, xi_); + const __m128 c_ = _mm_mul_ps(wkr_, xi_); + const __m128 d_ = _mm_mul_ps(wki_, xr_); + const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120, + const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121, + // Update 'a'. + // a[j2 + 0] -= yr; + // a[j2 + 1] -= yi; + // a[k2 + 0] += yr; + // a[k2 + 1] -= yi; + const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8, + const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_); // 3, 5, 7, 9, + const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120, + const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_); // 127, 125, 123, 121, + // Shuffle in right order and store. + const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n); + // 2, 3, 4, 5, + const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n); + // 6, 7, 8, 9, + const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n); + // 122, 123, 120, 121, + const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n); + // 126, 127, 124, 125, + const __m128 a_k2_0n = _mm_shuffle_ps( + a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123, + const __m128 a_k2_4n = _mm_shuffle_ps( + a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127, + _mm_storeu_ps(&a[0 + j2], a_j2_0n); + _mm_storeu_ps(&a[4 + j2], a_j2_4n); + _mm_storeu_ps(&a[122 - j2], a_k2_0n); + _mm_storeu_ps(&a[126 - j2], a_k2_4n); + } + // Scalar code for the remaining items. + for (; j2 < 64; j1 += 1, j2 += 2) { + k2 = 128 - j2; + k1 = 32 - j1; + wkr = 0.5f - c[k1]; + wki = c[j1]; + xr = a[j2 + 0] - a[k2 + 0]; + xi = a[j2 + 1] + a[k2 + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + a[j2 + 0] -= yr; + a[j2 + 1] -= yi; + a[k2 + 0] += yr; + a[k2 + 1] -= yi; + } +} + +void rftbsub_128_SSE2(float* a) { + const float* c = rdft_w + 32; + int j1, j2, k1, k2; + float wkr, wki, xr, xi, yr, yi; + + static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, + 0.5f}; + const __m128 mm_half = _mm_load_ps(k_half); + + a[1] = -a[1]; + // Vectorized code (four at once). + // Note: commented number are indexes for the first iteration of the loop. + for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { + // Load 'wk'. + const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4, + const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, + const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, + const __m128 wkr_ = + _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28, + const __m128 wki_ = c_j1; // 1, 2, 3, 4, + // Load and shuffle 'a'. + const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5, + const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9, + const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123, + const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127, + const __m128 a_j2_p0 = _mm_shuffle_ps( + a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8, + const __m128 a_j2_p1 = _mm_shuffle_ps( + a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9, + const __m128 a_k2_p0 = _mm_shuffle_ps( + a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120, + const __m128 a_k2_p1 = _mm_shuffle_ps( + a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121, + // Calculate 'x'. + const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0); + // 2-126, 4-124, 6-122, 8-120, + const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1); + // 3-127, 5-125, 7-123, 9-121, + // Calculate product into 'y'. + // yr = wkr * xr + wki * xi; + // yi = wkr * xi - wki * xr; + const __m128 a_ = _mm_mul_ps(wkr_, xr_); + const __m128 b_ = _mm_mul_ps(wki_, xi_); + const __m128 c_ = _mm_mul_ps(wkr_, xi_); + const __m128 d_ = _mm_mul_ps(wki_, xr_); + const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120, + const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121, + // Update 'a'. + // a[j2 + 0] = a[j2 + 0] - yr; + // a[j2 + 1] = yi - a[j2 + 1]; + // a[k2 + 0] = yr + a[k2 + 0]; + // a[k2 + 1] = yi - a[k2 + 1]; + const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8, + const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1); // 3, 5, 7, 9, + const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120, + const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1); // 127, 125, 123, 121, + // Shuffle in right order and store. + const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n); + // 2, 3, 4, 5, + const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n); + // 6, 7, 8, 9, + const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n); + // 122, 123, 120, 121, + const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n); + // 126, 127, 124, 125, + const __m128 a_k2_0n = _mm_shuffle_ps( + a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123, + const __m128 a_k2_4n = _mm_shuffle_ps( + a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127, + _mm_storeu_ps(&a[0 + j2], a_j2_0n); + _mm_storeu_ps(&a[4 + j2], a_j2_4n); + _mm_storeu_ps(&a[122 - j2], a_k2_0n); + _mm_storeu_ps(&a[126 - j2], a_k2_4n); + } + // Scalar code for the remaining items. + for (; j2 < 64; j1 += 1, j2 += 2) { + k2 = 128 - j2; + k1 = 32 - j1; + wkr = 0.5f - c[k1]; + wki = c[j1]; + xr = a[j2 + 0] - a[k2 + 0]; + xi = a[j2 + 1] + a[k2 + 1]; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j2 + 0] = a[j2 + 0] - yr; + a[j2 + 1] = yi - a[j2 + 1]; + a[k2 + 0] = yr + a[k2 + 0]; + a[k2 + 1] = yi - a[k2 + 1]; + } + a[65] = -a[65]; +} +#endif + +} // namespace webrtc diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h new file mode 100644 index 0000000000..47d076ea2a --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_ +#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_ + +#include "modules/audio_processing/utility/ooura_fft.h" + +namespace webrtc { + +// This tables used to be computed at run-time. For example, refer to: +// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564 +// to see the initialization code. +// Constants shared by all paths (C, SSE2, NEON). +const float rdft_w[64] = { + 1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f, 0.9238795638f, + 0.3826834559f, 0.3826834559f, 0.9238795638f, 0.9807852507f, 0.1950903237f, + 0.5555702448f, 0.8314695954f, 0.8314695954f, 0.5555702448f, 0.1950903237f, + 0.9807852507f, 0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f, + 0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f, 0.9569403529f, + 0.2902846634f, 0.4713967443f, 0.8819212914f, 0.7730104327f, 0.6343933344f, + 0.0980171412f, 0.9951847196f, 0.7071067691f, 0.4993977249f, 0.4975923598f, + 0.4945882559f, 0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f, + 0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f, 0.4157347977f, + 0.4016037583f, 0.3865052164f, 0.3704755902f, 0.3535533845f, 0.3357794881f, + 0.3171966672f, 0.2978496552f, 0.2777851224f, 0.2570513785f, 0.2356983721f, + 0.2137775421f, 0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f, + 0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f, +}; + +// Constants used by the C and MIPS paths. +const float rdft_wk3ri_first[16] = { + 1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f, + 0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f, + 0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f, + 0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f, +}; +const float rdft_wk3ri_second[16] = { + -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f, + -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f, + -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f, + -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f, +}; + +} // namespace webrtc + +#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_COMMON_H_ diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h new file mode 100644 index 0000000000..1c44ae7197 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_ +#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_ + +#include "modules/audio_processing/utility/ooura_fft.h" + +#ifdef _MSC_VER /* visual c++ */ +#define ALIGN16_BEG __declspec(align(16)) +#define ALIGN16_END +#else /* gcc or icc */ +#define ALIGN16_BEG +#define ALIGN16_END __attribute__((aligned(16))) +#endif + +namespace webrtc { + +// These tables used to be computed at run-time. For example, refer to: +// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564 +// to see the initialization code. +#if defined(WEBRTC_ARCH_X86_FAMILY) || defined(WEBRTC_HAS_NEON) +// Constants used by SSE2 and NEON but initialized in the C path. +const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f}; + +ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = { + 1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f, 0.923879564f, + 0.923879564f, 0.382683456f, 0.382683456f, 0.980785251f, 0.980785251f, + 0.555570245f, 0.555570245f, 0.831469595f, 0.831469595f, 0.195090324f, + 0.195090324f, 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f, + 0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f, 0.956940353f, + 0.956940353f, 0.471396744f, 0.471396744f, 0.773010433f, 0.773010433f, + 0.098017141f, 0.098017141f, +}; +ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = { + 1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f, 0.707106769f, + 0.707106769f, -0.707106769f, -0.707106769f, 0.923879564f, 0.923879564f, + -0.382683456f, -0.382683456f, 0.382683456f, 0.382683456f, -0.923879564f, + -0.923879564f, 0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f, + 0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f, 0.831469595f, + 0.831469595f, -0.555570245f, -0.555570245f, 0.195090324f, 0.195090324f, + -0.980785251f, -0.980785251f, +}; +ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = { + 1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f, 0.382683456f, + 0.382683456f, -0.923879564f, -0.923879564f, 0.831469536f, 0.831469536f, + -0.980785251f, -0.980785251f, -0.195090353f, -0.195090353f, -0.555570245f, + -0.555570245f, 0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f, + 0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f, 0.634393334f, + 0.634393334f, -0.995184720f, -0.995184720f, -0.471396863f, -0.471396863f, + -0.290284693f, -0.290284693f, +}; +ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = { + -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, -0.382683456f, + 0.382683456f, -0.923879564f, 0.923879564f, -0.195090324f, 0.195090324f, + -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, -0.980785251f, + 0.980785251f, -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f, + -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f, -0.290284663f, + 0.290284663f, -0.881921291f, 0.881921291f, -0.634393334f, 0.634393334f, + -0.995184720f, 0.995184720f, +}; +ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = { + -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f, -0.707106769f, + 0.707106769f, -0.707106769f, 0.707106769f, -0.382683456f, 0.382683456f, + -0.923879564f, 0.923879564f, -0.923879564f, 0.923879564f, -0.382683456f, + 0.382683456f, -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f, + -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, -0.555570245f, + 0.555570245f, -0.831469595f, 0.831469595f, -0.980785251f, 0.980785251f, + -0.195090324f, 0.195090324f, +}; +ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = { + -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, -0.923879564f, + 0.923879564f, 0.382683456f, -0.382683456f, -0.555570245f, 0.555570245f, + -0.195090353f, 0.195090353f, -0.980785251f, 0.980785251f, 0.831469536f, + -0.831469536f, -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f, + -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f, -0.773010492f, + 0.773010492f, 0.098017156f, -0.098017156f, -0.881921172f, 0.881921172f, + 0.956940353f, -0.956940353f, +}; +ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = { + 0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f, +}; +#endif + +} // namespace webrtc + +#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_ |