diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
commit | 2aa4a82499d4becd2284cdb482213d541b8804dd (patch) | |
tree | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility | |
parent | Initial commit. (diff) | |
download | firefox-upstream.tar.xz firefox-upstream.zip |
Adding upstream version 86.0.1. (refs: upstream/86.0.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility')
7 files changed, 1420 insertions, 0 deletions
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc new file mode 100644 index 0000000000..0e696d9fff --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/intelligibility/intelligibility_enhancer.h" + +#include <math.h> +#include <stdlib.h> +#include <algorithm> +#include <limits> +#include <numeric> + +#include "common_audio/include/audio_util.h" +#include "common_audio/window_generator.h" +#include "rtc_base/checks.h" +#include "rtc_base/logging.h" +#include "rtc_base/numerics/safe_minmax.h" + +namespace webrtc { + +namespace { + +const size_t kErbResolution = 2; +const int kWindowSizeMs = 16; +const int kChunkSizeMs = 10; // Size provided by APM. +const float kClipFreqKhz = 0.2f; +const float kKbdAlpha = 1.5f; +const float kLambdaBot = -1.f; // Extreme values in bisection +const float kLambdaTop = -1e-5f; // search for lamda. +const float kVoiceProbabilityThreshold = 0.5f; +// Number of chunks after voice activity which is still considered speech. +const size_t kSpeechOffsetDelay = 10; +const float kDecayRate = 0.995f; // Power estimation decay rate. +const float kMaxRelativeGainChange = 0.005f; +const float kRho = 0.0004f; // Default production and interpretation SNR. 
+const float kPowerNormalizationFactor = 1.f / (1 << 30); +const float kMaxActiveSNR = 128.f; // 21dB +const float kMinInactiveSNR = 32.f; // 15dB +const size_t kGainUpdatePeriod = 10u; + +// Returns dot product of vectors |a| and |b| with size |length|. +float DotProduct(const float* a, const float* b, size_t length) { + float ret = 0.f; + for (size_t i = 0; i < length; ++i) { + ret += a[i] * b[i]; + } + return ret; +} + +// Computes the power across ERB bands from the power spectral density |pow|. +// Stores it in |result|. +void MapToErbBands(const float* pow, + const std::vector<std::vector<float>>& filter_bank, + float* result) { + for (size_t i = 0; i < filter_bank.size(); ++i) { + RTC_DCHECK_GT(filter_bank[i].size(), 0); + result[i] = kPowerNormalizationFactor * + DotProduct(filter_bank[i].data(), pow, filter_bank[i].size()); + } +} + +} // namespace + +IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz, + size_t num_render_channels, + size_t num_bands, + size_t num_noise_bins) + : freqs_(RealFourier::ComplexLength( + RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))), + num_noise_bins_(num_noise_bins), + chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)), + bank_size_(GetBankSize(sample_rate_hz, kErbResolution)), + sample_rate_hz_(sample_rate_hz), + num_render_channels_(num_render_channels), + clear_power_estimator_(freqs_, kDecayRate), + noise_power_estimator_(num_noise_bins, kDecayRate), + filtered_clear_pow_(bank_size_, 0.f), + filtered_noise_pow_(num_noise_bins, 0.f), + center_freqs_(bank_size_), + capture_filter_bank_(CreateErbBank(num_noise_bins)), + render_filter_bank_(CreateErbBank(freqs_)), + gains_eq_(bank_size_), + gain_applier_(freqs_, kMaxRelativeGainChange), + audio_s16_(chunk_length_), + chunks_since_voice_(kSpeechOffsetDelay), + is_speech_(false), + snr_(kMaxActiveSNR), + is_active_(false), + num_chunks_(0u), + num_active_chunks_(0u), + noise_estimation_buffer_(num_noise_bins), + 
noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer, + std::vector<float>(num_noise_bins), + RenderQueueItemVerifier<float>(num_noise_bins)) { + RTC_DCHECK_LE(kRho, 1.f); + + const size_t erb_index = static_cast<size_t>( + ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + + 43.f)); + start_freq_ = std::max(static_cast<size_t>(1), erb_index * kErbResolution); + + size_t window_size = static_cast<size_t>(1) << RealFourier::FftOrder(freqs_); + std::vector<float> kbd_window(window_size); + WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, + kbd_window.data()); + render_mangler_.reset(new LappedTransform( + num_render_channels_, num_render_channels_, chunk_length_, + kbd_window.data(), window_size, window_size / 2, this)); + + const size_t initial_delay = render_mangler_->initial_delay(); + for (size_t i = 0u; i < num_bands - 1; ++i) { + high_bands_buffers_.push_back(std::unique_ptr<intelligibility::DelayBuffer>( + new intelligibility::DelayBuffer(initial_delay, num_render_channels_))); + } +} + +IntelligibilityEnhancer::~IntelligibilityEnhancer() { + // Don't rely on this log, since the destructor isn't called when the + // app/tab is killed. + if (num_chunks_ > 0) { + RTC_LOG(LS_INFO) << "Intelligibility Enhancer was active for " + << 100.f * static_cast<float>(num_active_chunks_) / + num_chunks_ + << "% of the call."; + } else { + RTC_LOG(LS_INFO) << "Intelligibility Enhancer processed no chunk."; + } +} + +void IntelligibilityEnhancer::SetCaptureNoiseEstimate( + std::vector<float> noise, float gain) { + RTC_DCHECK_EQ(noise.size(), num_noise_bins_); + for (auto& bin : noise) { + bin *= gain; + } + // Disregarding return value since buffer overflow is acceptable, because it + // is not critical to get each noise estimate. 
+ if (noise_estimation_queue_.Insert(&noise)) { + }; +} + +void IntelligibilityEnhancer::ProcessRenderAudio(AudioBuffer* audio) { + RTC_DCHECK_EQ(num_render_channels_, audio->num_channels()); + while (noise_estimation_queue_.Remove(&noise_estimation_buffer_)) { + noise_power_estimator_.Step(noise_estimation_buffer_.data()); + } + float* const* low_band = audio->split_channels_f(kBand0To8kHz); + is_speech_ = IsSpeech(low_band[0]); + render_mangler_->ProcessChunk(low_band, low_band); + DelayHighBands(audio); +} + +void IntelligibilityEnhancer::ProcessAudioBlock( + const std::complex<float>* const* in_block, + size_t in_channels, + size_t frames, + size_t /* out_channels */, + std::complex<float>* const* out_block) { + RTC_DCHECK_EQ(freqs_, frames); + if (is_speech_) { + clear_power_estimator_.Step(in_block[0]); + } + SnrBasedEffectActivation(); + ++num_chunks_; + if (is_active_) { + ++num_active_chunks_; + if (num_chunks_ % kGainUpdatePeriod == 0) { + MapToErbBands(clear_power_estimator_.power().data(), render_filter_bank_, + filtered_clear_pow_.data()); + MapToErbBands(noise_power_estimator_.power().data(), capture_filter_bank_, + filtered_noise_pow_.data()); + SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data()); + const float power_target = std::accumulate( + filtered_clear_pow_.data(), + filtered_clear_pow_.data() + bank_size_, + 0.f); + const float power_top = + DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); + SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data()); + const float power_bot = + DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); + if (power_target >= power_bot && power_target <= power_top) { + SolveForLambda(power_target); + UpdateErbGains(); + } // Else experiencing power underflow, so do nothing. 
+ } + } + for (size_t i = 0; i < in_channels; ++i) { + gain_applier_.Apply(in_block[i], out_block[i]); + } +} + +void IntelligibilityEnhancer::SnrBasedEffectActivation() { + const float* clear_psd = clear_power_estimator_.power().data(); + const float* noise_psd = noise_power_estimator_.power().data(); + const float clear_power = + std::accumulate(clear_psd, clear_psd + freqs_, 0.f); + const float noise_power = + std::accumulate(noise_psd, noise_psd + freqs_, 0.f); + snr_ = kDecayRate * snr_ + (1.f - kDecayRate) * clear_power / + (noise_power + std::numeric_limits<float>::epsilon()); + if (is_active_) { + if (snr_ > kMaxActiveSNR) { + RTC_LOG(LS_INFO) << "Intelligibility Enhancer was deactivated at chunk " + << num_chunks_; + is_active_ = false; + // Set the target gains to unity. + float* gains = gain_applier_.target(); + for (size_t i = 0; i < freqs_; ++i) { + gains[i] = 1.f; + } + } + } else { + if (snr_ < kMinInactiveSNR) { + RTC_LOG(LS_INFO) << "Intelligibility Enhancer was activated at chunk " + << num_chunks_; + is_active_ = true; + } + } +} + +void IntelligibilityEnhancer::SolveForLambda(float power_target) { + const float kConvergeThresh = 0.001f; // TODO(ekmeyerson): Find best values + const int kMaxIters = 100; // for these, based on experiments. + + const float reciprocal_power_target = + 1.f / (power_target + std::numeric_limits<float>::epsilon()); + float lambda_bot = kLambdaBot; + float lambda_top = kLambdaTop; + float power_ratio = 2.f; // Ratio of achieved power to target power. 
+ int iters = 0; + while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) { + const float lambda = (lambda_bot + lambda_top) / 2.f; + SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.data()); + const float power = + DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); + if (power < power_target) { + lambda_bot = lambda; + } else { + lambda_top = lambda; + } + power_ratio = std::fabs(power * reciprocal_power_target); + ++iters; + } +} + +void IntelligibilityEnhancer::UpdateErbGains() { + // (ERB gain) = filterbank' * (freq gain) + float* gains = gain_applier_.target(); + for (size_t i = 0; i < freqs_; ++i) { + gains[i] = 0.f; + for (size_t j = 0; j < bank_size_; ++j) { + gains[i] += render_filter_bank_[j][i] * gains_eq_[j]; + } + } +} + +size_t IntelligibilityEnhancer::GetBankSize(int sample_rate, + size_t erb_resolution) { + float freq_limit = sample_rate / 2000.f; + size_t erb_scale = static_cast<size_t>(ceilf( + 11.17f * logf((freq_limit + 0.312f) / (freq_limit + 14.6575f)) + 43.f)); + return erb_scale * erb_resolution; +} + +std::vector<std::vector<float>> IntelligibilityEnhancer::CreateErbBank( + size_t num_freqs) { + std::vector<std::vector<float>> filter_bank(bank_size_); + size_t lf = 1, rf = 4; + + for (size_t i = 0; i < bank_size_; ++i) { + float abs_temp = fabsf((i + 1.f) / static_cast<float>(kErbResolution)); + center_freqs_[i] = 676170.4f / (47.06538f - expf(0.08950404f * abs_temp)); + center_freqs_[i] -= 14678.49f; + } + float last_center_freq = center_freqs_[bank_size_ - 1]; + for (size_t i = 0; i < bank_size_; ++i) { + center_freqs_[i] *= 0.5f * sample_rate_hz_ / last_center_freq; + } + + for (size_t i = 0; i < bank_size_; ++i) { + filter_bank[i].resize(num_freqs); + } + + for (size_t i = 1; i <= bank_size_; ++i) { + size_t lll = static_cast<size_t>( + round(center_freqs_[rtc::SafeMax<size_t>(1, i - lf) - 1] * num_freqs / + (0.5f * sample_rate_hz_))); + size_t ll = static_cast<size_t>( + 
round(center_freqs_[rtc::SafeMax<size_t>(1, i) - 1] * num_freqs / + (0.5f * sample_rate_hz_))); + lll = rtc::SafeClamp<size_t>(lll, 1, num_freqs) - 1; + ll = rtc::SafeClamp<size_t>(ll, 1, num_freqs) - 1; + + size_t rrr = static_cast<size_t>( + round(center_freqs_[rtc::SafeMin<size_t>(bank_size_, i + rf) - 1] * + num_freqs / (0.5f * sample_rate_hz_))); + size_t rr = static_cast<size_t>( + round(center_freqs_[rtc::SafeMin<size_t>(bank_size_, i + 1) - 1] * + num_freqs / (0.5f * sample_rate_hz_))); + rrr = rtc::SafeClamp<size_t>(rrr, 1, num_freqs) - 1; + rr = rtc::SafeClamp<size_t>(rr, 1, num_freqs) - 1; + + float step = ll == lll ? 0.f : 1.f / (ll - lll); + float element = 0.f; + for (size_t j = lll; j <= ll; ++j) { + filter_bank[i - 1][j] = element; + element += step; + } + step = rr == rrr ? 0.f : 1.f / (rrr - rr); + element = 1.f; + for (size_t j = rr; j <= rrr; ++j) { + filter_bank[i - 1][j] = element; + element -= step; + } + for (size_t j = ll; j <= rr; ++j) { + filter_bank[i - 1][j] = 1.f; + } + } + + for (size_t i = 0; i < num_freqs; ++i) { + float sum = 0.f; + for (size_t j = 0; j < bank_size_; ++j) { + sum += filter_bank[j][i]; + } + for (size_t j = 0; j < bank_size_; ++j) { + filter_bank[j][i] /= sum; + } + } + return filter_bank; +} + +void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, + size_t start_freq, + float* sols) { + const float kMinPower = 1e-5f; + + const float* pow_x0 = filtered_clear_pow_.data(); + const float* pow_n0 = filtered_noise_pow_.data(); + + for (size_t n = 0; n < start_freq; ++n) { + sols[n] = 1.f; + } + + // Analytic solution for optimal gains. See paper for derivation. 
+ for (size_t n = start_freq; n < bank_size_; ++n) { + if (pow_x0[n] < kMinPower || pow_n0[n] < kMinPower) { + sols[n] = 1.f; + } else { + const float gamma0 = 0.5f * kRho * pow_x0[n] * pow_n0[n] + + lambda * pow_x0[n] * pow_n0[n] * pow_n0[n]; + const float beta0 = + lambda * pow_x0[n] * (2.f - kRho) * pow_x0[n] * pow_n0[n]; + const float alpha0 = + lambda * pow_x0[n] * (1.f - kRho) * pow_x0[n] * pow_x0[n]; + RTC_DCHECK_LT(alpha0, 0.f); + // The quadratic equation should always have real roots, but to guard + // against numerical errors we limit it to a minimum of zero. + sols[n] = std::max( + 0.f, (-beta0 - std::sqrt(std::max( + 0.f, beta0 * beta0 - 4.f * alpha0 * gamma0))) / + (2.f * alpha0)); + } + } +} + +bool IntelligibilityEnhancer::IsSpeech(const float* audio) { + FloatToS16(audio, chunk_length_, audio_s16_.data()); + vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_); + if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { + chunks_since_voice_ = 0; + } else if (chunks_since_voice_ < kSpeechOffsetDelay) { + ++chunks_since_voice_; + } + return chunks_since_voice_ < kSpeechOffsetDelay; +} + +void IntelligibilityEnhancer::DelayHighBands(AudioBuffer* audio) { + RTC_DCHECK_EQ(audio->num_bands(), high_bands_buffers_.size() + 1); + for (size_t i = 0u; i < high_bands_buffers_.size(); ++i) { + Band band = static_cast<Band>(i + 1); + high_bands_buffers_[i]->Delay(audio->split_channels_f(band), chunk_length_); + } +} + +} // namespace webrtc diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h new file mode 100644 index 0000000000..3e0e269c58 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ +#define MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ + +#include <complex> +#include <memory> +#include <vector> + +#include "common_audio/channel_buffer.h" +#include "common_audio/lapped_transform.h" +#include "modules/audio_processing/audio_buffer.h" +#include "modules/audio_processing/intelligibility/intelligibility_utils.h" +#include "modules/audio_processing/render_queue_item_verifier.h" +#include "modules/audio_processing/vad/voice_activity_detector.h" +#include "rtc_base/swap_queue.h" + +namespace webrtc { + +// Speech intelligibility enhancement module. Reads render and capture +// audio streams and modifies the render stream with a set of gains per +// frequency bin to enhance speech against the noise background. +// Details of the model and algorithm can be found in the original paper: +// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788 +class IntelligibilityEnhancer : public LappedTransform::Callback { + public: + IntelligibilityEnhancer(int sample_rate_hz, + size_t num_render_channels, + size_t num_bands, + size_t num_noise_bins); + + ~IntelligibilityEnhancer() override; + + // Sets the capture noise magnitude spectrum estimate. + void SetCaptureNoiseEstimate(std::vector<float> noise, float gain); + + // Reads chunk of speech in time domain and updates with modified signal. + void ProcessRenderAudio(AudioBuffer* audio); + bool active() const; + + protected: + // All in frequency domain, receives input |in_block|, applies + // intelligibility enhancement, and writes result to |out_block|. 
+ void ProcessAudioBlock(const std::complex<float>* const* in_block, + size_t in_channels, + size_t frames, + size_t out_channels, + std::complex<float>* const* out_block) override; + + private: + FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestRenderUpdate); + FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation); + FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains); + FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, + TestNoiseGainHasExpectedResult); + FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, + TestAllBandsHaveSameDelay); + + // Updates the SNR estimation and enables or disables this component using a + // hysteresis. + void SnrBasedEffectActivation(); + + // Bisection search for optimal |lambda|. + void SolveForLambda(float power_target); + + // Transforms freq gains to ERB gains. + void UpdateErbGains(); + + // Returns number of ERB filters. + static size_t GetBankSize(int sample_rate, size_t erb_resolution); + + // Initializes ERB filterbank. + std::vector<std::vector<float>> CreateErbBank(size_t num_freqs); + + // Analytically solves quadratic for optimal gains given |lambda|. + // Negative gains are set to 0. Stores the results in |sols|. + void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols); + + // Returns true if the audio is speech. + bool IsSpeech(const float* audio); + + // Delays the high bands to compensate for the processing delay in the low + // band. + void DelayHighBands(AudioBuffer* audio); + + static const size_t kMaxNumNoiseEstimatesToBuffer = 5; + + const size_t freqs_; // Num frequencies in frequency domain. + const size_t num_noise_bins_; + const size_t chunk_length_; // Chunk size in samples. + const size_t bank_size_; // Num ERB filters. 
+ const int sample_rate_hz_; + const size_t num_render_channels_; + + intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_; + intelligibility::PowerEstimator<float> noise_power_estimator_; + std::vector<float> filtered_clear_pow_; + std::vector<float> filtered_noise_pow_; + std::vector<float> center_freqs_; + std::vector<std::vector<float>> capture_filter_bank_; + std::vector<std::vector<float>> render_filter_bank_; + size_t start_freq_; + + std::vector<float> gains_eq_; // Pre-filter modified gains. + intelligibility::GainApplier gain_applier_; + + std::unique_ptr<LappedTransform> render_mangler_; + + VoiceActivityDetector vad_; + std::vector<int16_t> audio_s16_; + size_t chunks_since_voice_; + bool is_speech_; + float snr_; + bool is_active_; + + unsigned long int num_chunks_; + unsigned long int num_active_chunks_; + + std::vector<float> noise_estimation_buffer_; + SwapQueue<std::vector<float>, RenderQueueItemVerifier<float>> + noise_estimation_queue_; + + std::vector<std::unique_ptr<intelligibility::DelayBuffer>> + high_bands_buffers_; +}; + +} // namespace webrtc + +#endif // MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc new file mode 100644 index 0000000000..98a8dae469 --- /dev/null +++ b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc @@ -0,0 +1,536 @@ +/* + * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <math.h> +#include <stdlib.h> + +#include <algorithm> +#include <memory> +#include <vector> + +#include "api/array_view.h" +#include "common_audio/signal_processing/include/signal_processing_library.h" +#include "modules/audio_processing/audio_buffer.h" +#include "modules/audio_processing/intelligibility/intelligibility_enhancer.h" +#include "modules/audio_processing/noise_suppression_impl.h" +#include "modules/audio_processing/test/audio_buffer_tools.h" +#include "modules/audio_processing/test/bitexactness_tools.h" +#include "rtc_base/arraysize.h" +#include "test/gtest.h" + +namespace webrtc { + +namespace { + +// Target output for ERB create test. Generated with matlab. +const float kTestCenterFreqs[] = { + 14.5213f, 29.735f, 45.6781f, 62.3884f, 79.9058f, 98.2691f, 117.521f, + 137.708f, 158.879f, 181.084f, 204.378f, 228.816f, 254.459f, 281.371f, + 309.618f, 339.273f, 370.411f, 403.115f, 437.469f, 473.564f, 511.497f, + 551.371f, 593.293f, 637.386f, 683.77f, 732.581f, 783.96f, 838.06f, + 895.046f, 955.09f, 1018.38f, 1085.13f, 1155.54f, 1229.85f, 1308.32f, + 1391.22f, 1478.83f, 1571.5f, 1669.55f, 1773.37f, 1883.37f, 2000.f}; +const float kTestFilterBank[][33] = { + {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.2f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + 
{0.2f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.25f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.25f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.25f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.25f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 
0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.142857f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.285714f, 0.157895f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.210526f, 0.117647f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, 0.315789f, 0.176471f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.315789f, 0.352941f, 0.142857f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.352941f, 0.285714f, + 0.157895f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.285714f, + 0.210526f, 0.111111f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.285714f, 0.315789f, 0.222222f, 0.111111f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.315789f, 0.333333f, 0.222222f, 0.111111f, 0.f, 0.f, 
0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, 0.111111f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, 0.222222f, + 0.108108f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.333333f, + 0.243243f, 0.153846f, 0.0833333f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.333333f, + 0.324324f, 0.230769f, 0.166667f, 0.0909091f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.324324f, 0.307692f, 0.25f, 0.181818f, 0.0833333f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.307692f, 0.333333f, + 0.363636f, 0.25f, 0.151515f, 0.0793651f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.166667f, 0.363636f, 0.333333f, 0.242424f, + 0.190476f, 0.133333f, 0.0689655f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 
0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.333333f, 0.30303f, 0.253968f, 0.2f, 0.137931f, + 0.0714286f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.30303f, 0.31746f, 0.333333f, 0.275862f, 0.214286f, + 0.125f, 0.0655738f, 0.f, 0.f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.15873f, 0.333333f, 0.344828f, 0.357143f, + 0.25f, 0.196721f, 0.137931f, 0.0816327f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.172414f, 0.357143f, + 0.3125f, 0.245902f, 0.172414f, 0.102041f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.3125f, 0.327869f, 0.344828f, 0.204082f, 0.f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.163934f, 0.344828f, 0.408163f, 0.5f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.204082f, 0.5f}}; +static_assert(arraysize(kTestCenterFreqs) == arraysize(kTestFilterBank), + "Test filterbank badly initialized."); + +// Target output for gain solving test. Generated with matlab. +const size_t kTestStartFreq = 12; // Lowest integral frequency for ERBs. 
+const float kTestZeroVar = 1.f; +const float kTestNonZeroVarLambdaTop[] = { + 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; +static_assert(arraysize(kTestCenterFreqs) == + arraysize(kTestNonZeroVarLambdaTop), + "Power test data badly initialized."); +const float kMaxTestError = 0.005f; + +// Enhancer initialization parameters. +const int kSamples = 10000; +const int kSampleRate = 4000; +const int kNumChannels = 1; +const int kFragmentSize = kSampleRate / 100; +const size_t kNumNoiseBins = 129; +const size_t kNumBands = 1; + +// Number of frames to process in the bitexactness tests. +const size_t kNumFramesToProcess = 1000; + +int IntelligibilityEnhancerSampleRate(int sample_rate_hz) { + return (sample_rate_hz > AudioProcessing::kSampleRate16kHz + ? AudioProcessing::kSampleRate16kHz + : sample_rate_hz); +} + +// Process one frame of data and produce the output. +void ProcessOneFrame(int sample_rate_hz, + AudioBuffer* render_audio_buffer, + AudioBuffer* capture_audio_buffer, + NoiseSuppressionImpl* noise_suppressor, + IntelligibilityEnhancer* intelligibility_enhancer) { + if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) { + render_audio_buffer->SplitIntoFrequencyBands(); + capture_audio_buffer->SplitIntoFrequencyBands(); + } + + intelligibility_enhancer->ProcessRenderAudio(render_audio_buffer); + + noise_suppressor->AnalyzeCaptureAudio(capture_audio_buffer); + noise_suppressor->ProcessCaptureAudio(capture_audio_buffer); + + intelligibility_enhancer->SetCaptureNoiseEstimate( + noise_suppressor->NoiseEstimate(), 0); + + if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) { + render_audio_buffer->MergeFrequencyBands(); + } +} + +// Processes a specified amount of frames, verifies the results and reports +// any errors. 
+void RunBitexactnessTest(int sample_rate_hz,
+                         size_t num_channels,
+                         rtc::ArrayView<const float> output_reference) {
+  // Render (far-end) side: buffer, test-vector file and interleaved scratch.
+  const StreamConfig render_config(sample_rate_hz, num_channels, false);
+  AudioBuffer render_buffer(
+      render_config.num_frames(), render_config.num_channels(),
+      render_config.num_frames(), render_config.num_channels(),
+      render_config.num_frames());
+  test::InputAudioFile render_file(
+      test::GetApmRenderTestVectorFileName(sample_rate_hz));
+  std::vector<float> render_input(render_buffer.num_frames() *
+                                  render_buffer.num_channels());
+
+  // Capture (near-end) side. The scratch vector is sized from the capture
+  // buffer itself rather than from the render buffer.
+  const StreamConfig capture_config(sample_rate_hz, num_channels, false);
+  AudioBuffer capture_buffer(
+      capture_config.num_frames(), capture_config.num_channels(),
+      capture_config.num_frames(), capture_config.num_channels(),
+      capture_config.num_frames());
+  test::InputAudioFile capture_file(
+      test::GetApmCaptureTestVectorFileName(sample_rate_hz));
+  std::vector<float> capture_input(capture_buffer.num_frames() *
+                                   capture_buffer.num_channels());
+
+  rtc::CriticalSection crit_capture;
+  NoiseSuppressionImpl noise_suppressor(&crit_capture);
+  noise_suppressor.Initialize(capture_config.num_channels(), sample_rate_hz);
+  noise_suppressor.Enable(true);
+
+  IntelligibilityEnhancer intelligibility_enhancer(
+      IntelligibilityEnhancerSampleRate(sample_rate_hz),
+      render_config.num_channels(), kNumBands,
+      NoiseSuppressionImpl::num_noise_bins());
+
+  for (size_t frame_no = 0u; frame_no < kNumFramesToProcess; ++frame_no) {
+    ReadFloatSamplesFromStereoFile(render_buffer.num_frames(),
+                                   render_buffer.num_channels(), &render_file,
+                                   render_input);
+    ReadFloatSamplesFromStereoFile(capture_buffer.num_frames(),
+                                   capture_buffer.num_channels(), &capture_file,
+                                   capture_input);
+
+    test::CopyVectorToAudioBuffer(render_config, render_input, &render_buffer);
+    test::CopyVectorToAudioBuffer(capture_config, capture_input,
+                                  &capture_buffer);
+
+    ProcessOneFrame(sample_rate_hz, &render_buffer, &capture_buffer,
+                    &noise_suppressor, &intelligibility_enhancer);
+  }
+
+  // Extract and verify the test results.
+  std::vector<float> render_output;
+  test::ExtractVectorFromAudioBuffer(render_config, &render_buffer,
+                                     &render_output);
+
+  const float kElementErrorBound = 1.f / static_cast<float>(1 << 15);
+
+  // Compare the output with the reference. Only the first values of the
+  // output from the last frame processed are compared, so that the preceding
+  // frames do not all have to be specified as test vectors. As the algorithm
+  // being tested has memory, testing only the last frame implicitly also tests
+  // the preceding frames.
+  EXPECT_TRUE(test::VerifyDeinterleavedArray(
+      render_buffer.num_frames(), render_config.num_channels(),
+      output_reference, render_output, kElementErrorBound));
+}
+
+// Returns a pseudo-random float in [-1, 1] drawn from std::rand().
+float float_rand() {
+  return std::rand() * 2.f / RAND_MAX - 1;
+}
+
+}  // namespace
+
+// Fixture owning an enhancer configured with the k* constants above, plus the
+// sample vectors the render-update test operates on.
+class IntelligibilityEnhancerTest : public ::testing::Test {
+ protected:
+  IntelligibilityEnhancerTest()
+      : clear_buffer_(kFragmentSize,
+                      kNumChannels,
+                      kFragmentSize,
+                      kNumChannels,
+                      kFragmentSize),
+        stream_config_(kSampleRate, kNumChannels),
+        clear_data_(kSamples),
+        noise_data_(kNumNoiseBins),
+        orig_data_(kSamples),
+        // Tests overwrite this; initialise it so CheckUpdate() never reads an
+        // indeterminate value.
+        initial_delay_(0u) {
+    // Fixed seed so that float_rand() is reproducible between runs.
+    std::srand(1);
+    enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels, kNumBands,
+                                           kNumNoiseBins));
+  }
+
+  // Runs |clear_data_| through a fresh enhancer in kFragmentSize chunks
+  // (in place), then returns true if any processed sample differs from
+  // |orig_data_|, shifted by |initial_delay_|, by more than kMaxTestError.
+  bool CheckUpdate() {
+    enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels, kNumBands,
+                                           kNumNoiseBins));
+    float* clear_cursor = clear_data_.data();
+    for (int i = 0; i < kSamples; i += kFragmentSize) {
+      enh_->SetCaptureNoiseEstimate(noise_data_, 1);
+      clear_buffer_.CopyFrom(&clear_cursor, stream_config_);
+      enh_->ProcessRenderAudio(&clear_buffer_);
+      clear_buffer_.CopyTo(stream_config_, &clear_cursor);
+      clear_cursor += kFragmentSize;
+    }
+    // Index with size_t: |initial_delay_| is a size_t and is subtracted from
+    // the index below.
+    for (size_t i = initial_delay_; i < static_cast<size_t>(kSamples); ++i) {
+      if (std::fabs(clear_data_[i] - orig_data_[i - initial_delay_]) >
+          kMaxTestError) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  std::unique_ptr<IntelligibilityEnhancer> enh_;
+  // Render clean speech buffer.
+  AudioBuffer clear_buffer_;
+  StreamConfig stream_config_;
+  std::vector<float> clear_data_;
+  std::vector<float> noise_data_;
+  std::vector<float> orig_data_;
+  size_t initial_delay_;
+};
+
+// For each class of generated data, tests that render stream is updated when
+// it should be.
+TEST_F(IntelligibilityEnhancerTest, TestRenderUpdate) {
+  initial_delay_ = enh_->render_mangler_->initial_delay();
+  // All-zero speech and noise: no update expected.
+  std::fill(noise_data_.begin(), noise_data_.end(), 0.f);
+  std::fill(orig_data_.begin(), orig_data_.end(), 0.f);
+  std::fill(clear_data_.begin(), clear_data_.end(), 0.f);
+  EXPECT_FALSE(CheckUpdate());
+  // Random speech with zero noise: still no update expected.
+  std::generate(clear_data_.begin(), clear_data_.end(), float_rand);
+  orig_data_ = clear_data_;
+  EXPECT_FALSE(CheckUpdate());
+  // Random speech with random noise: the render stream must change.
+  std::generate(clear_data_.begin(), clear_data_.end(), float_rand);
+  orig_data_ = clear_data_;
+  std::generate(noise_data_.begin(), noise_data_.end(), float_rand);
+  FloatToFloatS16(noise_data_.data(), noise_data_.size(), noise_data_.data());
+  EXPECT_TRUE(CheckUpdate());
+}
+
+// Tests ERB bank creation, comparing against matlab output.
+TEST_F(IntelligibilityEnhancerTest, TestErbCreation) {
+  ASSERT_EQ(arraysize(kTestCenterFreqs), enh_->bank_size_);
+  // The row width does not depend on the row, so check it once up front.
+  ASSERT_EQ(arraysize(kTestFilterBank[0]), enh_->freqs_);
+  for (size_t i = 0; i < enh_->bank_size_; ++i) {
+    EXPECT_NEAR(kTestCenterFreqs[i], enh_->center_freqs_[i], kMaxTestError);
+    for (size_t j = 0; j < enh_->freqs_; ++j) {
+      EXPECT_NEAR(kTestFilterBank[i][j], enh_->render_filter_bank_[i][j],
+                  kMaxTestError);
+    }
+  }
+}
+
+// Tests analytic solution for optimal gains, comparing
+// against matlab output.
+TEST_F(IntelligibilityEnhancerTest, TestSolveForGains) {
+  ASSERT_EQ(kTestStartFreq, enh_->start_freq_);
+  std::vector<float> sols(enh_->bank_size_);
+  float lambda = -0.001f;
+  // Zero speech and noise power: every gain is expected to be kTestZeroVar.
+  for (size_t i = 0; i < enh_->bank_size_; i++) {
+    enh_->filtered_clear_pow_[i] = 0.f;
+    enh_->filtered_noise_pow_[i] = 0.f;
+  }
+  enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data());
+  for (size_t i = 0; i < enh_->bank_size_; i++) {
+    EXPECT_NEAR(kTestZeroVar, sols[i], kMaxTestError);
+  }
+  // Rising speech power against falling noise power.
+  for (size_t i = 0; i < enh_->bank_size_; i++) {
+    enh_->filtered_clear_pow_[i] = static_cast<float>(i + 1);
+    enh_->filtered_noise_pow_[i] = static_cast<float>(enh_->bank_size_ - i);
+  }
+  enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data());
+  for (size_t i = 0; i < enh_->bank_size_; i++) {
+    EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError);
+  }
+  // The same reference is expected for lambda = -1.
+  lambda = -1.f;
+  enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data());
+  for (size_t i = 0; i < enh_->bank_size_; i++) {
+    EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError);
+  }
+}
+
+// Feeds a constant noise vector with gain kGain for kNumFramesToProcess frames
+// and checks that the estimated noise power is within kTolerance (relative)
+// of (kGain * noise)^2 in every bin.
+TEST_F(IntelligibilityEnhancerTest, TestNoiseGainHasExpectedResult) {
+  const float kGain = 2.f;
+  const float kTolerance = 0.007f;
+  std::vector<float> noise(kNumNoiseBins);
+  std::vector<float> noise_psd(kNumNoiseBins);
+  std::generate(noise.begin(), noise.end(), float_rand);
+  for (size_t i = 0; i < kNumNoiseBins; ++i) {
+    noise_psd[i] = kGain * kGain * noise[i] * noise[i];
+  }
+  float* clear_cursor = clear_data_.data();
+  for (size_t i = 0; i < kNumFramesToProcess; ++i) {
+    enh_->SetCaptureNoiseEstimate(noise, kGain);
+    clear_buffer_.CopyFrom(&clear_cursor, stream_config_);
+    enh_->ProcessRenderAudio(&clear_buffer_);
+  }
+  const std::vector<float>& estimated_psd =
+      enh_->noise_power_estimator_.power();
+  for (size_t i = 0; i < kNumNoiseBins; ++i) {
+    EXPECT_LT(std::abs(estimated_psd[i] - noise_psd[i]) / noise_psd[i],
+              kTolerance);
+  }
+}
+
+// Processes one 32 kHz frame split into 16 kHz bands and compares every band
+// of the output against the input shifted by the enhancer's initial delay.
+TEST_F(IntelligibilityEnhancerTest, TestAllBandsHaveSameDelay) {
+  const int kTestSampleRate = AudioProcessing::kSampleRate32kHz;
+  const int kTestSplitRate = AudioProcessing::kSampleRate16kHz;
+  const size_t kTestNumBands =
+      rtc::CheckedDivExact(kTestSampleRate, kTestSplitRate);
+  const size_t kTestFragmentSize = rtc::CheckedDivExact(kTestSampleRate, 100);
+  const size_t kTestSplitFragmentSize =
+      rtc::CheckedDivExact(kTestSplitRate, 100);
+  enh_.reset(new IntelligibilityEnhancer(kTestSplitRate, kNumChannels,
+                                         kTestNumBands, kNumNoiseBins));
+  size_t initial_delay = enh_->render_mangler_->initial_delay();
+  std::vector<float> rand_gen_buf(kTestFragmentSize);
+  AudioBuffer original_buffer(kTestFragmentSize, kNumChannels,
+                              kTestFragmentSize, kNumChannels,
+                              kTestFragmentSize);
+  AudioBuffer audio_buffer(kTestFragmentSize, kNumChannels, kTestFragmentSize,
+                           kNumChannels, kTestFragmentSize);
+  for (size_t i = 0u; i < kTestNumBands; ++i) {
+    std::generate(rand_gen_buf.begin(), rand_gen_buf.end(), float_rand);
+    original_buffer.split_data_f()->SetDataForTesting(rand_gen_buf.data(),
+                                                      rand_gen_buf.size());
+    audio_buffer.split_data_f()->SetDataForTesting(rand_gen_buf.data(),
+                                                   rand_gen_buf.size());
+  }
+  enh_->ProcessRenderAudio(&audio_buffer);
+  for (size_t i = 0u; i < kTestNumBands; ++i) {
+    const float* original_ptr = original_buffer.split_bands_const_f(0)[i];
+    const float* audio_ptr = audio_buffer.split_bands_const_f(0)[i];
+    // NOTE(review): if initial_delay >= kTestSplitFragmentSize this loop never
+    // runs and the test checks nothing; confirm the actual delay value.
+    for (size_t j = initial_delay; j < kTestSplitFragmentSize; ++j) {
+      EXPECT_LT(std::fabs(original_ptr[j - initial_delay] - audio_ptr[j]),
+                kMaxTestError);
+    }
+  }
+}
+
+TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Mono8kHz) {
+  const float kOutputReference[] = {-0.001892f, -0.003296f, -0.001953f};
+
+  RunBitexactnessTest(AudioProcessing::kSampleRate8kHz, 1, kOutputReference);
+}
+
+TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Mono16kHz) {
+  const float kOutputReference[] = {-0.000977f, -0.003296f, -0.002441f};
+
+  RunBitexactnessTest(AudioProcessing::kSampleRate16kHz, 1, kOutputReference);
+}
+
+TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Mono32kHz) {
+  const float kOutputReference[] = {0.003021f, -0.011780f, -0.008209f};
+
+  RunBitexactnessTest(AudioProcessing::kSampleRate32kHz, 1, kOutputReference);
+}
+
+TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Mono48kHz) {
+  const float kOutputReference[] = {-0.027696f, -0.026253f, -0.018001f};
+
+  RunBitexactnessTest(AudioProcessing::kSampleRate48kHz, 1, kOutputReference);
+}
+
+TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Stereo8kHz) {
+  const float kOutputReference[] = {0.021454f,  0.035919f, 0.026428f,
+                                    -0.000641f, 0.000366f, 0.000641f};
+
+  RunBitexactnessTest(AudioProcessing::kSampleRate8kHz, 2, kOutputReference);
+}
+
+TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Stereo16kHz) {
+  const float kOutputReference[] = {0.021362f,  0.035736f,  0.023895f,
+                                    -0.001404f, -0.001465f, 0.000549f};
+
+  RunBitexactnessTest(AudioProcessing::kSampleRate16kHz, 2, kOutputReference);
+}
+
+TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Stereo32kHz) {
+  const float kOutputReference[] = {0.030641f,  0.027406f,  0.028321f,
+                                    -0.001343f, -0.004578f, 0.000977f};
+
+  RunBitexactnessTest(AudioProcessing::kSampleRate32kHz, 2, kOutputReference);
+}
+
+TEST(IntelligibilityEnhancerBitExactnessTest, DISABLED_Stereo48kHz) {
+  const float kOutputReference[] = {-0.009276f, -0.001601f, -0.008255f,
+                                    -0.012975f, -0.015940f, -0.017820f};
+
+  RunBitexactnessTest(AudioProcessing::kSampleRate48kHz, 2, kOutputReference);
+}
+
+}  // namespace webrtc
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc
new file mode 100644
index 0000000000..b6917f4407
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/intelligibility/intelligibility_utils.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+#include <limits>
+
+#include "rtc_base/numerics/safe_minmax.h"
+
+namespace webrtc {
+
+namespace intelligibility {
+
+namespace {
+
+const float kMinFactor = 0.01f;
+const float kMaxFactor = 100.f;
+
+// Return |current| changed towards |target|, with the relative change being at
+// most |limit|.
+float UpdateFactor(float target, float current, float limit) {
+  // Epsilon keeps the division finite when |current| is zero.
+  const float gain = target / (current + std::numeric_limits<float>::epsilon());
+  const float clamped_gain = rtc::SafeClamp(gain, 1 - limit, 1 + limit);
+  return rtc::SafeClamp(current * clamped_gain, kMinFactor, kMaxFactor);
+}
+
+}  // namespace
+
+// Starts with |num_freqs| power values, all zero.
+template<typename T>
+PowerEstimator<T>::PowerEstimator(size_t num_freqs, float decay)
+    : power_(num_freqs, 0.f), decay_(decay) {}
+
+// Blends the squared magnitude of each element of |data| into the running
+// power: power = decay * power + (1 - decay) * |data|^2. |data| must hold at
+// least power_.size() elements.
+template<typename T>
+void PowerEstimator<T>::Step(const T* data) {
+  for (size_t i = 0; i < power_.size(); ++i) {
+    // Take the magnitude once instead of twice per element.
+    const float magnitude = std::abs(data[i]);
+    power_[i] = decay_ * power_[i] +
+                (1.f - decay_) * magnitude * magnitude;
+  }
+}
+
+template class PowerEstimator<float>;
+template class PowerEstimator<std::complex<float>>;
+
+// Both the target and the current factors start at 1.
+GainApplier::GainApplier(size_t freqs, float relative_change_limit)
+    : num_freqs_(freqs),
+      relative_change_limit_(relative_change_limit),
+      target_(freqs, 1.f),
+      current_(freqs, 1.f) {}
+
+GainApplier::~GainApplier() {}
+
+// Steps every current factor towards its target, then scales the matching
+// element of |in_block| by the square root of the factor's absolute value.
+void GainApplier::Apply(const std::complex<float>* in_block,
+                        std::complex<float>* out_block) {
+  for (size_t i = 0; i < num_freqs_; ++i) {
+    current_[i] = UpdateFactor(target_[i], current_[i], relative_change_limit_);
+    out_block[i] = sqrtf(fabsf(current_[i])) * in_block[i];
+  }
+}
+
+// Allocates one zero-filled row of |delay| samples per channel.
+DelayBuffer::DelayBuffer(size_t delay, size_t num_channels)
+    : buffer_(num_channels, std::vector<float>(delay, 0.f)), read_index_(0u) {}
+
+DelayBuffer::~DelayBuffer() {}
+
+// Delays |length| samples of every channel in |data| in place: each sample is
+// exchanged with the one stored at the current position of that channel's
+// circular buffer. |data| must provide buffer_.size() channels.
+void DelayBuffer::Delay(float* const* data, size_t length) {
+  // With no channels, or a zero-length delay line, the signal passes through
+  // untouched; indexing an empty row below would be out of bounds.
+  if (buffer_.empty() || buffer_[0].empty()) {
+    return;
+  }
+  size_t sample_index = read_index_;
+  for (size_t i = 0u; i < buffer_.size(); ++i) {
+    // Every channel starts from the same position.
+    sample_index = read_index_;
+    for (size_t j = 0u; j < length; ++j) {
+      float swap = data[i][j];
+      data[i][j] = buffer_[i][sample_index];
+      buffer_[i][sample_index] = swap;
+      // Wrap at the delay-line length (row size), not at the channel count
+      // (outer size).
+      if (++sample_index == buffer_[i].size()) {
+        sample_index = 0u;
+      }
+    }
+  }
+  read_index_ = sample_index;
+}
+
+}  // namespace intelligibility
+
+}  // namespace webrtc
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h
new file mode 100644
index 0000000000..4dc17d50b5
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_UTILS_H_
+#define MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_UTILS_H_
+
+#include <complex>
+#include <vector>
+
+namespace webrtc {
+
+namespace intelligibility {
+
+// Internal helper for computing the power of a stream of arrays.
+// The result is an array of power per position: the i-th power is the power of
+// the stream of data on the i-th positions in the input arrays.
+template <typename T>
+class PowerEstimator {
+ public:
+  // Construct an instance for the given input array length (|freqs|), with the
+  // appropriate parameters. |decay| is the forgetting factor.
+  PowerEstimator(size_t freqs, float decay);
+
+  // Add a new data point to the series.
+  void Step(const T* data);
+
+  // The current power array. Read-only, so it is const-qualified and can be
+  // called on const instances as well.
+  const std::vector<float>& power() const { return power_; }
+
+ private:
+  // The current power array.
+  std::vector<float> power_;
+
+  // Forgetting factor passed to the constructor.
+  const float decay_;
+};
+
+// Helper class for smoothing gain changes. On each application step, the
+// currently used gains are changed towards a set of settable target gains,
+// constrained by a limit on the relative changes.
+class GainApplier {
+ public:
+  GainApplier(size_t freqs, float relative_change_limit);
+
+  ~GainApplier();
+
+  // Copy |in_block| to |out_block|, multiplied by the current set of gains,
+  // and step the current set of gains towards the target set.
+  void Apply(const std::complex<float>* in_block,
+             std::complex<float>* out_block);
+
+  // Return the current target gain set. Modify this array to set the targets.
+  float* target() { return target_.data(); }
+
+ private:
+  // Number of gains, as passed to the constructor.
+  const size_t num_freqs_;
+  // Limit on the relative change of each gain per Apply() call.
+  const float relative_change_limit_;
+  // Gains that Apply() steps towards.
+  std::vector<float> target_;
+  // Gains that Apply() currently uses.
+  std::vector<float> current_;
+};
+
+// Helper class to delay a signal by an integer number of samples.
+class DelayBuffer {
+ public:
+  // |delay| is the delay length in samples; |num_channels| is the number of
+  // channel pointers later passed to Delay().
+  DelayBuffer(size_t delay, size_t num_channels);
+
+  ~DelayBuffer();
+
+  // Processes |length| samples of every channel in |data| in place,
+  // exchanging each one with a sample stored by an earlier call.
+  void Delay(float* const* data, size_t length);
+
+ private:
+  // Stored samples, one inner vector per channel.
+  std::vector<std::vector<float>> buffer_;
+  // Position in the stored samples at which the next Delay() call starts.
+  size_t read_index_;
+};
+
+}  // namespace intelligibility
+
+}  // namespace webrtc
+
+#endif  // MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_UTILS_H_
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc
new file mode 100644
index 0000000000..fea394c338
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <complex>
+#include <vector>
+
+#include "modules/audio_processing/intelligibility/intelligibility_utils.h"
+#include "rtc_base/arraysize.h"
+#include "test/gtest.h"
+
+namespace webrtc {
+
+namespace intelligibility {
+
+// Builds |samples| frames of |freqs| complex values each. Element j of frame
+// i has real and imaginary parts both equal to 0.99 / ((i + 1) * (j + 1)).
+std::vector<std::vector<std::complex<float>>> GenerateTestData(size_t freqs,
+                                                               size_t samples) {
+  std::vector<std::vector<std::complex<float>>> frames(samples);
+  size_t frame_number = 0;
+  for (auto& frame : frames) {
+    ++frame_number;  // One-based index of the frame being filled.
+    for (size_t bin_number = 1; bin_number <= freqs; ++bin_number) {
+      const float val = 0.99f / (frame_number * bin_number);
+      frame.emplace_back(val, val);
+    }
+  }
+  return frames;
+}
+
+// Tests PowerEstimator, for all power step types.
+TEST(IntelligibilityUtilsTest, TestPowerEstimator) {
+  const size_t kFreqs = 10;
+  const size_t kSamples = 100;
+  const float kDecay = 0.5f;
+  const std::vector<std::vector<std::complex<float>>> test_data(
+      GenerateTestData(kFreqs, kSamples));
+  PowerEstimator<std::complex<float>> power_estimator(kFreqs, kDecay);
+  EXPECT_EQ(0, power_estimator.power()[0]);
+
+  // Makes sure Step is doing something.
+  power_estimator.Step(test_data[0].data());
+  for (size_t i = 1; i < kSamples; ++i) {
+    power_estimator.Step(test_data[i].data());
+    for (size_t j = 0; j < kFreqs; ++j) {
+      EXPECT_GE(power_estimator.power()[j], 0.f);
+      EXPECT_LE(power_estimator.power()[j], 1.f);
+    }
+  }
+}
+
+// Tests gain applier.
+TEST(IntelligibilityUtilsTest, TestGainApplier) {
+  const size_t kFreqs = 10;
+  const size_t kSamples = 100;
+  const float kChangeLimit = 0.1f;
+  GainApplier gain_applier(kFreqs, kChangeLimit);
+  const std::vector<std::vector<std::complex<float>>> in_data(
+      GenerateTestData(kFreqs, kSamples));
+  // Only used as correctly sized storage; Apply() overwrites every element.
+  std::vector<std::vector<std::complex<float>>> out_data(
+      GenerateTestData(kFreqs, kSamples));
+  for (size_t i = 0; i < kSamples; ++i) {
+    gain_applier.Apply(in_data[i].data(), out_data[i].data());
+    for (size_t j = 0; j < kFreqs; ++j) {
+      EXPECT_GT(out_data[i][j].real(), 0.f);
+      EXPECT_LT(out_data[i][j].real(), 1.f);
+      EXPECT_GT(out_data[i][j].imag(), 0.f);
+      EXPECT_LT(out_data[i][j].imag(), 1.f);
+    }
+  }
+}
+
+}  // namespace intelligibility
+
+}  // namespace webrtc
diff --git a/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc
new file mode 100644
index 0000000000..b90449caa3
--- /dev/null
+++ b/third_party/libwebrtc/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "common_audio/channel_buffer.h"
+#include "common_audio/include/audio_util.h"
+#include "common_audio/wav_file.h"
+#include "modules/audio_processing/audio_buffer.h"
+#include "modules/audio_processing/intelligibility/intelligibility_enhancer.h"
+#include "modules/audio_processing/noise_suppression_impl.h"
+#include "rtc_base/criticalsection.h"
+#include "rtc_base/flags.h"
+
+using std::complex;
+
+namespace webrtc {
+namespace {
+
+DEFINE_string(clear_file, "speech.wav", "Input file with clear speech.");
+DEFINE_string(noise_file, "noise.wav", "Input file with noise data.");
+DEFINE_string(out_file, "proc_enhanced.wav", "Enhanced output file.");
+DEFINE_bool(help, false, "Print this message.");
+
+// Reads the speech and noise WAV files in 10 ms chunks, feeds the noise
+// through the noise suppressor to obtain a noise estimate, enhances the speech
+// with it and writes the result to the output WAV file. Returns 1 when flag
+// parsing reports an error and 0 otherwise.
+int int_main(int argc, char* argv[]) {
+  if (rtc::FlagList::SetFlagsFromCommandLine(&argc, argv, true)) {
+    return 1;
+  }
+  if (FLAG_help) {
+    rtc::FlagList::Print(nullptr, false);
+    return 0;
+  }
+  if (argc != 1) {
+    printf("\n\nInput files must be little-endian 16-bit signed raw PCM.\n");
+    return 0;
+  }
+
+  WavReader in_file(FLAG_clear_file);
+  WavReader noise_file(FLAG_noise_file);
+  WavWriter out_file(FLAG_out_file, in_file.sample_rate(),
+                     in_file.num_channels());
+  rtc::CriticalSection crit;
+  NoiseSuppressionImpl ns(&crit);
+  IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels(), 1u,
+                              NoiseSuppressionImpl::num_noise_bins());
+  ns.Initialize(noise_file.num_channels(), noise_file.sample_rate());
+  ns.Enable(true);
+  // Samples per channel in one 10 ms chunk. Each stream uses its own file's
+  // sample rate, so the speech chunk no longer depends on the noise file.
+  const size_t in_samples = in_file.sample_rate() / 100;
+  const size_t noise_samples = noise_file.sample_rate() / 100;
+  std::vector<float> in(in_samples * in_file.num_channels());
+  std::vector<float> noise(noise_samples * noise_file.num_channels());
+  ChannelBuffer<float> in_buf(in_samples, in_file.num_channels());
+  ChannelBuffer<float> noise_buf(noise_samples, noise_file.num_channels());
+  AudioBuffer capture_audio(noise_samples, noise_file.num_channels(),
+                            noise_samples, noise_file.num_channels(),
+                            noise_samples);
+  AudioBuffer render_audio(in_samples, in_file.num_channels(), in_samples,
+                           in_file.num_channels(), in_samples);
+  StreamConfig noise_config(noise_file.sample_rate(),
+                            noise_file.num_channels());
+  StreamConfig in_config(in_file.sample_rate(), in_file.num_channels());
+  // Stop as soon as either file cannot supply a complete chunk.
+  while (in_file.ReadSamples(in.size(), in.data()) == in.size() &&
+         noise_file.ReadSamples(noise.size(), noise.data()) == noise.size()) {
+    FloatS16ToFloat(noise.data(), noise.size(), noise.data());
+    FloatS16ToFloat(in.data(), in.size(), in.data());
+    Deinterleave(in.data(), in_buf.num_frames(), in_buf.num_channels(),
+                 in_buf.channels());
+    Deinterleave(noise.data(), noise_buf.num_frames(), noise_buf.num_channels(),
+                 noise_buf.channels());
+    capture_audio.CopyFrom(noise_buf.channels(), noise_config);
+    render_audio.CopyFrom(in_buf.channels(), in_config);
+    ns.AnalyzeCaptureAudio(&capture_audio);
+    ns.ProcessCaptureAudio(&capture_audio);
+    enh.SetCaptureNoiseEstimate(ns.NoiseEstimate(), 1);
+    enh.ProcessRenderAudio(&render_audio);
+    render_audio.CopyTo(in_config, in_buf.channels());
+    Interleave(in_buf.channels(), in_buf.num_frames(), in_buf.num_channels(),
+               in.data());
+    FloatToFloatS16(in.data(), in.size(), in.data());
+    out_file.WriteSamples(in.data(), in.size());
+  }
+
+  return 0;
+}
+
+}  // namespace
+}  // namespace webrtc
+
+int main(int argc, char* argv[]) {
+  return webrtc::int_main(argc, argv);
+}