diff options
Diffstat (limited to 'third_party/libwebrtc/modules/audio_processing/agc2/rnn_vad/spectral_features.cc')
-rw-r--r-- | third_party/libwebrtc/modules/audio_processing/agc2/rnn_vad/spectral_features.cc | 214 |
1 files changed, 214 insertions, 0 deletions
diff --git a/third_party/libwebrtc/modules/audio_processing/agc2/rnn_vad/spectral_features.cc b/third_party/libwebrtc/modules/audio_processing/agc2/rnn_vad/spectral_features.cc new file mode 100644 index 0000000000..96086babb6 --- /dev/null +++ b/third_party/libwebrtc/modules/audio_processing/agc2/rnn_vad/spectral_features.cc @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/agc2/rnn_vad/spectral_features.h" + +#include <algorithm> +#include <cmath> +#include <limits> +#include <numeric> + +#include "rtc_base/checks.h" +#include "rtc_base/numerics/safe_compare.h" + +namespace webrtc { +namespace rnn_vad { +namespace { + +constexpr float kSilenceThreshold = 0.04f; + +// Computes the new cepstral difference stats and pushes them into the passed +// symmetric matrix buffer. +void UpdateCepstralDifferenceStats( + rtc::ArrayView<const float, kNumBands> new_cepstral_coeffs, + const RingBuffer<float, kNumBands, kCepstralCoeffsHistorySize>& ring_buf, + SymmetricMatrixBuffer<float, kCepstralCoeffsHistorySize>* sym_matrix_buf) { + RTC_DCHECK(sym_matrix_buf); + // Compute the new cepstral distance stats. + std::array<float, kCepstralCoeffsHistorySize - 1> distances; + for (int i = 0; i < kCepstralCoeffsHistorySize - 1; ++i) { + const int delay = i + 1; + auto old_cepstral_coeffs = ring_buf.GetArrayView(delay); + distances[i] = 0.f; + for (int k = 0; k < kNumBands; ++k) { + const float c = new_cepstral_coeffs[k] - old_cepstral_coeffs[k]; + distances[i] += c * c; + } + } + // Push the new spectral distance stats into the symmetric matrix buffer. + sym_matrix_buf->Push(distances); +} + +// Computes the first half of the Vorbis window. +std::array<float, kFrameSize20ms24kHz / 2> ComputeScaledHalfVorbisWindow( + float scaling = 1.f) { + constexpr int kHalfSize = kFrameSize20ms24kHz / 2; + std::array<float, kHalfSize> half_window{}; + for (int i = 0; i < kHalfSize; ++i) { + half_window[i] = + scaling * + std::sin(0.5 * kPi * std::sin(0.5 * kPi * (i + 0.5) / kHalfSize) * + std::sin(0.5 * kPi * (i + 0.5) / kHalfSize)); + } + return half_window; +} + +// Computes the forward FFT on a 20 ms frame to which a given window function is +// applied. The Fourier coefficient corresponding to the Nyquist frequency is +// set to zero (it is never used and this allows to simplify the code). +void ComputeWindowedForwardFft( + rtc::ArrayView<const float, kFrameSize20ms24kHz> frame, + const std::array<float, kFrameSize20ms24kHz / 2>& half_window, + Pffft::FloatBuffer* fft_input_buffer, + Pffft::FloatBuffer* fft_output_buffer, + Pffft* fft) { + RTC_DCHECK_EQ(frame.size(), 2 * half_window.size()); + // Apply windowing. + auto in = fft_input_buffer->GetView(); + for (int i = 0, j = kFrameSize20ms24kHz - 1; + rtc::SafeLt(i, half_window.size()); ++i, --j) { + in[i] = frame[i] * half_window[i]; + in[j] = frame[j] * half_window[i]; + } + fft->ForwardTransform(*fft_input_buffer, fft_output_buffer, /*ordered=*/true); + // Set the Nyquist frequency coefficient to zero. + auto out = fft_output_buffer->GetView(); + out[1] = 0.f; +} + +} // namespace + +SpectralFeaturesExtractor::SpectralFeaturesExtractor() + : half_window_(ComputeScaledHalfVorbisWindow( + 1.f / static_cast<float>(kFrameSize20ms24kHz))), + fft_(kFrameSize20ms24kHz, Pffft::FftType::kReal), + fft_buffer_(fft_.CreateBuffer()), + reference_frame_fft_(fft_.CreateBuffer()), + lagged_frame_fft_(fft_.CreateBuffer()), + dct_table_(ComputeDctTable()) {} + +SpectralFeaturesExtractor::~SpectralFeaturesExtractor() = default; + +void SpectralFeaturesExtractor::Reset() { + cepstral_coeffs_ring_buf_.Reset(); + cepstral_diffs_buf_.Reset(); +} + +bool SpectralFeaturesExtractor::CheckSilenceComputeFeatures( + rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame, + rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame, + rtc::ArrayView<float, kNumBands - kNumLowerBands> higher_bands_cepstrum, + rtc::ArrayView<float, kNumLowerBands> average, + rtc::ArrayView<float, kNumLowerBands> first_derivative, + rtc::ArrayView<float, kNumLowerBands> second_derivative, + rtc::ArrayView<float, kNumLowerBands> bands_cross_corr, + float* variability) { + // Compute the Opus band energies for the reference frame. + ComputeWindowedForwardFft(reference_frame, half_window_, fft_buffer_.get(), + reference_frame_fft_.get(), &fft_); + spectral_correlator_.ComputeAutoCorrelation( + reference_frame_fft_->GetConstView(), reference_frame_bands_energy_); + // Check if the reference frame has silence. + const float tot_energy = + std::accumulate(reference_frame_bands_energy_.begin(), + reference_frame_bands_energy_.end(), 0.f); + if (tot_energy < kSilenceThreshold) { + return true; + } + // Compute the Opus band energies for the lagged frame. + ComputeWindowedForwardFft(lagged_frame, half_window_, fft_buffer_.get(), + lagged_frame_fft_.get(), &fft_); + spectral_correlator_.ComputeAutoCorrelation(lagged_frame_fft_->GetConstView(), + lagged_frame_bands_energy_); + // Log of the band energies for the reference frame. + std::array<float, kNumBands> log_bands_energy; + ComputeSmoothedLogMagnitudeSpectrum(reference_frame_bands_energy_, + log_bands_energy); + // Reference frame cepstrum. + std::array<float, kNumBands> cepstrum; + ComputeDct(log_bands_energy, dct_table_, cepstrum); + // Ad-hoc correction terms for the first two cepstral coefficients. + cepstrum[0] -= 12.f; + cepstrum[1] -= 4.f; + // Update the ring buffer and the cepstral difference stats. + cepstral_coeffs_ring_buf_.Push(cepstrum); + UpdateCepstralDifferenceStats(cepstrum, cepstral_coeffs_ring_buf_, + &cepstral_diffs_buf_); + // Write the higher bands cepstral coefficients. + RTC_DCHECK_EQ(cepstrum.size() - kNumLowerBands, higher_bands_cepstrum.size()); + std::copy(cepstrum.begin() + kNumLowerBands, cepstrum.end(), + higher_bands_cepstrum.begin()); + // Compute and write remaining features. + ComputeAvgAndDerivatives(average, first_derivative, second_derivative); + ComputeNormalizedCepstralCorrelation(bands_cross_corr); + RTC_DCHECK(variability); + *variability = ComputeVariability(); + return false; +} + +void SpectralFeaturesExtractor::ComputeAvgAndDerivatives( + rtc::ArrayView<float, kNumLowerBands> average, + rtc::ArrayView<float, kNumLowerBands> first_derivative, + rtc::ArrayView<float, kNumLowerBands> second_derivative) const { + auto curr = cepstral_coeffs_ring_buf_.GetArrayView(0); + auto prev1 = cepstral_coeffs_ring_buf_.GetArrayView(1); + auto prev2 = cepstral_coeffs_ring_buf_.GetArrayView(2); + RTC_DCHECK_EQ(average.size(), first_derivative.size()); + RTC_DCHECK_EQ(first_derivative.size(), second_derivative.size()); + RTC_DCHECK_LE(average.size(), curr.size()); + for (int i = 0; rtc::SafeLt(i, average.size()); ++i) { + // Average, kernel: [1, 1, 1]. + average[i] = curr[i] + prev1[i] + prev2[i]; + // First derivative, kernel: [1, 0, - 1]. + first_derivative[i] = curr[i] - prev2[i]; + // Second derivative, Laplacian kernel: [1, -2, 1]. + second_derivative[i] = curr[i] - 2 * prev1[i] + prev2[i]; + } +} + +void SpectralFeaturesExtractor::ComputeNormalizedCepstralCorrelation( + rtc::ArrayView<float, kNumLowerBands> bands_cross_corr) { + spectral_correlator_.ComputeCrossCorrelation( + reference_frame_fft_->GetConstView(), lagged_frame_fft_->GetConstView(), + bands_cross_corr_); + // Normalize. + for (int i = 0; rtc::SafeLt(i, bands_cross_corr_.size()); ++i) { + bands_cross_corr_[i] = + bands_cross_corr_[i] / + std::sqrt(0.001f + reference_frame_bands_energy_[i] * + lagged_frame_bands_energy_[i]); + } + // Cepstrum. + ComputeDct(bands_cross_corr_, dct_table_, bands_cross_corr); + // Ad-hoc correction terms for the first two cepstral coefficients. + bands_cross_corr[0] -= 1.3f; + bands_cross_corr[1] -= 0.9f; +} + +float SpectralFeaturesExtractor::ComputeVariability() const { + // Compute cepstral variability score. + float variability = 0.f; + for (int delay1 = 0; delay1 < kCepstralCoeffsHistorySize; ++delay1) { + float min_dist = std::numeric_limits<float>::max(); + for (int delay2 = 0; delay2 < kCepstralCoeffsHistorySize; ++delay2) { + if (delay1 == delay2) // The distance would be 0. + continue; + min_dist = + std::min(min_dist, cepstral_diffs_buf_.GetValue(delay1, delay2)); + } + variability += min_dist; + } + // Normalize (based on training set stats). + // TODO(bugs.webrtc.org/10480): Isolate normalization from feature extraction. + return variability / kCepstralCoeffsHistorySize - 2.1f; +} + +} // namespace rnn_vad +} // namespace webrtc |