/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/vad/vad_audio_proc.h"

#include <math.h>
#include <stdio.h>
#include <string.h>

#include "common_audio/third_party/ooura/fft_size_256/fft4g.h"
#include "modules/audio_processing/vad/pitch_internal.h"
#include "modules/audio_processing/vad/pole_zero_filter.h"
#include "modules/audio_processing/vad/vad_audio_proc_internal.h"
#include "rtc_base/checks.h"

extern "C" {
#include "modules/audio_coding/codecs/isac/main/source/filter_functions.h"
#include "modules/audio_coding/codecs/isac/main/source/isac_vad.h"
#include "modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
#include "modules/audio_coding/codecs/isac/main/source/structs.h"
}

namespace webrtc {

// The following structures are declared anonymous in iSAC's structs.h. To
// forward declare them, we use this derived class trick.
struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};

// Spacing between adjacent DFT bins: the sample rate divided by the DFT size.
static constexpr float kFrequencyResolution =
    kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);

// A 10 ms sub-frame whose RMS is below this value is reported as silence by
// ExtractFeatures().
static constexpr int kSilenceRms = 5;

// Sets up the sample buffer bookkeeping, the high-pass filter, the FFT work
// arrays and the iSAC pre-filterbank / pitch-analysis state.
//
// TODO(turajs): Make a Create or Init for VadAudioProc.
VadAudioProc::VadAudioProc()
    : audio_buffer_(),
      num_buffer_samples_(kNumPastSignalSamples),
      log_old_gain_(-2),
      old_lag_(50),  // Arbitrary but valid as pitch-lag (in samples).
      pitch_analysis_handle_(new PitchAnalysisStruct),
      pre_filter_handle_(new PreFiltBankstr),
      high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
                                               kFilterOrder,
                                               kCoeffDenominator,
                                               kFilterOrder)) {
  static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
                    sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
                "lpc analysis window incorrect size");
  static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
                "correlation weight incorrect size");

  // TODO(turajs): Are we doing too much in the constructor?
  // Make FFT to initialize: one transform is run with ip_[0] == 0 so that the
  // FFT work arrays (ip_, w_fft_) get set up; the transform output itself is
  // discarded. The input is zero-filled so that the routine never reads
  // indeterminate values.
  float data[kDftSize] = {};
  ip_[0] = 0;
  WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
  // TODO(turajs): Need to initialize high-pass filter.

  // Initialize iSAC components.
  WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
  WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
}

VadAudioProc::~VadAudioProc() = default;

// Moves the `kNumPastSignalSamples` samples that start at index
// `kNumSamplesToProcess` to the front of `audio_buffer_` and marks the buffer
// as holding only those samples, ready for the next batch of sub-frames.
void VadAudioProc::ResetBuffer() {
  memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
         sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
  num_buffer_samples_ = kNumPastSignalSamples;
}

// Feeds one sub-frame of audio into the internal buffer and, once the buffer
// holds `kBufferLength` samples, fills `features`.
//
// `frame`    - input samples; exactly `kNumSubframeSamples` are consumed.
// `length`   - number of samples in `frame`; must equal `kNumSubframeSamples`.
// `features` - output. `num_frames` is 0 while the buffer is still filling and
//              `kNum10msSubframes` once features were computed. If any
//              sub-frame is classified as silence, `silence` is set and only
//              `rms` is populated.
//
// Returns 0 on success and -1 if `length` is wrong or the high-pass filter
// reports an error.
int VadAudioProc::ExtractFeatures(const int16_t* frame,
                                  size_t length,
                                  AudioFeatures* features) {
  features->num_frames = 0;
  if (length != kNumSubframeSamples) {
    return -1;
  }

  // High-pass filter to remove the DC component and very low frequency content.
  // We have experienced that this high-pass filtering improves voice/non-voiced
  // classification.
  if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
                                &audio_buffer_[num_buffer_samples_]) != 0) {
    return -1;
  }

  num_buffer_samples_ += kNumSubframeSamples;
  if (num_buffer_samples_ < kBufferLength) {
    // Not enough audio buffered yet; report zero frames.
    return 0;
  }
  RTC_DCHECK_EQ(num_buffer_samples_, kBufferLength);
  features->num_frames = kNum10msSubframes;
  features->silence = false;

  Rms(features->rms, kMaxNumFrames);
  for (size_t i = 0; i < kNum10msSubframes; ++i) {
    if (features->rms[i] < kSilenceRms) {
      // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
      // Bail out here instead.
      features->silence = true;
      ResetBuffer();
      return 0;
    }
  }

  PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
                kMaxNumFrames);
  FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
  ResetBuffer();
  return 0;
}

// Computes `kLpcOrder + 1` correlation coefficients of the windowed audio that
// starts at sub-frame `subframe_index` of `audio_buffer_`.
//
// `corr`        - output array with room for at least `kLpcOrder + 1` values.
// `length_corr` - capacity of `corr`.
void VadAudioProc::SubframeCorrelation(double* corr,
                                       size_t length_corr,
                                       size_t subframe_index) {
  RTC_DCHECK_GE(length_corr, kLpcOrder + 1);
  constexpr size_t kWindowLength = kNumSubframeSamples + kNumPastSignalSamples;
  double windowed_audio[kWindowLength];
  const size_t first_sample = subframe_index * kNumSubframeSamples;

  // Apply the LPC analysis window to the samples of interest.
  for (size_t n = 0; n < kWindowLength; n++) {
    windowed_audio[n] = audio_buffer_[first_sample + n] * kLpcAnalWin[n];
  }

  WebRtcIsac_AutoCorr(corr, windowed_audio, kWindowLength, kLpcOrder);
}

// Compute `kNum10msSubframes` sets of LPC coefficients, one per 10 ms input.
// The analysis window is 15 ms long and it is centered on the first half of
// each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
// first half of each 10 ms subframe.
//
// `lpc`        - output; `kLpcOrder + 1` coefficients per sub-frame, stored
//                back to back.
// `length_lpc` - capacity of `lpc`; at least
//                `kNum10msSubframes * (kLpcOrder + 1)`.
void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
  RTC_DCHECK_GE(length_lpc, kNum10msSubframes * (kLpcOrder + 1));
  double corr[kLpcOrder + 1];
  double reflec_coeff[kLpcOrder];
  for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
       i++, offset_lpc += kLpcOrder + 1) {
    SubframeCorrelation(corr, kLpcOrder + 1, i);
    corr[0] *= 1.0001;
    // This makes Lev-Durb a bit more stable.
    for (size_t k = 0; k < kLpcOrder + 1; k++) {
      corr[k] *= kCorrWeight[k];
    }
    WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
  }
}

// Fit a second order curve to these 3 points and find the location of the
// extremum. The points are inverted before curve fitting.
//
// Returns the offset of the extremum relative to the middle point, in units
// of the (uniform) spacing between the points.
static float QuadraticInterpolation(float prev_val,
                                    float curr_val,
                                    float next_val) {
  // Doing the interpolation in |1 / A(z)|^2.
  const float inv_next = 1.0f / next_val;
  const float inv_prev = 1.0f / prev_val;
  const float inv_curr = 1.0f / curr_val;

  const float fractional_index =
      -(inv_next - inv_prev) * 0.5f / (inv_next + inv_prev - 2.f * inv_curr);
  RTC_DCHECK_LT(fabs(fractional_index), 1);
  return fractional_index;
}

// 1 / A(z), where A(z) is defined by `lpc` is a model of the spectral envelope
// of the input signal. The local maximum of the spectral envelope corresponds
// with the local minimum of A(z). It saves complexity, as we save one
// inversion. Furthermore, we find the first local maximum of magnitude squared,
// to save on one square root.
//
// `f_peak`        - output; one peak location per 10 ms sub-frame, expressed
//                   as bin position times `kFrequencyResolution`.
// `length_f_peak` - capacity of `f_peak`; at least `kNum10msSubframes`.
void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
                                          size_t length_f_peak) {
  RTC_DCHECK_GE(length_f_peak, kNum10msSubframes);
  double lpc[kNum10msSubframes * (kLpcOrder + 1)];
  // For all sub-frames.
  GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));

  const size_t kNumDftCoefficients = kDftSize / 2 + 1;
  float data[kDftSize];

  for (size_t i = 0; i < kNum10msSubframes; i++) {
    // Convert to float with zero pad.
    memset(data, 0, sizeof(data));
    for (size_t n = 0; n < kLpcOrder + 1; n++) {
      data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
    }
    // Transform to frequency domain.
    WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);

    size_t index_peak = 0;
    float prev_magn_sqr = data[0] * data[0];
    float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
    float next_magn_sqr;
    bool found_peak = false;
    // Slide a three-bin window over the squared magnitudes and stop at the
    // first bin that is strictly smaller than both of its neighbours.
    for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
      next_magn_sqr =
          data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
        found_peak = true;
        index_peak = n - 1;
        break;
      }
      prev_magn_sqr = curr_magn_sqr;
      curr_magn_sqr = next_magn_sqr;
    }

    float fractional_index = 0;
    if (!found_peak) {
      // Checking if |kNumDftCoefficients - 1| is the local minimum.
      // NOTE(review): after the loop `curr_magn_sqr` holds bin
      // |kNumDftCoefficients - 2| and data[1] is presumably the last
      // (Nyquist) bin, so the test below looks at bin
      // |kNumDftCoefficients - 2| while reporting |kNumDftCoefficients - 1|.
      // Behaviour is intentionally left untouched - confirm the intent.
      next_magn_sqr = data[1] * data[1];
      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
        index_peak = kNumDftCoefficients - 1;
      }
    } else {
      // A peak is found, do a simple quadratic interpolation to get a more
      // accurate estimate of the peak location.
      fractional_index =
          QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
    }
    f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
  }
}

// Using iSAC functions to estimate pitch gains & lags.
//
// `log_pitch_gains` - output; one value per 10 ms sub-frame.
// `pitch_lags_hz`   - output; one value per 10 ms sub-frame.
// `length`          - capacity of both output arrays; at least
//                     `kNum10msSubframes`.
void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
                                 double* pitch_lags_hz,
                                 size_t length) {
  // TODO(turajs): This and the next two constants can be "imported" from
  // iSAC.
  RTC_DCHECK_GE(length, kNum10msSubframes);
  const int kNumPitchSubframes = 4;
  double gains[kNumPitchSubframes];
  double lags[kNumPitchSubframes];

  const int kNumSubbandFrameSamples = 240;
  const int kNumLookaheadSamples = 24;

  float lower[kNumSubbandFrameSamples];
  float upper[kNumSubbandFrameSamples];
  double lower_lookahead[kNumSubbandFrameSamples];
  double upper_lookahead[kNumSubbandFrameSamples];
  double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
                                    kNumLookaheadSamples];

  // Split signal to lower and upper bands
  WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
                                 upper, lower_lookahead, upper_lookahead,
                                 pre_filter_handle_.get());
  WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
                           pitch_analysis_handle_.get(), lags, gains);

  // Lags are computed on lower-band signal with sampling rate half of the
  // input signal.
  GetSubframesPitchParameters(
      kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
      &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
}

// Computes the root-mean-square of each 10 ms sub-frame in `audio_buffer_`,
// starting right after the `kNumPastSignalSamples` leading samples.
//
// `rms`        - output; one value per 10 ms sub-frame.
// `length_rms` - capacity of `rms`; at least `kNum10msSubframes`.
void VadAudioProc::Rms(double* rms, size_t length_rms) {
  RTC_DCHECK_GE(length_rms, kNum10msSubframes);
  size_t offset = kNumPastSignalSamples;
  for (size_t i = 0; i < kNum10msSubframes; i++) {
    double sum_squares = 0;
    for (size_t n = 0; n < kNumSubframeSamples; n++, offset++) {
      sum_squares += audio_buffer_[offset] * audio_buffer_[offset];
    }
    rms[i] = sqrt(sum_squares / kNumSubframeSamples);
  }
}

}  // namespace webrtc