diff options
Diffstat (limited to 'third_party/libwebrtc/common_audio/vad/vad_filterbank.c')
-rw-r--r-- | third_party/libwebrtc/common_audio/vad/vad_filterbank.c | 329 |
1 files changed, 329 insertions, 0 deletions
diff --git a/third_party/libwebrtc/common_audio/vad/vad_filterbank.c b/third_party/libwebrtc/common_audio/vad/vad_filterbank.c new file mode 100644 index 0000000000..aff63f79cd --- /dev/null +++ b/third_party/libwebrtc/common_audio/vad/vad_filterbank.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "common_audio/vad/vad_filterbank.h" + +#include "rtc_base/checks.h" +#include "common_audio/signal_processing/include/signal_processing_library.h" + +// Constants used in LogOfEnergy(). +static const int16_t kLogConst = 24660; // 160*log10(2) in Q9. +static const int16_t kLogEnergyIntPart = 14336; // 14 in Q10 + +// Coefficients used by HighPassFilter, Q14. +static const int16_t kHpZeroCoefs[3] = { 6631, -13262, 6631 }; +static const int16_t kHpPoleCoefs[3] = { 16384, -7756, 5620 }; + +// Allpass filter coefficients, upper and lower, in Q15. +// Upper: 0.64, Lower: 0.17 +static const int16_t kAllPassCoefsQ15[2] = { 20972, 5571 }; + +// Adjustment for division with two in SplitFilter. +static const int16_t kOffsetVector[6] = { 368, 368, 272, 176, 176, 176 }; + +// High pass filtering, with a cut-off frequency at 80 Hz, if the `data_in` is +// sampled at 500 Hz. +// +// - data_in [i] : Input audio data sampled at 500 Hz. +// - data_length [i] : Length of input and output data. +// - filter_state [i/o] : State of the filter. +// - data_out [o] : Output audio data in the frequency interval +// 80 - 250 Hz. +static void HighPassFilter(const int16_t* data_in, size_t data_length, + int16_t* filter_state, int16_t* data_out) { + size_t i; + const int16_t* in_ptr = data_in; + int16_t* out_ptr = data_out; + int32_t tmp32 = 0; + + + // The sum of the absolute values of the impulse response: + // The zero/pole-filter has a max amplification of a single sample of: 1.4546 + // Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194 + // The all-zero section has a max amplification of a single sample of: 1.6189 + // Impulse response: 0.4047 -0.8094 0.4047 0 0 0 + // The all-pole section has a max amplification of a single sample of: 1.9931 + // Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532 + + for (i = 0; i < data_length; i++) { + // All-zero section (filter coefficients in Q14). + tmp32 = kHpZeroCoefs[0] * *in_ptr; + tmp32 += kHpZeroCoefs[1] * filter_state[0]; + tmp32 += kHpZeroCoefs[2] * filter_state[1]; + filter_state[1] = filter_state[0]; + filter_state[0] = *in_ptr++; + + // All-pole section (filter coefficients in Q14). + tmp32 -= kHpPoleCoefs[1] * filter_state[2]; + tmp32 -= kHpPoleCoefs[2] * filter_state[3]; + filter_state[3] = filter_state[2]; + filter_state[2] = (int16_t) (tmp32 >> 14); + *out_ptr++ = filter_state[2]; + } +} + +// All pass filtering of `data_in`, used before splitting the signal into two +// frequency bands (low pass vs high pass). +// Note that `data_in` and `data_out` can NOT correspond to the same address. +// +// - data_in [i] : Input audio signal given in Q0. +// - data_length [i] : Length of input and output data. +// - filter_coefficient [i] : Given in Q15. +// - filter_state [i/o] : State of the filter given in Q(-1). +// - data_out [o] : Output audio signal given in Q(-1). +static void AllPassFilter(const int16_t* data_in, size_t data_length, + int16_t filter_coefficient, int16_t* filter_state, + int16_t* data_out) { + // The filter can only cause overflow (in the w16 output variable) + // if more than 4 consecutive input numbers are of maximum value and + // has the the same sign as the impulse responses first taps. + // First 6 taps of the impulse response: + // 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990 + + size_t i; + int16_t tmp16 = 0; + int32_t tmp32 = 0; + int32_t state32 = ((int32_t) (*filter_state) * (1 << 16)); // Q15 + + for (i = 0; i < data_length; i++) { + tmp32 = state32 + filter_coefficient * *data_in; + tmp16 = (int16_t) (tmp32 >> 16); // Q(-1) + *data_out++ = tmp16; + state32 = (*data_in * (1 << 14)) - filter_coefficient * tmp16; // Q14 + state32 *= 2; // Q15. + data_in += 2; + } + + *filter_state = (int16_t) (state32 >> 16); // Q(-1) +} + +// Splits `data_in` into `hp_data_out` and `lp_data_out` corresponding to +// an upper (high pass) part and a lower (low pass) part respectively. +// +// - data_in [i] : Input audio data to be split into two frequency bands. +// - data_length [i] : Length of `data_in`. +// - upper_state [i/o] : State of the upper filter, given in Q(-1). +// - lower_state [i/o] : State of the lower filter, given in Q(-1). +// - hp_data_out [o] : Output audio data of the upper half of the spectrum. +// The length is `data_length` / 2. +// - lp_data_out [o] : Output audio data of the lower half of the spectrum. +// The length is `data_length` / 2. +static void SplitFilter(const int16_t* data_in, size_t data_length, + int16_t* upper_state, int16_t* lower_state, + int16_t* hp_data_out, int16_t* lp_data_out) { + size_t i; + size_t half_length = data_length >> 1; // Downsampling by 2. + int16_t tmp_out; + + // All-pass filtering upper branch. + AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state, + hp_data_out); + + // All-pass filtering lower branch. + AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state, + lp_data_out); + + // Make LP and HP signals. + for (i = 0; i < half_length; i++) { + tmp_out = *hp_data_out; + *hp_data_out++ -= *lp_data_out; + *lp_data_out++ += tmp_out; + } +} + +// Calculates the energy of `data_in` in dB, and also updates an overall +// `total_energy` if necessary. +// +// - data_in [i] : Input audio data for energy calculation. +// - data_length [i] : Length of input data. +// - offset [i] : Offset value added to `log_energy`. +// - total_energy [i/o] : An external energy updated with the energy of +// `data_in`. +// NOTE: `total_energy` is only updated if +// `total_energy` <= `kMinEnergy`. +// - log_energy [o] : 10 * log10("energy of `data_in`") given in Q4. +static void LogOfEnergy(const int16_t* data_in, size_t data_length, + int16_t offset, int16_t* total_energy, + int16_t* log_energy) { + // `tot_rshifts` accumulates the number of right shifts performed on `energy`. + int tot_rshifts = 0; + // The `energy` will be normalized to 15 bits. We use unsigned integer because + // we eventually will mask out the fractional part. + uint32_t energy = 0; + + RTC_DCHECK(data_in); + RTC_DCHECK_GT(data_length, 0); + + energy = (uint32_t) WebRtcSpl_Energy((int16_t*) data_in, data_length, + &tot_rshifts); + + if (energy != 0) { + // By construction, normalizing to 15 bits is equivalent with 17 leading + // zeros of an unsigned 32 bit value. + int normalizing_rshifts = 17 - WebRtcSpl_NormU32(energy); + // In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is + // (14 << 10), which is what we initialize `log2_energy` with. For a more + // detailed derivations, see below. + int16_t log2_energy = kLogEnergyIntPart; + + tot_rshifts += normalizing_rshifts; + // Normalize `energy` to 15 bits. + // `tot_rshifts` is now the total number of right shifts performed on + // `energy` after normalization. This means that `energy` is in + // Q(-tot_rshifts). + if (normalizing_rshifts < 0) { + energy <<= -normalizing_rshifts; + } else { + energy >>= normalizing_rshifts; + } + + // Calculate the energy of `data_in` in dB, in Q4. + // + // 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") = + // 160 * log10(`energy` * 2^`tot_rshifts`) = + // 160 * log10(2) * log2(`energy` * 2^`tot_rshifts`) = + // 160 * log10(2) * (log2(`energy`) + log2(2^`tot_rshifts`)) = + // (160 * log10(2)) * (log2(`energy`) + `tot_rshifts`) = + // `kLogConst` * (`log2_energy` + `tot_rshifts`) + // + // We know by construction that `energy` is normalized to 15 bits. Hence, + // `energy` = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15. + // Further, we'd like `log2_energy` in Q10 + // log2(`energy`) in Q10 = 2^10 * log2(2^14 + frac_Q15) = + // 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) = + // 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~= + // (14 << 10) + 2^10 * (frac_Q15 * 2^-14) = + // (14 << 10) + (frac_Q15 * 2^-4) = (14 << 10) + (frac_Q15 >> 4) + // + // Note that frac_Q15 = (`energy` & 0x00003FFF) + + // Calculate and add the fractional part to `log2_energy`. + log2_energy += (int16_t) ((energy & 0x00003FFF) >> 4); + + // `kLogConst` is in Q9, `log2_energy` in Q10 and `tot_rshifts` in Q0. + // Note that we in our derivation above have accounted for an output in Q4. + *log_energy = (int16_t)(((kLogConst * log2_energy) >> 19) + + ((tot_rshifts * kLogConst) >> 9)); + + if (*log_energy < 0) { + *log_energy = 0; + } + } else { + *log_energy = offset; + return; + } + + *log_energy += offset; + + // Update the approximate `total_energy` with the energy of `data_in`, if + // `total_energy` has not exceeded `kMinEnergy`. `total_energy` is used as an + // energy indicator in WebRtcVad_GmmProbability() in vad_core.c. + if (*total_energy <= kMinEnergy) { + if (tot_rshifts >= 0) { + // We know by construction that the `energy` > `kMinEnergy` in Q0, so add + // an arbitrary value such that `total_energy` exceeds `kMinEnergy`. + *total_energy += kMinEnergy + 1; + } else { + // By construction `energy` is represented by 15 bits, hence any number of + // right shifted `energy` will fit in an int16_t. In addition, adding the + // value to `total_energy` is wrap around safe as long as + // `kMinEnergy` < 8192. + *total_energy += (int16_t) (energy >> -tot_rshifts); // Q0. + } + } +} + +int16_t WebRtcVad_CalculateFeatures(VadInstT* self, const int16_t* data_in, + size_t data_length, int16_t* features) { + int16_t total_energy = 0; + // We expect `data_length` to be 80, 160 or 240 samples, which corresponds to + // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will + // have at most 120 samples after the first split and at most 60 samples after + // the second split. + int16_t hp_120[120], lp_120[120]; + int16_t hp_60[60], lp_60[60]; + const size_t half_data_length = data_length >> 1; + size_t length = half_data_length; // `data_length` / 2, corresponds to + // bandwidth = 2000 Hz after downsampling. + + // Initialize variables for the first SplitFilter(). + int frequency_band = 0; + const int16_t* in_ptr = data_in; // [0 - 4000] Hz. + int16_t* hp_out_ptr = hp_120; // [2000 - 4000] Hz. + int16_t* lp_out_ptr = lp_120; // [0 - 2000] Hz. + + RTC_DCHECK_LE(data_length, 240); + RTC_DCHECK_LT(4, kNumChannels - 1); // Checking maximum `frequency_band`. + + // Split at 2000 Hz and downsample. + SplitFilter(in_ptr, data_length, &self->upper_state[frequency_band], + &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); + + // For the upper band (2000 Hz - 4000 Hz) split at 3000 Hz and downsample. + frequency_band = 1; + in_ptr = hp_120; // [2000 - 4000] Hz. + hp_out_ptr = hp_60; // [3000 - 4000] Hz. + lp_out_ptr = lp_60; // [2000 - 3000] Hz. + SplitFilter(in_ptr, length, &self->upper_state[frequency_band], + &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); + + // Energy in 3000 Hz - 4000 Hz. + length >>= 1; // `data_length` / 4 <=> bandwidth = 1000 Hz. + + LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]); + + // Energy in 2000 Hz - 3000 Hz. + LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]); + + // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample. + frequency_band = 2; + in_ptr = lp_120; // [0 - 2000] Hz. + hp_out_ptr = hp_60; // [1000 - 2000] Hz. + lp_out_ptr = lp_60; // [0 - 1000] Hz. + length = half_data_length; // `data_length` / 2 <=> bandwidth = 2000 Hz. + SplitFilter(in_ptr, length, &self->upper_state[frequency_band], + &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); + + // Energy in 1000 Hz - 2000 Hz. + length >>= 1; // `data_length` / 4 <=> bandwidth = 1000 Hz. + LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]); + + // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample. + frequency_band = 3; + in_ptr = lp_60; // [0 - 1000] Hz. + hp_out_ptr = hp_120; // [500 - 1000] Hz. + lp_out_ptr = lp_120; // [0 - 500] Hz. + SplitFilter(in_ptr, length, &self->upper_state[frequency_band], + &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); + + // Energy in 500 Hz - 1000 Hz. + length >>= 1; // `data_length` / 8 <=> bandwidth = 500 Hz. + LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]); + + // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample. + frequency_band = 4; + in_ptr = lp_120; // [0 - 500] Hz. + hp_out_ptr = hp_60; // [250 - 500] Hz. + lp_out_ptr = lp_60; // [0 - 250] Hz. + SplitFilter(in_ptr, length, &self->upper_state[frequency_band], + &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr); + + // Energy in 250 Hz - 500 Hz. + length >>= 1; // `data_length` / 16 <=> bandwidth = 250 Hz. + LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]); + + // Remove 0 Hz - 80 Hz, by high pass filtering the lower band. + HighPassFilter(lp_60, length, self->hp_filter_state, hp_120); + + // Energy in 80 Hz - 250 Hz. + LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]); + + return total_energy; +} |