summaryrefslogtreecommitdiffstats
path: root/third_party/libwebrtc/modules/audio_processing/agc2/speech_level_estimator.cc
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
commit 26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree f435a8308119effd964b339f76abb83a57c29483 /third_party/libwebrtc/modules/audio_processing/agc2/speech_level_estimator.cc
parent Initial commit. (diff)
downloadfirefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/libwebrtc/modules/audio_processing/agc2/speech_level_estimator.cc')
-rw-r--r--third_party/libwebrtc/modules/audio_processing/agc2/speech_level_estimator.cc174
1 files changed, 174 insertions, 0 deletions
diff --git a/third_party/libwebrtc/modules/audio_processing/agc2/speech_level_estimator.cc b/third_party/libwebrtc/modules/audio_processing/agc2/speech_level_estimator.cc
new file mode 100644
index 0000000000..7bf3252116
--- /dev/null
+++ b/third_party/libwebrtc/modules/audio_processing/agc2/speech_level_estimator.cc
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/speech_level_estimator.h"
+
+#include "modules/audio_processing/agc2/agc2_common.h"
+#include "modules/audio_processing/logging/apm_data_dumper.h"
+#include "rtc_base/checks.h"
+#include "rtc_base/logging.h"
+#include "rtc_base/numerics/safe_minmax.h"
+
+namespace webrtc {
+namespace {
+
+float ClampLevelEstimateDbfs(float level_estimate_dbfs) {  // Bounds a level estimate given in dBFS.
+ return rtc::SafeClamp<float>(level_estimate_dbfs, -90.0f, 30.0f);  // Lower bound -90, upper bound 30.
+}
+
+// Derives the starting speech level estimate from `config`: the negated sum of
+// the saturation protector initial headroom, the configured initial gain and
+// the configured headroom, clamped by `ClampLevelEstimateDbfs()`.
+float GetInitialSpeechLevelEstimateDbfs(
+    const AudioProcessing::Config::GainController2::AdaptiveDigital& config) {
+  const float unclamped_level_dbfs = -kSaturationProtectorInitialHeadroomDb -
+                                     config.initial_gain_db -
+                                     config.headroom_db;
+  return ClampLevelEstimateDbfs(unclamped_level_dbfs);
+}
+
+} // namespace
+
+// Two states are equal when their time-to-confidence countdowns and both terms
+// of their level ratios compare equal.
+bool SpeechLevelEstimator::LevelEstimatorState::operator==(
+    const SpeechLevelEstimator::LevelEstimatorState& b) const {
+  if (time_to_confidence_ms != b.time_to_confidence_ms) {
+    return false;
+  }
+  return level_dbfs.numerator == b.level_dbfs.numerator &&
+         level_dbfs.denominator == b.level_dbfs.denominator;
+}
+
+float SpeechLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const {
+ RTC_DCHECK_NE(denominator, 0.f);  // A zero denominator is treated as a programming error.
+ return numerator / denominator;  // The value this numerator/denominator pair represents.
+}
+
+SpeechLevelEstimator::SpeechLevelEstimator(
+ ApmDataDumper* apm_data_dumper,  // Expected to be non-null (RTC_DCHECK below).
+ const AudioProcessing::Config::GainController2::AdaptiveDigital& config,
+ int adjacent_speech_frames_threshold)  // Expected to be >= 1 (RTC_DCHECK below).
+ : apm_data_dumper_(apm_data_dumper),
+ initial_speech_level_dbfs_(GetInitialSpeechLevelEstimateDbfs(config)),  // Clamped value derived from `config`.
+ adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
+ level_dbfs_(initial_speech_level_dbfs_),  // The estimate starts at the initial level.
+ // TODO(bugs.webrtc.org/7494): Remove the init below once the AGC2 input
+ // volume controller temporal dependency is removed.
+ is_confident_(false) {
+ RTC_DCHECK(apm_data_dumper_);
+ RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
+ Reset();  // Initializes both estimator states, the level estimate and the speech frame counter.
+}
+
+// Feeds one frame into the estimator.
+//
+// `rms_dbfs` is the level that gets averaged; `peak_dbfs` is only range-checked
+// here; `speech_probability` must lie in [0, 1] and decides whether the frame
+// is treated as speech (not below `kVadConfidenceThreshold`) or not.
+//
+// Speech frames are accumulated into `preliminary_state_`. The first
+// non-speech frame after a run of speech frames either promotes that state to
+// `reliable_state_` (run long enough) or rolls it back (run too short).
+void SpeechLevelEstimator::Update(float rms_dbfs,
+                                  float peak_dbfs,
+                                  float speech_probability) {
+  RTC_DCHECK_GT(rms_dbfs, -150.0f);
+  RTC_DCHECK_LT(rms_dbfs, 50.0f);
+  RTC_DCHECK_GT(peak_dbfs, -150.0f);
+  RTC_DCHECK_LT(peak_dbfs, 50.0f);
+  RTC_DCHECK_GE(speech_probability, 0.0f);
+  RTC_DCHECK_LE(speech_probability, 1.0f);
+
+  // A frame counts as speech unless its probability is below the threshold.
+  const bool is_speech_frame = !(speech_probability < kVadConfidenceThreshold);
+
+  if (is_speech_frame) {
+    ++num_adjacent_speech_frames_;
+
+    // Advance the countdown towards confidence, unless it already reached
+    // zero.
+    RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
+    const bool countdown_done = preliminary_state_.time_to_confidence_ms == 0;
+    if (!countdown_done) {
+      preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
+    }
+
+    // Running weighted average of the RMS level with the speech probability
+    // as weight. Once the countdown is over, the leak factor is applied to
+    // the sums accumulated so far.
+    RTC_DCHECK_GT(speech_probability, 0.0f);
+    const float decay = countdown_done ? kLevelEstimatorLeakFactor : 1.0f;
+    auto& average = preliminary_state_.level_dbfs;
+    average.numerator =
+        average.numerator * decay + rms_dbfs * speech_probability;
+    average.denominator = average.denominator * decay + speech_probability;
+    const float estimate_dbfs = average.GetRatio();
+
+    // Publish the new estimate only when the run of speech frames is long
+    // enough for `preliminary_state_` to be trusted.
+    if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
+      level_dbfs_ = ClampLevelEstimateDbfs(estimate_dbfs);
+    }
+  } else {
+    // Non-speech frame: the current run of speech frames (if any) ends here.
+    // With a threshold of one frame there is nothing to confirm or discard.
+    const bool needs_confirmation = adjacent_speech_frames_threshold_ > 1;
+    if (needs_confirmation) {
+      const bool run_long_enough =
+          num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_;
+      if (run_long_enough) {
+        // The run was long enough: keep what was accumulated.
+        reliable_state_ = preliminary_state_;
+      } else if (num_adjacent_speech_frames_ > 0) {
+        // The run was too short: go back to the last reliable state.
+        preliminary_state_ = reliable_state_;
+      }
+    }
+    num_adjacent_speech_frames_ = 0;
+  }
+
+  UpdateIsConfident();
+  DumpDebugData();
+}
+
+// Recomputes `is_confident_` from the time-to-confidence countdowns of the
+// preliminary and reliable states.
+void SpeechLevelEstimator::UpdateIsConfident() {
+  const bool preliminary_done = preliminary_state_.time_to_confidence_ms == 0;
+  if (adjacent_speech_frames_threshold_ == 1) {
+    // A single frame is enough to update the level estimate, so
+    // `reliable_state_` plays no role and the preliminary countdown decides.
+    is_confident_ = preliminary_done;
+    return;
+  }
+
+  // Once confident, it remains confident: a finished reliable countdown
+  // implies a finished preliminary countdown.
+  RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
+             preliminary_state_.time_to_confidence_ms == 0);
+
+  const bool reliable_done = reliable_state_.time_to_confidence_ms == 0;
+  const bool run_long_enough =
+      num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_;
+  // During the first long enough run of speech frames `reliable_state_` has
+  // not caught up yet, so the preliminary countdown is consulted instead.
+  is_confident_ = reliable_done || (run_long_enough && preliminary_done);
+}
+
+// Restores the level estimate, the speech frame counter and both estimator
+// states to their initial values. `is_confident_` is not modified here.
+void SpeechLevelEstimator::Reset() {
+  level_dbfs_ = initial_speech_level_dbfs_;
+  num_adjacent_speech_frames_ = 0;
+  ResetLevelEstimatorState(reliable_state_);
+  ResetLevelEstimatorState(preliminary_state_);
+}
+
+// Re-initializes `state`: the level ratio becomes
+// `initial_speech_level_dbfs_` / 1 and the countdown restarts from
+// `kLevelEstimatorTimeToConfidenceMs`.
+void SpeechLevelEstimator::ResetLevelEstimatorState(
+    LevelEstimatorState& state) const {
+  auto& ratio = state.level_dbfs;
+  ratio.denominator = 1.0f;
+  ratio.numerator = initial_speech_level_dbfs_;
+  state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
+}
+
+// Hands the current estimator state to the data dumper. Does nothing when no
+// dumper is set.
+void SpeechLevelEstimator::DumpDebugData() const {
+  if (apm_data_dumper_ == nullptr) {
+    return;
+  }
+  auto& dumper = *apm_data_dumper_;
+
+  // Published outputs.
+  dumper.DumpRaw("agc2_speech_level_dbfs", level_dbfs_);
+  dumper.DumpRaw("agc2_speech_level_is_confident", is_confident_);
+
+  // Internal state.
+  dumper.DumpRaw("agc2_adaptive_level_estimator_num_adjacent_speech_frames",
+                 num_adjacent_speech_frames_);
+  dumper.DumpRaw("agc2_adaptive_level_estimator_preliminary_level_estimate_num",
+                 preliminary_state_.level_dbfs.numerator);
+  dumper.DumpRaw("agc2_adaptive_level_estimator_preliminary_level_estimate_den",
+                 preliminary_state_.level_dbfs.denominator);
+  dumper.DumpRaw(
+      "agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
+      preliminary_state_.time_to_confidence_ms);
+  dumper.DumpRaw("agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
+                 reliable_state_.time_to_confidence_ms);
+}
+
+} // namespace webrtc