third_party/libwebrtc/modules/audio_processing/agc2/saturation_protector.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183

/*
 *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/agc2/saturation_protector.h"

#include <memory>

#include "modules/audio_processing/agc2/agc2_common.h"
#include "modules/audio_processing/agc2/saturation_protector_buffer.h"
#include "modules/audio_processing/logging/apm_data_dumper.h"
#include "rtc_base/checks.h"
#include "rtc_base/numerics/safe_minmax.h"

namespace webrtc {
namespace {

constexpr int kPeakEnveloperSuperFrameLengthMs = 400;
constexpr float kMinMarginDb = 12.0f;
constexpr float kMaxMarginDb = 25.0f;
constexpr float kAttack = 0.9988493699365052f;
constexpr float kDecay = 0.9997697679981565f;

// Saturation protector state. Defined outside of `SaturationProtectorImpl` to
// implement check-point and restore ops.
struct SaturationProtectorState {
  bool operator==(const SaturationProtectorState& s) const {
    return headroom_db == s.headroom_db &&
           peak_delay_buffer == s.peak_delay_buffer &&
           max_peaks_dbfs == s.max_peaks_dbfs &&
           time_since_push_ms == s.time_since_push_ms;
  }
  inline bool operator!=(const SaturationProtectorState& s) const {
    return !(*this == s);
  }

  float headroom_db;
  SaturationProtectorBuffer peak_delay_buffer;
  float max_peaks_dbfs;
  int time_since_push_ms;  // Time since the last ring buffer push operation.
};

// Resets the saturation protector state.
void ResetSaturationProtectorState(float initial_headroom_db,
                                   SaturationProtectorState& state) {
  state.headroom_db = initial_headroom_db;
  state.peak_delay_buffer.Reset();
  state.max_peaks_dbfs = kMinLevelDbfs;
  state.time_since_push_ms = 0;
}

// Updates `state` by analyzing the estimated speech level `speech_level_dbfs`
// and the peak level `peak_dbfs` for an observed frame. `state` must not be
// modified without calling this function.
void UpdateSaturationProtectorState(float peak_dbfs,
                                    float speech_level_dbfs,
                                    SaturationProtectorState& state) {
  // Get the max peak over `kPeakEnveloperSuperFrameLengthMs` ms.
  state.max_peaks_dbfs = std::max(state.max_peaks_dbfs, peak_dbfs);
  state.time_since_push_ms += kFrameDurationMs;
  if (rtc::SafeGt(state.time_since_push_ms, kPeakEnveloperSuperFrameLengthMs)) {
    // Push `max_peaks_dbfs` back into the ring buffer.
    state.peak_delay_buffer.PushBack(state.max_peaks_dbfs);
    // Reset.
    state.max_peaks_dbfs = kMinLevelDbfs;
    state.time_since_push_ms = 0;
  }

  // Update the headroom by comparing the estimated speech level and the delayed
  // max speech peak.
  const float delayed_peak_dbfs =
      state.peak_delay_buffer.Front().value_or(state.max_peaks_dbfs);
  const float difference_db = delayed_peak_dbfs - speech_level_dbfs;
  if (difference_db > state.headroom_db) {
    // Attack.
    state.headroom_db =
        state.headroom_db * kAttack + difference_db * (1.0f - kAttack);
  } else {
    // Decay.
    state.headroom_db =
        state.headroom_db * kDecay + difference_db * (1.0f - kDecay);
  }

  state.headroom_db =
      rtc::SafeClamp<float>(state.headroom_db, kMinMarginDb, kMaxMarginDb);
}

// Saturation protector which recommends a headroom based on the recent peaks.
class SaturationProtectorImpl : public SaturationProtector {
 public:
  explicit SaturationProtectorImpl(float initial_headroom_db,
                                   int adjacent_speech_frames_threshold,
                                   ApmDataDumper* apm_data_dumper)
      : apm_data_dumper_(apm_data_dumper),
        initial_headroom_db_(initial_headroom_db),
        adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold) {
    Reset();
  }
  SaturationProtectorImpl(const SaturationProtectorImpl&) = delete;
  SaturationProtectorImpl& operator=(const SaturationProtectorImpl&) = delete;
  ~SaturationProtectorImpl() = default;

  float HeadroomDb() override { return headroom_db_; }

  void Analyze(float speech_probability,
               float peak_dbfs,
               float speech_level_dbfs) override {
    if (speech_probability < kVadConfidenceThreshold) {
      // Not a speech frame.
      if (adjacent_speech_frames_threshold_ > 1) {
        // When two or more adjacent speech frames are required in order to
        // update the state, we need to decide whether to discard or confirm the
        // updates based on the speech sequence length.
        if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
          // First non-speech frame after a long enough sequence of speech
          // frames. Update the reliable state.
          reliable_state_ = preliminary_state_;
        } else if (num_adjacent_speech_frames_ > 0) {
          // First non-speech frame after a too short sequence of speech frames.
          // Reset to the last reliable state.
          preliminary_state_ = reliable_state_;
        }
      }
      num_adjacent_speech_frames_ = 0;
    } else {
      // Speech frame observed.
      num_adjacent_speech_frames_++;

      // Update preliminary level estimate.
      UpdateSaturationProtectorState(peak_dbfs, speech_level_dbfs,
                                     preliminary_state_);

      if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
        // `preliminary_state_` is now reliable. Update the headroom.
        headroom_db_ = preliminary_state_.headroom_db;
      }
    }
    DumpDebugData();
  }

  void Reset() override {
    num_adjacent_speech_frames_ = 0;
    headroom_db_ = initial_headroom_db_;
    ResetSaturationProtectorState(initial_headroom_db_, preliminary_state_);
    ResetSaturationProtectorState(initial_headroom_db_, reliable_state_);
  }

 private:
  void DumpDebugData() {
    apm_data_dumper_->DumpRaw(
        "agc2_saturation_protector_preliminary_max_peak_dbfs",
        preliminary_state_.max_peaks_dbfs);
    apm_data_dumper_->DumpRaw(
        "agc2_saturation_protector_reliable_max_peak_dbfs",
        reliable_state_.max_peaks_dbfs);
  }

  ApmDataDumper* const apm_data_dumper_;
  const float initial_headroom_db_;
  const int adjacent_speech_frames_threshold_;
  int num_adjacent_speech_frames_;
  float headroom_db_;
  SaturationProtectorState preliminary_state_;
  SaturationProtectorState reliable_state_;
};

}  // namespace

std::unique_ptr<SaturationProtector> CreateSaturationProtector(
    float initial_headroom_db,
    int adjacent_speech_frames_threshold,
    ApmDataDumper* apm_data_dumper) {
  return std::make_unique<SaturationProtectorImpl>(
      initial_headroom_db, adjacent_speech_frames_threshold, apm_data_dumper);
}

}  // namespace webrtc