diff options
author:    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
commit:    36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree:      105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/libwebrtc/modules/audio_coding/codecs/cng
parent:    Initial commit. (diff)
download:  firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
           firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
Adding upstream version 115.7.0esr. (refs: upstream/115.7.0esr)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/libwebrtc/modules/audio_coding/codecs/cng')
6 files changed, 1678 insertions, 0 deletions
diff --git a/third_party/libwebrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc b/third_party/libwebrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc new file mode 100644 index 0000000000..7546ac178f --- /dev/null +++ b/third_party/libwebrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_coding/codecs/cng/audio_encoder_cng.h" + +#include <cstdint> +#include <memory> +#include <utility> + +#include "absl/types/optional.h" +#include "api/units/time_delta.h" +#include "modules/audio_coding/codecs/cng/webrtc_cng.h" +#include "rtc_base/checks.h" + +namespace webrtc { + +namespace { + +const int kMaxFrameSizeMs = 60; + +class AudioEncoderCng final : public AudioEncoder { + public: + explicit AudioEncoderCng(AudioEncoderCngConfig&& config); + ~AudioEncoderCng() override; + + // Not copyable or moveable. 
+ AudioEncoderCng(const AudioEncoderCng&) = delete; + AudioEncoderCng(AudioEncoderCng&&) = delete; + AudioEncoderCng& operator=(const AudioEncoderCng&) = delete; + AudioEncoderCng& operator=(AudioEncoderCng&&) = delete; + + int SampleRateHz() const override; + size_t NumChannels() const override; + int RtpTimestampRateHz() const override; + size_t Num10MsFramesInNextPacket() const override; + size_t Max10MsFramesInAPacket() const override; + int GetTargetBitrate() const override; + EncodedInfo EncodeImpl(uint32_t rtp_timestamp, + rtc::ArrayView<const int16_t> audio, + rtc::Buffer* encoded) override; + void Reset() override; + bool SetFec(bool enable) override; + bool SetDtx(bool enable) override; + bool SetApplication(Application application) override; + void SetMaxPlaybackRate(int frequency_hz) override; + rtc::ArrayView<std::unique_ptr<AudioEncoder>> ReclaimContainedEncoders() + override; + void OnReceivedUplinkPacketLossFraction( + float uplink_packet_loss_fraction) override; + void OnReceivedUplinkBandwidth( + int target_audio_bitrate_bps, + absl::optional<int64_t> bwe_period_ms) override; + absl::optional<std::pair<TimeDelta, TimeDelta>> GetFrameLengthRange() + const override; + + private: + EncodedInfo EncodePassive(size_t frames_to_encode, rtc::Buffer* encoded); + EncodedInfo EncodeActive(size_t frames_to_encode, rtc::Buffer* encoded); + size_t SamplesPer10msFrame() const; + + std::unique_ptr<AudioEncoder> speech_encoder_; + const int cng_payload_type_; + const int num_cng_coefficients_; + const int sid_frame_interval_ms_; + std::vector<int16_t> speech_buffer_; + std::vector<uint32_t> rtp_timestamps_; + bool last_frame_active_; + std::unique_ptr<Vad> vad_; + std::unique_ptr<ComfortNoiseEncoder> cng_encoder_; +}; + +AudioEncoderCng::AudioEncoderCng(AudioEncoderCngConfig&& config) + : speech_encoder_((static_cast<void>([&] { + RTC_CHECK(config.IsOk()) << "Invalid configuration."; + }()), + std::move(config.speech_encoder))), + 
cng_payload_type_(config.payload_type), + num_cng_coefficients_(config.num_cng_coefficients), + sid_frame_interval_ms_(config.sid_frame_interval_ms), + last_frame_active_(true), + vad_(config.vad ? std::unique_ptr<Vad>(config.vad) + : CreateVad(config.vad_mode)), + cng_encoder_(new ComfortNoiseEncoder(SampleRateHz(), + sid_frame_interval_ms_, + num_cng_coefficients_)) {} + +AudioEncoderCng::~AudioEncoderCng() = default; + +int AudioEncoderCng::SampleRateHz() const { + return speech_encoder_->SampleRateHz(); +} + +size_t AudioEncoderCng::NumChannels() const { + return 1; +} + +int AudioEncoderCng::RtpTimestampRateHz() const { + return speech_encoder_->RtpTimestampRateHz(); +} + +size_t AudioEncoderCng::Num10MsFramesInNextPacket() const { + return speech_encoder_->Num10MsFramesInNextPacket(); +} + +size_t AudioEncoderCng::Max10MsFramesInAPacket() const { + return speech_encoder_->Max10MsFramesInAPacket(); +} + +int AudioEncoderCng::GetTargetBitrate() const { + return speech_encoder_->GetTargetBitrate(); +} + +AudioEncoder::EncodedInfo AudioEncoderCng::EncodeImpl( + uint32_t rtp_timestamp, + rtc::ArrayView<const int16_t> audio, + rtc::Buffer* encoded) { + const size_t samples_per_10ms_frame = SamplesPer10msFrame(); + RTC_CHECK_EQ(speech_buffer_.size(), + rtp_timestamps_.size() * samples_per_10ms_frame); + rtp_timestamps_.push_back(rtp_timestamp); + RTC_DCHECK_EQ(samples_per_10ms_frame, audio.size()); + speech_buffer_.insert(speech_buffer_.end(), audio.cbegin(), audio.cend()); + const size_t frames_to_encode = speech_encoder_->Num10MsFramesInNextPacket(); + if (rtp_timestamps_.size() < frames_to_encode) { + return EncodedInfo(); + } + RTC_CHECK_LE(frames_to_encode * 10, kMaxFrameSizeMs) + << "Frame size cannot be larger than " << kMaxFrameSizeMs + << " ms when using VAD/CNG."; + + // Group several 10 ms blocks per VAD call. 
Call VAD once or twice using the + // following split sizes: + // 10 ms = 10 + 0 ms; 20 ms = 20 + 0 ms; 30 ms = 30 + 0 ms; + // 40 ms = 20 + 20 ms; 50 ms = 30 + 20 ms; 60 ms = 30 + 30 ms. + size_t blocks_in_first_vad_call = + (frames_to_encode > 3 ? 3 : frames_to_encode); + if (frames_to_encode == 4) + blocks_in_first_vad_call = 2; + RTC_CHECK_GE(frames_to_encode, blocks_in_first_vad_call); + const size_t blocks_in_second_vad_call = + frames_to_encode - blocks_in_first_vad_call; + + // Check if all of the buffer is passive speech. Start with checking the first + // block. + Vad::Activity activity = vad_->VoiceActivity( + &speech_buffer_[0], samples_per_10ms_frame * blocks_in_first_vad_call, + SampleRateHz()); + if (activity == Vad::kPassive && blocks_in_second_vad_call > 0) { + // Only check the second block if the first was passive. + activity = vad_->VoiceActivity( + &speech_buffer_[samples_per_10ms_frame * blocks_in_first_vad_call], + samples_per_10ms_frame * blocks_in_second_vad_call, SampleRateHz()); + } + + EncodedInfo info; + switch (activity) { + case Vad::kPassive: { + info = EncodePassive(frames_to_encode, encoded); + last_frame_active_ = false; + break; + } + case Vad::kActive: { + info = EncodeActive(frames_to_encode, encoded); + last_frame_active_ = true; + break; + } + default: { + RTC_CHECK_NOTREACHED(); + } + } + + speech_buffer_.erase( + speech_buffer_.begin(), + speech_buffer_.begin() + frames_to_encode * samples_per_10ms_frame); + rtp_timestamps_.erase(rtp_timestamps_.begin(), + rtp_timestamps_.begin() + frames_to_encode); + return info; +} + +void AudioEncoderCng::Reset() { + speech_encoder_->Reset(); + speech_buffer_.clear(); + rtp_timestamps_.clear(); + last_frame_active_ = true; + vad_->Reset(); + cng_encoder_.reset(new ComfortNoiseEncoder( + SampleRateHz(), sid_frame_interval_ms_, num_cng_coefficients_)); +} + +bool AudioEncoderCng::SetFec(bool enable) { + return speech_encoder_->SetFec(enable); +} + +bool AudioEncoderCng::SetDtx(bool 
enable) { + return speech_encoder_->SetDtx(enable); +} + +bool AudioEncoderCng::SetApplication(Application application) { + return speech_encoder_->SetApplication(application); +} + +void AudioEncoderCng::SetMaxPlaybackRate(int frequency_hz) { + speech_encoder_->SetMaxPlaybackRate(frequency_hz); +} + +rtc::ArrayView<std::unique_ptr<AudioEncoder>> +AudioEncoderCng::ReclaimContainedEncoders() { + return rtc::ArrayView<std::unique_ptr<AudioEncoder>>(&speech_encoder_, 1); +} + +void AudioEncoderCng::OnReceivedUplinkPacketLossFraction( + float uplink_packet_loss_fraction) { + speech_encoder_->OnReceivedUplinkPacketLossFraction( + uplink_packet_loss_fraction); +} + +void AudioEncoderCng::OnReceivedUplinkBandwidth( + int target_audio_bitrate_bps, + absl::optional<int64_t> bwe_period_ms) { + speech_encoder_->OnReceivedUplinkBandwidth(target_audio_bitrate_bps, + bwe_period_ms); +} + +absl::optional<std::pair<TimeDelta, TimeDelta>> +AudioEncoderCng::GetFrameLengthRange() const { + return speech_encoder_->GetFrameLengthRange(); +} + +AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive( + size_t frames_to_encode, + rtc::Buffer* encoded) { + bool force_sid = last_frame_active_; + bool output_produced = false; + const size_t samples_per_10ms_frame = SamplesPer10msFrame(); + AudioEncoder::EncodedInfo info; + + for (size_t i = 0; i < frames_to_encode; ++i) { + // It's important not to pass &info.encoded_bytes directly to + // WebRtcCng_Encode(), since later loop iterations may return zero in + // that value, in which case we don't want to overwrite any value from + // an earlier iteration. 
+ size_t encoded_bytes_tmp = + cng_encoder_->Encode(rtc::ArrayView<const int16_t>( + &speech_buffer_[i * samples_per_10ms_frame], + samples_per_10ms_frame), + force_sid, encoded); + + if (encoded_bytes_tmp > 0) { + RTC_CHECK(!output_produced); + info.encoded_bytes = encoded_bytes_tmp; + output_produced = true; + force_sid = false; + } + } + + info.encoded_timestamp = rtp_timestamps_.front(); + info.payload_type = cng_payload_type_; + info.send_even_if_empty = true; + info.speech = false; + return info; +} + +AudioEncoder::EncodedInfo AudioEncoderCng::EncodeActive(size_t frames_to_encode, + rtc::Buffer* encoded) { + const size_t samples_per_10ms_frame = SamplesPer10msFrame(); + AudioEncoder::EncodedInfo info; + for (size_t i = 0; i < frames_to_encode; ++i) { + info = + speech_encoder_->Encode(rtp_timestamps_.front(), + rtc::ArrayView<const int16_t>( + &speech_buffer_[i * samples_per_10ms_frame], + samples_per_10ms_frame), + encoded); + if (i + 1 == frames_to_encode) { + RTC_CHECK_GT(info.encoded_bytes, 0) << "Encoder didn't deliver data."; + } else { + RTC_CHECK_EQ(info.encoded_bytes, 0) + << "Encoder delivered data too early."; + } + } + return info; +} + +size_t AudioEncoderCng::SamplesPer10msFrame() const { + return rtc::CheckedDivExact(10 * SampleRateHz(), 1000); +} + +} // namespace + +AudioEncoderCngConfig::AudioEncoderCngConfig() = default; +AudioEncoderCngConfig::AudioEncoderCngConfig(AudioEncoderCngConfig&&) = default; +AudioEncoderCngConfig::~AudioEncoderCngConfig() = default; + +bool AudioEncoderCngConfig::IsOk() const { + if (num_channels != 1) + return false; + if (!speech_encoder) + return false; + if (num_channels != speech_encoder->NumChannels()) + return false; + if (sid_frame_interval_ms < + static_cast<int>(speech_encoder->Max10MsFramesInAPacket() * 10)) + return false; + if (num_cng_coefficients > WEBRTC_CNG_MAX_LPC_ORDER || + num_cng_coefficients <= 0) + return false; + return true; +} + +std::unique_ptr<AudioEncoder> CreateComfortNoiseEncoder( 
+ AudioEncoderCngConfig&& config) { + return std::make_unique<AudioEncoderCng>(std::move(config)); +} + +} // namespace webrtc diff --git a/third_party/libwebrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.h b/third_party/libwebrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.h new file mode 100644 index 0000000000..8a1183489f --- /dev/null +++ b/third_party/libwebrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_AUDIO_CODING_CODECS_CNG_AUDIO_ENCODER_CNG_H_ +#define MODULES_AUDIO_CODING_CODECS_CNG_AUDIO_ENCODER_CNG_H_ + +#include <stddef.h> + +#include <memory> + +#include "api/audio_codecs/audio_encoder.h" +#include "common_audio/vad/include/vad.h" + +namespace webrtc { + +struct AudioEncoderCngConfig { + // Moveable, not copyable. + AudioEncoderCngConfig(); + AudioEncoderCngConfig(AudioEncoderCngConfig&&); + ~AudioEncoderCngConfig(); + + bool IsOk() const; + + size_t num_channels = 1; + int payload_type = 13; + std::unique_ptr<AudioEncoder> speech_encoder; + Vad::Aggressiveness vad_mode = Vad::kVadNormal; + int sid_frame_interval_ms = 100; + int num_cng_coefficients = 8; + // The Vad pointer is mainly for testing. If a NULL pointer is passed, the + // AudioEncoderCng creates (and destroys) a Vad object internally. If an + // object is passed, the AudioEncoderCng assumes ownership of the Vad + // object. 
+ Vad* vad = nullptr; +}; + +std::unique_ptr<AudioEncoder> CreateComfortNoiseEncoder( + AudioEncoderCngConfig&& config); + +} // namespace webrtc + +#endif // MODULES_AUDIO_CODING_CODECS_CNG_AUDIO_ENCODER_CNG_H_ diff --git a/third_party/libwebrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc b/third_party/libwebrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc new file mode 100644 index 0000000000..c688004363 --- /dev/null +++ b/third_party/libwebrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc @@ -0,0 +1,520 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_coding/codecs/cng/audio_encoder_cng.h" + +#include <memory> +#include <vector> + +#include "common_audio/vad/mock/mock_vad.h" +#include "rtc_base/numerics/safe_conversions.h" +#include "test/gtest.h" +#include "test/mock_audio_encoder.h" +#include "test/testsupport/rtc_expect_death.h" + +using ::testing::_; +using ::testing::Eq; +using ::testing::InSequence; +using ::testing::Invoke; +using ::testing::Not; +using ::testing::Optional; +using ::testing::Return; +using ::testing::SetArgPointee; + +namespace webrtc { + +namespace { +static const size_t kMaxNumSamples = 48 * 10 * 2; // 10 ms @ 48 kHz stereo. 
+static const size_t kMockReturnEncodedBytes = 17; +static const int kCngPayloadType = 18; +} // namespace + +class AudioEncoderCngTest : public ::testing::Test { + protected: + AudioEncoderCngTest() + : mock_encoder_owner_(new MockAudioEncoder), + mock_encoder_(mock_encoder_owner_.get()), + mock_vad_(new MockVad), + timestamp_(4711), + num_audio_samples_10ms_(0), + sample_rate_hz_(8000) { + memset(audio_, 0, kMaxNumSamples * 2); + EXPECT_CALL(*mock_encoder_, NumChannels()).WillRepeatedly(Return(1)); + } + + AudioEncoderCngTest(const AudioEncoderCngTest&) = delete; + AudioEncoderCngTest& operator=(const AudioEncoderCngTest&) = delete; + + void TearDown() override { + EXPECT_CALL(*mock_vad_, Die()).Times(1); + cng_.reset(); + } + + AudioEncoderCngConfig MakeCngConfig() { + AudioEncoderCngConfig config; + config.speech_encoder = std::move(mock_encoder_owner_); + EXPECT_TRUE(config.speech_encoder); + + // Let the AudioEncoderCng object use a MockVad instead of its internally + // created Vad object. + config.vad = mock_vad_; + config.payload_type = kCngPayloadType; + + return config; + } + + void CreateCng(AudioEncoderCngConfig&& config) { + num_audio_samples_10ms_ = static_cast<size_t>(10 * sample_rate_hz_ / 1000); + ASSERT_LE(num_audio_samples_10ms_, kMaxNumSamples); + if (config.speech_encoder) { + EXPECT_CALL(*mock_encoder_, SampleRateHz()) + .WillRepeatedly(Return(sample_rate_hz_)); + // Max10MsFramesInAPacket() is just used to verify that the SID frame + // period is not too small. The return value does not matter that much, + // as long as it is smaller than 10. 
+ EXPECT_CALL(*mock_encoder_, Max10MsFramesInAPacket()) + .WillOnce(Return(1u)); + } + cng_ = CreateComfortNoiseEncoder(std::move(config)); + } + + void Encode() { + ASSERT_TRUE(cng_) << "Must call CreateCng() first."; + encoded_info_ = cng_->Encode( + timestamp_, + rtc::ArrayView<const int16_t>(audio_, num_audio_samples_10ms_), + &encoded_); + timestamp_ += static_cast<uint32_t>(num_audio_samples_10ms_); + } + + // Expect `num_calls` calls to the encoder, all successful. The last call + // claims to have encoded `kMockReturnEncodedBytes` bytes, and all the + // preceding ones 0 bytes. + void ExpectEncodeCalls(size_t num_calls) { + InSequence s; + AudioEncoder::EncodedInfo info; + for (size_t j = 0; j < num_calls - 1; ++j) { + EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)).WillOnce(Return(info)); + } + info.encoded_bytes = kMockReturnEncodedBytes; + EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)) + .WillOnce( + Invoke(MockAudioEncoder::FakeEncoding(kMockReturnEncodedBytes))); + } + + // Verifies that the cng_ object waits until it has collected + // `blocks_per_frame` blocks of audio, and then dispatches all of them to + // the underlying codec (speech or cng). + void CheckBlockGrouping(size_t blocks_per_frame, bool active_speech) { + EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) + .WillRepeatedly(Return(blocks_per_frame)); + auto config = MakeCngConfig(); + const int num_cng_coefficients = config.num_cng_coefficients; + CreateCng(std::move(config)); + EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) + .WillRepeatedly(Return(active_speech ? Vad::kActive : Vad::kPassive)); + + // Don't expect any calls to the encoder yet. 
+ EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)).Times(0); + for (size_t i = 0; i < blocks_per_frame - 1; ++i) { + Encode(); + EXPECT_EQ(0u, encoded_info_.encoded_bytes); + } + if (active_speech) + ExpectEncodeCalls(blocks_per_frame); + Encode(); + if (active_speech) { + EXPECT_EQ(kMockReturnEncodedBytes, encoded_info_.encoded_bytes); + } else { + EXPECT_EQ(static_cast<size_t>(num_cng_coefficients + 1), + encoded_info_.encoded_bytes); + } + } + + // Verifies that the audio is partitioned into larger blocks before calling + // the VAD. + void CheckVadInputSize(int input_frame_size_ms, + int expected_first_block_size_ms, + int expected_second_block_size_ms) { + const size_t blocks_per_frame = + static_cast<size_t>(input_frame_size_ms / 10); + + EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) + .WillRepeatedly(Return(blocks_per_frame)); + + // Expect nothing to happen before the last block is sent to cng_. + EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)).Times(0); + for (size_t i = 0; i < blocks_per_frame - 1; ++i) { + Encode(); + } + + // Let the VAD decision be passive, since an active decision may lead to + // early termination of the decision loop. + InSequence s; + EXPECT_CALL( + *mock_vad_, + VoiceActivity(_, expected_first_block_size_ms * sample_rate_hz_ / 1000, + sample_rate_hz_)) + .WillOnce(Return(Vad::kPassive)); + if (expected_second_block_size_ms > 0) { + EXPECT_CALL(*mock_vad_, + VoiceActivity( + _, expected_second_block_size_ms * sample_rate_hz_ / 1000, + sample_rate_hz_)) + .WillOnce(Return(Vad::kPassive)); + } + + // With this call to Encode(), `mock_vad_` should be called according to the + // above expectations. + Encode(); + } + + // Tests a frame with both active and passive speech. Returns true if the + // decision was active speech, false if it was passive. 
+ bool CheckMixedActivePassive(Vad::Activity first_type, + Vad::Activity second_type) { + // Set the speech encoder frame size to 60 ms, to ensure that the VAD will + // be called twice. + const size_t blocks_per_frame = 6; + EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) + .WillRepeatedly(Return(blocks_per_frame)); + InSequence s; + EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) + .WillOnce(Return(first_type)); + if (first_type == Vad::kPassive) { + // Expect a second call to the VAD only if the first frame was passive. + EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) + .WillOnce(Return(second_type)); + } + encoded_info_.payload_type = 0; + for (size_t i = 0; i < blocks_per_frame; ++i) { + Encode(); + } + return encoded_info_.payload_type != kCngPayloadType; + } + + std::unique_ptr<AudioEncoder> cng_; + std::unique_ptr<MockAudioEncoder> mock_encoder_owner_; + MockAudioEncoder* mock_encoder_; + MockVad* mock_vad_; // Ownership is transferred to `cng_`. + uint32_t timestamp_; + int16_t audio_[kMaxNumSamples]; + size_t num_audio_samples_10ms_; + rtc::Buffer encoded_; + AudioEncoder::EncodedInfo encoded_info_; + int sample_rate_hz_; +}; + +TEST_F(AudioEncoderCngTest, CreateAndDestroy) { + CreateCng(MakeCngConfig()); +} + +TEST_F(AudioEncoderCngTest, CheckFrameSizePropagation) { + CreateCng(MakeCngConfig()); + EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) + .WillOnce(Return(17U)); + EXPECT_EQ(17U, cng_->Num10MsFramesInNextPacket()); +} + +TEST_F(AudioEncoderCngTest, CheckTargetAudioBitratePropagation) { + CreateCng(MakeCngConfig()); + EXPECT_CALL(*mock_encoder_, + OnReceivedUplinkBandwidth(4711, absl::optional<int64_t>())); + cng_->OnReceivedUplinkBandwidth(4711, absl::nullopt); +} + +TEST_F(AudioEncoderCngTest, CheckPacketLossFractionPropagation) { + CreateCng(MakeCngConfig()); + EXPECT_CALL(*mock_encoder_, OnReceivedUplinkPacketLossFraction(0.5)); + cng_->OnReceivedUplinkPacketLossFraction(0.5); +} + +TEST_F(AudioEncoderCngTest, 
CheckGetFrameLengthRangePropagation) { + CreateCng(MakeCngConfig()); + auto expected_range = + std::make_pair(TimeDelta::Millis(20), TimeDelta::Millis(20)); + EXPECT_CALL(*mock_encoder_, GetFrameLengthRange()) + .WillRepeatedly(Return(absl::make_optional(expected_range))); + EXPECT_THAT(cng_->GetFrameLengthRange(), Optional(Eq(expected_range))); +} + +TEST_F(AudioEncoderCngTest, EncodeCallsVad) { + EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) + .WillRepeatedly(Return(1U)); + CreateCng(MakeCngConfig()); + EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) + .WillOnce(Return(Vad::kPassive)); + Encode(); +} + +TEST_F(AudioEncoderCngTest, EncodeCollects1BlockPassiveSpeech) { + CheckBlockGrouping(1, false); +} + +TEST_F(AudioEncoderCngTest, EncodeCollects2BlocksPassiveSpeech) { + CheckBlockGrouping(2, false); +} + +TEST_F(AudioEncoderCngTest, EncodeCollects3BlocksPassiveSpeech) { + CheckBlockGrouping(3, false); +} + +TEST_F(AudioEncoderCngTest, EncodeCollects1BlockActiveSpeech) { + CheckBlockGrouping(1, true); +} + +TEST_F(AudioEncoderCngTest, EncodeCollects2BlocksActiveSpeech) { + CheckBlockGrouping(2, true); +} + +TEST_F(AudioEncoderCngTest, EncodeCollects3BlocksActiveSpeech) { + CheckBlockGrouping(3, true); +} + +TEST_F(AudioEncoderCngTest, EncodePassive) { + const size_t kBlocksPerFrame = 3; + EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) + .WillRepeatedly(Return(kBlocksPerFrame)); + auto config = MakeCngConfig(); + const auto sid_frame_interval_ms = config.sid_frame_interval_ms; + const auto num_cng_coefficients = config.num_cng_coefficients; + CreateCng(std::move(config)); + EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) + .WillRepeatedly(Return(Vad::kPassive)); + // Expect no calls at all to the speech encoder mock. + EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)).Times(0); + uint32_t expected_timestamp = timestamp_; + for (size_t i = 0; i < 100; ++i) { + Encode(); + // Check if it was time to call the cng encoder. 
This is done once every + // `kBlocksPerFrame` calls. + if ((i + 1) % kBlocksPerFrame == 0) { + // Now check if a SID interval has elapsed. + if ((i % (sid_frame_interval_ms / 10)) < kBlocksPerFrame) { + // If so, verify that we got a CNG encoding. + EXPECT_EQ(kCngPayloadType, encoded_info_.payload_type); + EXPECT_FALSE(encoded_info_.speech); + EXPECT_EQ(static_cast<size_t>(num_cng_coefficients) + 1, + encoded_info_.encoded_bytes); + EXPECT_EQ(expected_timestamp, encoded_info_.encoded_timestamp); + } + expected_timestamp += rtc::checked_cast<uint32_t>( + kBlocksPerFrame * num_audio_samples_10ms_); + } else { + // Otherwise, expect no output. + EXPECT_EQ(0u, encoded_info_.encoded_bytes); + } + } +} + +// Verifies that the correct action is taken for frames with both active and +// passive speech. +TEST_F(AudioEncoderCngTest, MixedActivePassive) { + CreateCng(MakeCngConfig()); + + // All of the frame is active speech. + ExpectEncodeCalls(6); + EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kActive)); + EXPECT_TRUE(encoded_info_.speech); + + // First half of the frame is active speech. + ExpectEncodeCalls(6); + EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kPassive)); + EXPECT_TRUE(encoded_info_.speech); + + // Second half of the frame is active speech. + ExpectEncodeCalls(6); + EXPECT_TRUE(CheckMixedActivePassive(Vad::kPassive, Vad::kActive)); + EXPECT_TRUE(encoded_info_.speech); + + // All of the frame is passive speech. Expect no calls to `mock_encoder_`. + EXPECT_FALSE(CheckMixedActivePassive(Vad::kPassive, Vad::kPassive)); + EXPECT_FALSE(encoded_info_.speech); +} + +// These tests verify that the audio is partitioned into larger blocks before +// calling the VAD. 
+// The parameters for CheckVadInputSize are: +// CheckVadInputSize(frame_size, expected_first_block_size, +// expected_second_block_size); +TEST_F(AudioEncoderCngTest, VadInputSize10Ms) { + CreateCng(MakeCngConfig()); + CheckVadInputSize(10, 10, 0); +} +TEST_F(AudioEncoderCngTest, VadInputSize20Ms) { + CreateCng(MakeCngConfig()); + CheckVadInputSize(20, 20, 0); +} +TEST_F(AudioEncoderCngTest, VadInputSize30Ms) { + CreateCng(MakeCngConfig()); + CheckVadInputSize(30, 30, 0); +} +TEST_F(AudioEncoderCngTest, VadInputSize40Ms) { + CreateCng(MakeCngConfig()); + CheckVadInputSize(40, 20, 20); +} +TEST_F(AudioEncoderCngTest, VadInputSize50Ms) { + CreateCng(MakeCngConfig()); + CheckVadInputSize(50, 30, 20); +} +TEST_F(AudioEncoderCngTest, VadInputSize60Ms) { + CreateCng(MakeCngConfig()); + CheckVadInputSize(60, 30, 30); +} + +// Verifies that the correct payload type is set when CNG is encoded. +TEST_F(AudioEncoderCngTest, VerifyCngPayloadType) { + CreateCng(MakeCngConfig()); + EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)).Times(0); + EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()).WillOnce(Return(1U)); + EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) + .WillOnce(Return(Vad::kPassive)); + encoded_info_.payload_type = 0; + Encode(); + EXPECT_EQ(kCngPayloadType, encoded_info_.payload_type); +} + +// Verifies that a SID frame is encoded immediately as the signal changes from +// active speech to passive. +TEST_F(AudioEncoderCngTest, VerifySidFrameAfterSpeech) { + auto config = MakeCngConfig(); + const auto num_cng_coefficients = config.num_cng_coefficients; + CreateCng(std::move(config)); + EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) + .WillRepeatedly(Return(1U)); + // Start with encoding noise. 
+ EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) + .Times(2) + .WillRepeatedly(Return(Vad::kPassive)); + Encode(); + EXPECT_EQ(kCngPayloadType, encoded_info_.payload_type); + EXPECT_EQ(static_cast<size_t>(num_cng_coefficients) + 1, + encoded_info_.encoded_bytes); + // Encode again, and make sure we got no frame at all (since the SID frame + // period is 100 ms by default). + Encode(); + EXPECT_EQ(0u, encoded_info_.encoded_bytes); + + // Now encode active speech. + encoded_info_.payload_type = 0; + EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) + .WillOnce(Return(Vad::kActive)); + EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)) + .WillOnce( + Invoke(MockAudioEncoder::FakeEncoding(kMockReturnEncodedBytes))); + Encode(); + EXPECT_EQ(kMockReturnEncodedBytes, encoded_info_.encoded_bytes); + + // Go back to noise again, and verify that a SID frame is emitted. + EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) + .WillOnce(Return(Vad::kPassive)); + Encode(); + EXPECT_EQ(kCngPayloadType, encoded_info_.payload_type); + EXPECT_EQ(static_cast<size_t>(num_cng_coefficients) + 1, + encoded_info_.encoded_bytes); +} + +// Resetting the CNG should reset both the VAD and the encoder. +TEST_F(AudioEncoderCngTest, Reset) { + CreateCng(MakeCngConfig()); + EXPECT_CALL(*mock_encoder_, Reset()).Times(1); + EXPECT_CALL(*mock_vad_, Reset()).Times(1); + cng_->Reset(); +} + +#if GTEST_HAS_DEATH_TEST && !defined(WEBRTC_ANDROID) + +// This test fixture tests various error conditions that makes the +// AudioEncoderCng die via CHECKs. +class AudioEncoderCngDeathTest : public AudioEncoderCngTest { + protected: + AudioEncoderCngDeathTest() : AudioEncoderCngTest() { + EXPECT_CALL(*mock_vad_, Die()).Times(1); + delete mock_vad_; + mock_vad_ = nullptr; + } + + // Override AudioEncoderCngTest::TearDown, since that one expects a call to + // the destructor of `mock_vad_`. In this case, that object is already + // deleted. 
+ void TearDown() override { cng_.reset(); } + + AudioEncoderCngConfig MakeCngConfig() { + // Don't provide a Vad mock object, since it would leak when the test dies. + auto config = AudioEncoderCngTest::MakeCngConfig(); + config.vad = nullptr; + return config; + } + + void TryWrongNumCoefficients(int num) { + RTC_EXPECT_DEATH( + [&] { + auto config = MakeCngConfig(); + config.num_cng_coefficients = num; + CreateCng(std::move(config)); + }(), + "Invalid configuration"); + } +}; + +TEST_F(AudioEncoderCngDeathTest, WrongFrameSize) { + CreateCng(MakeCngConfig()); + num_audio_samples_10ms_ *= 2; // 20 ms frame. + RTC_EXPECT_DEATH(Encode(), ""); + num_audio_samples_10ms_ = 0; // Zero samples. + RTC_EXPECT_DEATH(Encode(), ""); +} + +TEST_F(AudioEncoderCngDeathTest, WrongNumCoefficientsA) { + TryWrongNumCoefficients(-1); +} + +TEST_F(AudioEncoderCngDeathTest, WrongNumCoefficientsB) { + TryWrongNumCoefficients(0); +} + +TEST_F(AudioEncoderCngDeathTest, WrongNumCoefficientsC) { + TryWrongNumCoefficients(13); +} + +TEST_F(AudioEncoderCngDeathTest, NullSpeechEncoder) { + auto config = MakeCngConfig(); + config.speech_encoder = nullptr; + RTC_EXPECT_DEATH(CreateCng(std::move(config)), ""); +} + +TEST_F(AudioEncoderCngDeathTest, StereoEncoder) { + EXPECT_CALL(*mock_encoder_, NumChannels()).WillRepeatedly(Return(2)); + RTC_EXPECT_DEATH(CreateCng(MakeCngConfig()), "Invalid configuration"); +} + +TEST_F(AudioEncoderCngDeathTest, StereoConfig) { + RTC_EXPECT_DEATH( + [&] { + auto config = MakeCngConfig(); + config.num_channels = 2; + CreateCng(std::move(config)); + }(), + "Invalid configuration"); +} + +TEST_F(AudioEncoderCngDeathTest, EncoderFrameSizeTooLarge) { + CreateCng(MakeCngConfig()); + EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) + .WillRepeatedly(Return(7U)); + for (int i = 0; i < 6; ++i) + Encode(); + RTC_EXPECT_DEATH( + Encode(), "Frame size cannot be larger than 60 ms when using VAD/CNG."); +} + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace webrtc diff 
--git a/third_party/libwebrtc/modules/audio_coding/codecs/cng/cng_unittest.cc b/third_party/libwebrtc/modules/audio_coding/codecs/cng/cng_unittest.cc new file mode 100644 index 0000000000..0e6ab79394 --- /dev/null +++ b/third_party/libwebrtc/modules/audio_coding/codecs/cng/cng_unittest.cc @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <memory> +#include <string> + +#include "modules/audio_coding/codecs/cng/webrtc_cng.h" +#include "test/gtest.h" +#include "test/testsupport/file_utils.h" + +namespace webrtc { + +enum { + kSidShortIntervalUpdate = 1, + kSidNormalIntervalUpdate = 100, + kSidLongIntervalUpdate = 10000 +}; + +enum : size_t { + kCNGNumParamsLow = 0, + kCNGNumParamsNormal = 8, + kCNGNumParamsHigh = WEBRTC_CNG_MAX_LPC_ORDER, + kCNGNumParamsTooHigh = WEBRTC_CNG_MAX_LPC_ORDER + 1 +}; + +enum { kNoSid, kForceSid }; + +class CngTest : public ::testing::Test { + protected: + virtual void SetUp(); + + void TestCngEncode(int sample_rate_hz, int quality); + + int16_t speech_data_[640]; // Max size of CNG internal buffers. 
+}; + +class CngDeathTest : public CngTest {}; + +void CngTest::SetUp() { + FILE* input_file; + const std::string file_name = + webrtc::test::ResourcePath("audio_coding/testfile32kHz", "pcm"); + input_file = fopen(file_name.c_str(), "rb"); + ASSERT_TRUE(input_file != NULL); + ASSERT_EQ(640, static_cast<int32_t>( + fread(speech_data_, sizeof(int16_t), 640, input_file))); + fclose(input_file); + input_file = NULL; +} + +void CngTest::TestCngEncode(int sample_rate_hz, int quality) { + const size_t num_samples_10ms = rtc::CheckedDivExact(sample_rate_hz, 100); + rtc::Buffer sid_data; + + ComfortNoiseEncoder cng_encoder(sample_rate_hz, kSidNormalIntervalUpdate, + quality); + EXPECT_EQ(0U, cng_encoder.Encode(rtc::ArrayView<const int16_t>( + speech_data_, num_samples_10ms), + kNoSid, &sid_data)); + EXPECT_EQ(static_cast<size_t>(quality + 1), + cng_encoder.Encode( + rtc::ArrayView<const int16_t>(speech_data_, num_samples_10ms), + kForceSid, &sid_data)); +} + +#if GTEST_HAS_DEATH_TEST && !defined(WEBRTC_ANDROID) +// Create CNG encoder, init with faulty values, free CNG encoder. +TEST_F(CngDeathTest, CngInitFail) { + // Call with too few parameters. + EXPECT_DEATH( + { + ComfortNoiseEncoder(8000, kSidNormalIntervalUpdate, kCNGNumParamsLow); + }, + ""); + // Call with too many parameters. + EXPECT_DEATH( + { + ComfortNoiseEncoder(8000, kSidNormalIntervalUpdate, + kCNGNumParamsTooHigh); + }, + ""); +} + +// Encode Cng with too long input vector. +TEST_F(CngDeathTest, CngEncodeTooLong) { + rtc::Buffer sid_data; + + // Create encoder. + ComfortNoiseEncoder cng_encoder(8000, kSidNormalIntervalUpdate, + kCNGNumParamsNormal); + // Run encoder with too much data. 
+ EXPECT_DEATH( + cng_encoder.Encode(rtc::ArrayView<const int16_t>(speech_data_, 641), + kNoSid, &sid_data), + ""); +} +#endif // GTEST_HAS_DEATH_TEST && !defined(WEBRTC_ANDROID) + +TEST_F(CngTest, CngEncode8000) { + TestCngEncode(8000, kCNGNumParamsNormal); +} + +TEST_F(CngTest, CngEncode16000) { + TestCngEncode(16000, kCNGNumParamsNormal); +} + +TEST_F(CngTest, CngEncode32000) { + TestCngEncode(32000, kCNGNumParamsHigh); +} + +TEST_F(CngTest, CngEncode48000) { + TestCngEncode(48000, kCNGNumParamsNormal); +} + +TEST_F(CngTest, CngEncode64000) { + TestCngEncode(64000, kCNGNumParamsNormal); +} + +// Update SID parameters, for both 9 and 16 parameters. +TEST_F(CngTest, CngUpdateSid) { + rtc::Buffer sid_data; + + // Create and initialize encoder and decoder. + ComfortNoiseEncoder cng_encoder(16000, kSidNormalIntervalUpdate, + kCNGNumParamsNormal); + ComfortNoiseDecoder cng_decoder; + + // Run normal Encode and UpdateSid. + EXPECT_EQ(kCNGNumParamsNormal + 1, + cng_encoder.Encode(rtc::ArrayView<const int16_t>(speech_data_, 160), + kForceSid, &sid_data)); + cng_decoder.UpdateSid(sid_data); + + // Reinit with new length. + cng_encoder.Reset(16000, kSidNormalIntervalUpdate, kCNGNumParamsHigh); + cng_decoder.Reset(); + + // Expect 0 because of unstable parameters after switching length. + EXPECT_EQ(0U, + cng_encoder.Encode(rtc::ArrayView<const int16_t>(speech_data_, 160), + kForceSid, &sid_data)); + EXPECT_EQ( + kCNGNumParamsHigh + 1, + cng_encoder.Encode(rtc::ArrayView<const int16_t>(speech_data_ + 160, 160), + kForceSid, &sid_data)); + cng_decoder.UpdateSid( + rtc::ArrayView<const uint8_t>(sid_data.data(), kCNGNumParamsNormal + 1)); +} + +// Update SID parameters, with wrong parameters or without calling decode. +TEST_F(CngTest, CngUpdateSidErroneous) { + rtc::Buffer sid_data; + + // Encode. 
+ ComfortNoiseEncoder cng_encoder(16000, kSidNormalIntervalUpdate, + kCNGNumParamsNormal); + ComfortNoiseDecoder cng_decoder; + EXPECT_EQ(kCNGNumParamsNormal + 1, + cng_encoder.Encode(rtc::ArrayView<const int16_t>(speech_data_, 160), + kForceSid, &sid_data)); + + // First run with valid parameters, then with too many CNG parameters. + // The function will operate correctly by only reading the maximum number of + // parameters, skipping the extra. + EXPECT_EQ(kCNGNumParamsNormal + 1, sid_data.size()); + cng_decoder.UpdateSid(sid_data); + + // Make sure the input buffer is large enough. Since Encode() appends data, we + // need to set the size manually only afterwards, or the buffer will be bigger + // than anticipated. + sid_data.SetSize(kCNGNumParamsTooHigh + 1); + cng_decoder.UpdateSid(sid_data); +} + +// Test to generate cng data, by forcing SID. Both normal and faulty condition. +TEST_F(CngTest, CngGenerate) { + rtc::Buffer sid_data; + int16_t out_data[640]; + + // Create and initialize encoder and decoder. + ComfortNoiseEncoder cng_encoder(16000, kSidNormalIntervalUpdate, + kCNGNumParamsNormal); + ComfortNoiseDecoder cng_decoder; + + // Normal Encode. + EXPECT_EQ(kCNGNumParamsNormal + 1, + cng_encoder.Encode(rtc::ArrayView<const int16_t>(speech_data_, 160), + kForceSid, &sid_data)); + + // Normal UpdateSid. + cng_decoder.UpdateSid(sid_data); + + // Two normal Generate, one with new_period. + EXPECT_TRUE(cng_decoder.Generate(rtc::ArrayView<int16_t>(out_data, 640), 1)); + EXPECT_TRUE(cng_decoder.Generate(rtc::ArrayView<int16_t>(out_data, 640), 0)); + + // Call Genereate with too much data. + EXPECT_FALSE(cng_decoder.Generate(rtc::ArrayView<int16_t>(out_data, 641), 0)); +} + +// Test automatic SID. +TEST_F(CngTest, CngAutoSid) { + rtc::Buffer sid_data; + + // Create and initialize encoder and decoder. 
+ ComfortNoiseEncoder cng_encoder(16000, kSidNormalIntervalUpdate, + kCNGNumParamsNormal); + ComfortNoiseDecoder cng_decoder; + + // Normal Encode, 100 msec, where no SID data should be generated. + for (int i = 0; i < 10; i++) { + EXPECT_EQ( + 0U, cng_encoder.Encode(rtc::ArrayView<const int16_t>(speech_data_, 160), + kNoSid, &sid_data)); + } + + // We have reached 100 msec, and SID data should be generated. + EXPECT_EQ(kCNGNumParamsNormal + 1, + cng_encoder.Encode(rtc::ArrayView<const int16_t>(speech_data_, 160), + kNoSid, &sid_data)); +} + +// Test automatic SID, with very short interval. +TEST_F(CngTest, CngAutoSidShort) { + rtc::Buffer sid_data; + + // Create and initialize encoder and decoder. + ComfortNoiseEncoder cng_encoder(16000, kSidShortIntervalUpdate, + kCNGNumParamsNormal); + ComfortNoiseDecoder cng_decoder; + + // First call will never generate SID, unless forced to. + EXPECT_EQ(0U, + cng_encoder.Encode(rtc::ArrayView<const int16_t>(speech_data_, 160), + kNoSid, &sid_data)); + + // Normal Encode, 100 msec, SID data should be generated all the time. + for (int i = 0; i < 10; i++) { + EXPECT_EQ( + kCNGNumParamsNormal + 1, + cng_encoder.Encode(rtc::ArrayView<const int16_t>(speech_data_, 160), + kNoSid, &sid_data)); + } +} + +} // namespace webrtc diff --git a/third_party/libwebrtc/modules/audio_coding/codecs/cng/webrtc_cng.cc b/third_party/libwebrtc/modules/audio_coding/codecs/cng/webrtc_cng.cc new file mode 100644 index 0000000000..48f1b8c296 --- /dev/null +++ b/third_party/libwebrtc/modules/audio_coding/codecs/cng/webrtc_cng.cc @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_coding/codecs/cng/webrtc_cng.h"

#include <algorithm>

#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "rtc_base/checks.h"
#include "rtc_base/numerics/safe_conversions.h"

namespace webrtc {

namespace {

// Maximum number of samples Encode() accepts and Generate() can produce in a
// single call.
const size_t kCngMaxOutsizeOrder = 640;

// TODO(ossu): Rename the left-over WebRtcCng according to style guide.
void WebRtcCng_K2a16(int16_t* k, int useOrder, int16_t* a);

// Target-energy table indexed by the SID level byte (dBov index 0..93; see
// UpdateSid(), which clamps the index to 93).
const int32_t WebRtcCng_kDbov[94] = {
    1081109975, 858756178, 682134279, 541838517, 430397633, 341876992,
    271562548,  215709799, 171344384, 136103682, 108110997, 85875618,
    68213428,   54183852,  43039763,  34187699,  27156255,  21570980,
    17134438,   13610368,  10811100,  8587562,   6821343,   5418385,
    4303976,    3418770,   2715625,   2157098,   1713444,   1361037,
    1081110,    858756,    682134,    541839,    430398,    341877,
    271563,     215710,    171344,    136104,    108111,    85876,
    68213,      54184,     43040,     34188,     27156,     21571,
    17134,      13610,     10811,     8588,      6821,      5418,
    4304,       3419,      2716,      2157,      1713,      1361,
    1081,       859,       682,       542,       430,       342,
    272,        216,       171,       136,       108,       86,
    68,         54,        43,        34,        27,        22,
    17,         14,        11,        9,         7,         5,
    4,          3,         3,         2,         2,         1,
    1,          1,         1,         1};

// Correlation window (Q15, per the bandwidth-expansion comment in Encode())
// applied to the autocorrelation vector.
const int16_t WebRtcCng_kCorrWindow[WEBRTC_CNG_MAX_LPC_ORDER] = {
    32702, 32636, 32570, 32505, 32439, 32374,
    32309, 32244, 32179, 32114, 32049, 31985};

}  // namespace

ComfortNoiseDecoder::ComfortNoiseDecoder() {
  /* Needed to get the right function pointers in SPLIB. */
  Reset();
}

// Returns the decoder to its freshly-constructed state: zeroed energies,
// reflection coefficients and filter state, default order 5.
void ComfortNoiseDecoder::Reset() {
  dec_seed_ = 7777; /* For debugging only. */
  dec_target_energy_ = 0;
  dec_used_energy_ = 0;
  for (auto& c : dec_target_reflCoefs_)
    c = 0;
  for (auto& c : dec_used_reflCoefs_)
    c = 0;
  for (auto& c : dec_filtstate_)
    c = 0;
  for (auto& c : dec_filtstateLow_)
    c = 0;
  dec_order_ = 5;
  dec_target_scale_factor_ = 0;
  dec_used_scale_factor_ = 0;
}

// Updates the decoder's targets from a SID payload: byte 0 is the dBov level
// index (clamped to 93), the remaining bytes are reflection coefficients in
// Q7, converted here to Q15.
void ComfortNoiseDecoder::UpdateSid(rtc::ArrayView<const uint8_t> sid) {
  int16_t refCs[WEBRTC_CNG_MAX_LPC_ORDER];
  int32_t targetEnergy;
  size_t length = sid.size();
  /* Throw away reflection coefficients of higher order than we can handle. */
  if (length > (WEBRTC_CNG_MAX_LPC_ORDER + 1))
    length = WEBRTC_CNG_MAX_LPC_ORDER + 1;

  dec_order_ = static_cast<uint16_t>(length - 1);

  // Clamp the level byte so out-of-range SIDs (e.g. the encoder's
  // not-found marker 94) still index the table safely.
  uint8_t sid0 = std::min<uint8_t>(sid[0], 93);
  targetEnergy = WebRtcCng_kDbov[sid0];
  /* Take down target energy to 75%. */
  targetEnergy = targetEnergy >> 1;
  targetEnergy += targetEnergy >> 2;

  dec_target_energy_ = targetEnergy;

  /* Reconstruct coeffs with tweak for WebRtc implementation of RFC3389. */
  if (dec_order_ == WEBRTC_CNG_MAX_LPC_ORDER) {
    for (size_t i = 0; i < (dec_order_); i++) {
      refCs[i] = sid[i + 1] << 8; /* Q7 to Q15*/
      dec_target_reflCoefs_[i] = refCs[i];
    }
  } else {
    for (size_t i = 0; i < (dec_order_); i++) {
      refCs[i] = (sid[i + 1] - 127) * (1 << 8); /* Q7 to Q15. */
      dec_target_reflCoefs_[i] = refCs[i];
    }
  }

  // Zero out any coefficients beyond the received order.
  for (size_t i = (dec_order_); i < WEBRTC_CNG_MAX_LPC_ORDER; i++) {
    refCs[i] = 0;
    dec_target_reflCoefs_[i] = refCs[i];
  }
}

// Generates out_data.size() comfort-noise samples by filtering scaled random
// excitation through the interpolated LPC synthesis filter. Returns false if
// more than kCngMaxOutsizeOrder (640) samples are requested.
bool ComfortNoiseDecoder::Generate(rtc::ArrayView<int16_t> out_data,
                                   bool new_period) {
  int16_t excitation[kCngMaxOutsizeOrder];
  int16_t low[kCngMaxOutsizeOrder];
  int16_t lpPoly[WEBRTC_CNG_MAX_LPC_ORDER + 1];
  // Smoothing constants: a new period leans harder on the target values
  // (0.6/0.4) than steady state (0.8/0.2).
  int16_t ReflBetaStd = 26214;      /* 0.8 in q15. */
  int16_t ReflBetaCompStd = 6553;   /* 0.2 in q15. */
  int16_t ReflBetaNewP = 19661;     /* 0.6 in q15. */
  int16_t ReflBetaCompNewP = 13107; /* 0.4 in q15. */
  int16_t Beta, BetaC;              /* These are in Q15. */
  int32_t targetEnergy;
  int16_t En;
  int16_t temp16;
  const size_t num_samples = out_data.size();

  if (num_samples > kCngMaxOutsizeOrder) {
    return false;
  }

  if (new_period) {
    dec_used_scale_factor_ = dec_target_scale_factor_;
    Beta = ReflBetaNewP;
    BetaC = ReflBetaCompNewP;
  } else {
    Beta = ReflBetaStd;
    BetaC = ReflBetaCompStd;
  }

  /* Calculate new scale factor in Q13 */
  dec_used_scale_factor_ = rtc::checked_cast<int16_t>(
      WEBRTC_SPL_MUL_16_16_RSFT(dec_used_scale_factor_, Beta >> 2, 13) +
      WEBRTC_SPL_MUL_16_16_RSFT(dec_target_scale_factor_, BetaC >> 2, 13));

  // Move the used energy halfway towards the target energy.
  dec_used_energy_ = dec_used_energy_ >> 1;
  dec_used_energy_ += dec_target_energy_ >> 1;

  /* Do the same for the reflection coeffs, albeit in Q15. */
  for (size_t i = 0; i < WEBRTC_CNG_MAX_LPC_ORDER; i++) {
    dec_used_reflCoefs_[i] =
        (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(dec_used_reflCoefs_[i], Beta, 15);
    dec_used_reflCoefs_[i] +=
        (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(dec_target_reflCoefs_[i], BetaC, 15);
  }

  /* Compute the polynomial coefficients. */
  WebRtcCng_K2a16(dec_used_reflCoefs_, WEBRTC_CNG_MAX_LPC_ORDER, lpPoly);

  targetEnergy = dec_used_energy_;

  /* Calculate scaling factor based on filter energy. */
  En = 8192; /* 1.0 in Q13. */
  for (size_t i = 0; i < (WEBRTC_CNG_MAX_LPC_ORDER); i++) {
    /* Floating point value for reference.
       E *= 1.0 - (dec_used_reflCoefs_[i] / 32768.0) *
       (dec_used_reflCoefs_[i] / 32768.0);
     */

    /* Same in fixed point. */
    /* K(i).^2 in Q15. */
    temp16 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(dec_used_reflCoefs_[i],
                                                dec_used_reflCoefs_[i], 15);
    /* 1 - K(i).^2 in Q15. */
    temp16 = 0x7fff - temp16;
    En = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(En, temp16, 15);
  }

  /* float scaling= sqrt(E * dec_target_energy_ / (1 << 24)); */

  /* Calculate sqrt(En * target_energy / excitation energy) */
  targetEnergy = WebRtcSpl_Sqrt(dec_used_energy_);

  En = (int16_t)WebRtcSpl_Sqrt(En) << 6;
  En = (En * 3) >> 1; /* 1.5 estimates sqrt(2). */
  dec_used_scale_factor_ = (int16_t)((En * targetEnergy) >> 12);

  /* Generate excitation. */
  /* Excitation energy per sample is 2.^24 - Q13 N(0,1). */
  for (size_t i = 0; i < num_samples; i++) {
    excitation[i] = WebRtcSpl_RandN(&dec_seed_) >> 1;
  }

  /* Scale to correct energy. */
  WebRtcSpl_ScaleVector(excitation, excitation, dec_used_scale_factor_,
                        num_samples, 13);

  /* `lpPoly` - Coefficients in Q12.
   * `excitation` - Speech samples.
   * `nst->dec_filtstate` - State preservation.
   * `out_data` - Filtered speech samples. */
  WebRtcSpl_FilterAR(lpPoly, WEBRTC_CNG_MAX_LPC_ORDER + 1, excitation,
                     num_samples, dec_filtstate_, WEBRTC_CNG_MAX_LPC_ORDER,
                     dec_filtstateLow_, WEBRTC_CNG_MAX_LPC_ORDER,
                     out_data.data(), low, num_samples);

  return true;
}

// `fs` sample rate in Hz, `interval` SID interval in ms, `quality` number of
// reflection coefficients (1..WEBRTC_CNG_MAX_LPC_ORDER, checked).
ComfortNoiseEncoder::ComfortNoiseEncoder(int fs, int interval, int quality)
    : enc_nrOfCoefs_(quality),
      enc_sampfreq_(fs),
      enc_interval_(interval),
      enc_msSinceSid_(0),
      enc_Energy_(0),
      enc_reflCoefs_{0},
      enc_corrVector_{0},
      enc_seed_(7777) /* For debugging only. */ {
  RTC_CHECK_GT(quality, 0);
  RTC_CHECK_LE(quality, WEBRTC_CNG_MAX_LPC_ORDER);
}

// Reinitializes all state, with the same parameter checks as the constructor.
void ComfortNoiseEncoder::Reset(int fs, int interval, int quality) {
  RTC_CHECK_GT(quality, 0);
  RTC_CHECK_LE(quality, WEBRTC_CNG_MAX_LPC_ORDER);
  enc_nrOfCoefs_ = quality;
  enc_sampfreq_ = fs;
  enc_interval_ = interval;
  enc_msSinceSid_ = 0;
  enc_Energy_ = 0;
  for (auto& c : enc_reflCoefs_)
    c = 0;
  for (auto& c : enc_corrVector_)
    c = 0;
  enc_seed_ = 7777; /* For debugging only. */
}

// Analyzes `speech` (Hanning window, autocorrelation with bandwidth
// expansion, Levinson-Durbin) and updates the smoothed noise description.
// When `force_sid` is set or the SID interval has elapsed, appends a SID
// payload (level byte + enc_nrOfCoefs_ quantized coefficients) to `output`
// and returns its size; otherwise returns 0.
size_t ComfortNoiseEncoder::Encode(rtc::ArrayView<const int16_t> speech,
                                   bool force_sid,
                                   rtc::Buffer* output) {
  int16_t arCoefs[WEBRTC_CNG_MAX_LPC_ORDER + 1];
  int32_t corrVector[WEBRTC_CNG_MAX_LPC_ORDER + 1];
  int16_t refCs[WEBRTC_CNG_MAX_LPC_ORDER + 1];
  int16_t hanningW[kCngMaxOutsizeOrder];
  int16_t ReflBeta = 19661;     /* 0.6 in q15. */
  int16_t ReflBetaComp = 13107; /* 0.4 in q15. */
  int32_t outEnergy;
  int outShifts;
  size_t i;
  int stab;
  int acorrScale;
  size_t index;
  size_t ind, factor;
  int32_t* bptr;
  int32_t blo, bhi;
  int16_t negate;
  const int16_t* aptr;
  int16_t speechBuf[kCngMaxOutsizeOrder];

  const size_t num_samples = speech.size();
  RTC_CHECK_LE(num_samples, kCngMaxOutsizeOrder);

  for (i = 0; i < num_samples; i++) {
    speechBuf[i] = speech[i];
  }

  factor = num_samples;

  /* Calculate energy and a coefficients. */
  outEnergy = WebRtcSpl_Energy(speechBuf, num_samples, &outShifts);
  while (outShifts > 0) {
    /* We can only do 5 shifts without destroying accuracy in
     * division factor. */
    if (outShifts > 5) {
      outEnergy <<= (outShifts - 5);
      outShifts = 5;
    } else {
      factor /= 2;
      outShifts--;
    }
  }
  outEnergy = WebRtcSpl_DivW32W16(outEnergy, (int16_t)factor);

  if (outEnergy > 1) {
    /* Create Hanning Window. */
    WebRtcSpl_GetHanningWindow(hanningW, num_samples / 2);
    // Mirror the first half to build a full symmetric window.
    for (i = 0; i < (num_samples / 2); i++)
      hanningW[num_samples - i - 1] = hanningW[i];

    WebRtcSpl_ElementwiseVectorMult(speechBuf, hanningW, speechBuf, num_samples,
                                    14);

    WebRtcSpl_AutoCorrelation(speechBuf, num_samples, enc_nrOfCoefs_,
                              corrVector, &acorrScale);

    if (*corrVector == 0)
      *corrVector = WEBRTC_SPL_WORD16_MAX;

    /* Adds the bandwidth expansion. */
    aptr = WebRtcCng_kCorrWindow;
    bptr = corrVector;

    /* (zzz) lpc16_1 = 17+1+820+2+2 = 842 (ordo2=700). */
    for (ind = 0; ind < enc_nrOfCoefs_; ind++) {
      /* The below code multiplies the 16 b corrWindow values (Q15) with
       * the 32 b corrvector (Q0) and shifts the result down 15 steps. */
      negate = *bptr < 0;
      if (negate)
        *bptr = -*bptr;

      blo = (int32_t)*aptr * (*bptr & 0xffff);
      bhi = ((blo >> 16) & 0xffff) +
            ((int32_t)(*aptr++) * ((*bptr >> 16) & 0xffff));
      blo = (blo & 0xffff) | ((bhi & 0xffff) << 16);

      *bptr = (((bhi >> 16) & 0x7fff) << 17) | ((uint32_t)blo >> 15);
      if (negate)
        *bptr = -*bptr;
      bptr++;
    }
    /* End of bandwidth expansion. */

    stab = WebRtcSpl_LevinsonDurbin(corrVector, arCoefs, refCs, enc_nrOfCoefs_);

    if (!stab) {
      /* Disregard from this frame */
      return 0;
    }

  } else {
    // Frame energy too low for analysis: treat as zero coefficients.
    for (i = 0; i < enc_nrOfCoefs_; i++)
      refCs[i] = 0;
  }

  if (force_sid) {
    /* Read instantaneous values instead of averaged. */
    for (i = 0; i < enc_nrOfCoefs_; i++)
      enc_reflCoefs_[i] = refCs[i];
    enc_Energy_ = outEnergy;
  } else {
    /* Average history with new values. */
    for (i = 0; i < enc_nrOfCoefs_; i++) {
      enc_reflCoefs_[i] =
          (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(enc_reflCoefs_[i], ReflBeta, 15);
      enc_reflCoefs_[i] +=
          (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(refCs[i], ReflBetaComp, 15);
    }
    enc_Energy_ = (outEnergy >> 2) + (enc_Energy_ >> 1) + (enc_Energy_ >> 2);
  }

  if (enc_Energy_ < 1) {
    enc_Energy_ = 1;
  }

  if ((enc_msSinceSid_ > (enc_interval_ - 1)) || force_sid) {
    /* Search for best dbov value. */
    index = 0;
    for (i = 1; i < 93; i++) {
      /* Always round downwards. */
      if ((enc_Energy_ - WebRtcCng_kDbov[i]) > 0) {
        index = i;
        break;
      }
    }
    // NOTE: 94 is outside the WebRtcCng_kDbov range (0..93); the decoder's
    // UpdateSid() clamps the level byte to 93.
    if ((i == 93) && (index == 0))
      index = 94;

    const size_t output_coefs = enc_nrOfCoefs_ + 1;
    output->AppendData(output_coefs, [&](rtc::ArrayView<uint8_t> output) {
      output[0] = (uint8_t)index;

      /* Quantize coefficients with tweak for WebRtc implementation of
       * RFC3389. */
      if (enc_nrOfCoefs_ == WEBRTC_CNG_MAX_LPC_ORDER) {
        for (i = 0; i < enc_nrOfCoefs_; i++) {
          /* Q15 to Q7 with rounding. */
          output[i + 1] = ((enc_reflCoefs_[i] + 128) >> 8);
        }
      } else {
        for (i = 0; i < enc_nrOfCoefs_; i++) {
          /* Q15 to Q7 with rounding. */
          output[i + 1] = (127 + ((enc_reflCoefs_[i] + 128) >> 8));
        }
      }

      return output_coefs;
    });

    // A SID was emitted: restart the interval counter at this frame's length.
    enc_msSinceSid_ =
        static_cast<int16_t>((1000 * num_samples) / enc_sampfreq_);
    return output_coefs;
  } else {
    enc_msSinceSid_ +=
        static_cast<int16_t>((1000 * num_samples) / enc_sampfreq_);
    return 0;
  }
}

namespace {
/* Values in `k` are Q15, and `a` Q12. */
// Converts reflection coefficients `k` into `useOrder` + 1 direct-form LPC
// polynomial coefficients in `a`, used by Generate() to build lpPoly.
void WebRtcCng_K2a16(int16_t* k, int useOrder, int16_t* a) {
  int16_t any[WEBRTC_SPL_MAX_LPC_ORDER + 1];
  int16_t* aptr;
  int16_t* aptr2;
  int16_t* anyptr;
  const int16_t* kptr;
  int m, i;

  kptr = k;
  *a = 4096; /* i.e., (Word16_MAX >> 3) + 1 */
  *any = *a;
  a[1] = (*k + 4) >> 3;
  for (m = 1; m < useOrder; m++) {
    kptr++;
    aptr = a;
    aptr++;
    aptr2 = &a[m];
    anyptr = any;
    anyptr++;

    any[m + 1] = (*kptr + 4) >> 3;
    for (i = 0; i < m; i++) {
      *anyptr++ =
          (*aptr++) +
          (int16_t)((((int32_t)(*aptr2--) * (int32_t)*kptr) + 16384) >> 15);
    }

    aptr = a;
    anyptr = any;
    for (i = 0; i < (m + 2); i++) {
      *aptr++ = *anyptr++;
    }
  }
}

}  // namespace

}  // namespace webrtc
/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef MODULES_AUDIO_CODING_CODECS_CNG_WEBRTC_CNG_H_
#define MODULES_AUDIO_CODING_CODECS_CNG_WEBRTC_CNG_H_

#include <stdint.h>

#include <cstddef>

#include "api/array_view.h"
#include "rtc_base/buffer.h"

// Maximum LPC order (number of reflection coefficients) supported by the
// comfort-noise encoder and decoder.
#define WEBRTC_CNG_MAX_LPC_ORDER 12

namespace webrtc {

// Consumes SID (Silence Insertion Descriptor) payloads and synthesizes
// comfort noise from them.
class ComfortNoiseDecoder {
 public:
  ComfortNoiseDecoder();
  ~ComfortNoiseDecoder() = default;

  ComfortNoiseDecoder(const ComfortNoiseDecoder&) = delete;
  ComfortNoiseDecoder& operator=(const ComfortNoiseDecoder&) = delete;

  // Restores the decoder to its freshly-constructed state.
  void Reset();

  // Updates the CN state when a new SID packet arrives.
  // `sid` is a view of the SID packet without the headers.
  void UpdateSid(rtc::ArrayView<const uint8_t> sid);

  // Generates comfort noise.
  // `out_data` will be filled with samples - its size determines the number of
  // samples generated. When `new_period` is true, CNG history will be reset
  // before any audio is generated. Returns `false` if `out_data` is too large
  // - currently more than 640 samples (equalling 10 ms at 64 kHz).
  // TODO(ossu): Specify better limits for the size of out_data. Either let it
  // be unbounded or limit to 10ms in the current sample rate.
  bool Generate(rtc::ArrayView<int16_t> out_data, bool new_period);

 private:
  uint32_t dec_seed_;          // State for the random excitation generator.
  int32_t dec_target_energy_;  // Energy from the most recent SID.
  int32_t dec_used_energy_;    // Smoothed energy currently in use.
  // Reflection coefficients (Q15): targets from the last SID, and the
  // smoothed values actually used for synthesis.
  int16_t dec_target_reflCoefs_[WEBRTC_CNG_MAX_LPC_ORDER + 1];
  int16_t dec_used_reflCoefs_[WEBRTC_CNG_MAX_LPC_ORDER + 1];
  // Synthesis filter state (high and low parts).
  int16_t dec_filtstate_[WEBRTC_CNG_MAX_LPC_ORDER + 1];
  int16_t dec_filtstateLow_[WEBRTC_CNG_MAX_LPC_ORDER + 1];
  uint16_t dec_order_;  // LPC order taken from the most recent SID.
  int16_t dec_target_scale_factor_; /* Q29 */
  int16_t dec_used_scale_factor_;   /* Q29 */
};

// Analyzes background noise and produces SID payloads at a configurable
// interval.
class ComfortNoiseEncoder {
 public:
  // Creates a comfort noise encoder.
  // `fs` is the sample rate in Hz (e.g. 8000 for narrowband, 16000 for
  // wideband).
  // `interval` sets the interval at which to generate SID data (in ms).
  // `quality` selects the number of refl. coeffs. Maximum allowed is 12.
  ComfortNoiseEncoder(int fs, int interval, int quality);
  ~ComfortNoiseEncoder() = default;

  ComfortNoiseEncoder(const ComfortNoiseEncoder&) = delete;
  ComfortNoiseEncoder& operator=(const ComfortNoiseEncoder&) = delete;

  // Resets the comfort noise encoder to its initial state.
  // Parameters are set as during construction.
  void Reset(int fs, int interval, int quality);

  // Analyzes background noise from `speech` and appends coefficients to
  // `output`. Returns the number of coefficients generated. If `force_sid` is
  // true, a SID frame is forced and the internal sid interval counter is
  // reset. Will fail if the input size is too large (> 640 samples, see
  // ComfortNoiseDecoder::Generate).
  size_t Encode(rtc::ArrayView<const int16_t> speech,
                bool force_sid,
                rtc::Buffer* output);

 private:
  size_t enc_nrOfCoefs_;    // Number of reflection coeffs (the `quality`).
  int enc_sampfreq_;        // Sample rate in Hz.
  int16_t enc_interval_;    // SID update interval in ms.
  int16_t enc_msSinceSid_;  // Milliseconds accumulated since the last SID.
  int32_t enc_Energy_;      // Smoothed frame energy.
  int16_t enc_reflCoefs_[WEBRTC_CNG_MAX_LPC_ORDER + 1];  // Smoothed, Q15.
  int32_t enc_corrVector_[WEBRTC_CNG_MAX_LPC_ORDER + 1];
  uint32_t enc_seed_;
};

}  // namespace webrtc

#endif  // MODULES_AUDIO_CODING_CODECS_CNG_WEBRTC_CNG_H_