/*
 *  Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "audio/utility/channel_mixer.h"

#include "api/audio/audio_frame.h"
#include "api/audio/channel_layout.h"
#include "audio/utility/channel_mixing_matrix.h"
#include "rtc_base/arraysize.h"
#include "rtc_base/strings/string_builder.h"
#include "test/gtest.h"

namespace webrtc {
namespace {

constexpr uint32_t kTimestamp = 27;
constexpr int kSampleRateHz = 16000;
constexpr size_t kSamplesPerChannel = kSampleRateHz / 100;

class ChannelMixerTest : public ::testing::Test {
 protected:
  ChannelMixerTest() {
    // Use 10 ms audio frames by default. Don't set channel data yet.
    frame_.samples_per_channel_ = kSamplesPerChannel;
    frame_.sample_rate_hz_ = kSampleRateHz;
    EXPECT_TRUE(frame_.muted());
  }

  virtual ~ChannelMixerTest() {}

  AudioFrame frame_;
};

void SetFrameData(int16_t data, AudioFrame* frame) {
  int16_t* frame_data = frame->mutable_data();
  for (size_t i = 0; i < frame->samples_per_channel() * frame->num_channels();
       i++) {
    frame_data[i] = data;
  }
}

void SetMonoData(int16_t center, AudioFrame* frame) {
  frame->num_channels_ = 1;
  int16_t* frame_data = frame->mutable_data();
  for (size_t i = 0; i < frame->samples_per_channel(); ++i) {
    frame_data[i] = center;
  }
  EXPECT_FALSE(frame->muted());
}

void SetStereoData(int16_t left, int16_t right, AudioFrame* frame) {
  ASSERT_LE(2 * frame->samples_per_channel(), frame->max_16bit_samples());
  frame->num_channels_ = 2;
  int16_t* frame_data = frame->mutable_data();
  for (size_t i = 0; i < frame->samples_per_channel() * 2; i += 2) {
    frame_data[i] = left;
    frame_data[i + 1] = right;
  }
  EXPECT_FALSE(frame->muted());
}

void SetFiveOneData(int16_t front_left,
                    int16_t front_right,
                    int16_t center,
                    int16_t lfe,
                    int16_t side_left,
                    int16_t side_right,
                    AudioFrame* frame) {
  ASSERT_LE(6 * frame->samples_per_channel(), frame->max_16bit_samples());
  frame->num_channels_ = 6;
  int16_t* frame_data = frame->mutable_data();
  for (size_t i = 0; i < frame->samples_per_channel() * 6; i += 6) {
    frame_data[i] = front_left;
    frame_data[i + 1] = front_right;
    frame_data[i + 2] = center;
    frame_data[i + 3] = lfe;
    frame_data[i + 4] = side_left;
    frame_data[i + 5] = side_right;
  }
  EXPECT_FALSE(frame->muted());
}

void SetSevenOneData(int16_t front_left,
                     int16_t front_right,
                     int16_t center,
                     int16_t lfe,
                     int16_t side_left,
                     int16_t side_right,
                     int16_t back_left,
                     int16_t back_right,
                     AudioFrame* frame) {
  ASSERT_LE(8 * frame->samples_per_channel(), frame->max_16bit_samples());
  frame->num_channels_ = 8;
  int16_t* frame_data = frame->mutable_data();
  for (size_t i = 0; i < frame->samples_per_channel() * 8; i += 8) {
    frame_data[i] = front_left;
    frame_data[i + 1] = front_right;
    frame_data[i + 2] = center;
    frame_data[i + 3] = lfe;
    frame_data[i + 4] = side_left;
    frame_data[i + 5] = side_right;
    frame_data[i + 6] = back_left;
    frame_data[i + 7] = back_right;
  }
  EXPECT_FALSE(frame->muted());
}

bool AllSamplesEquals(int16_t sample, const AudioFrame* frame) {
  const int16_t* frame_data = frame->data();
  for (size_t i = 0; i < frame->samples_per_channel() * frame->num_channels();
       i++) {
    if (frame_data[i] != sample) {
      return false;
    }
  }
  return true;
}
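
// Verifies that two audio frames have the same channel count, samples per
// channel, sample data and muted state.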
void VerifyFramesAreEqual(const AudioFrame& frame1, const AudioFrame& frame2) {
  EXPECT_EQ(frame1.num_channels(), frame2.num_channels());
  EXPECT_EQ(frame1.samples_per_channel(), frame2.samples_per_channel());
  const int16_t* frame1_data = frame1.data();
  const int16_t* frame2_data = frame2.data();
  for (size_t i = 0; i < frame1.samples_per_channel() * frame1.num_channels();
       i++) {
    EXPECT_EQ(frame1_data[i], frame2_data[i]);
  }
  EXPECT_EQ(frame1.muted(), frame2.muted());
}

}  // namespace

// Test that all possible layout conversions can be constructed and mixed.
// The actual content does not matter; simply run through all mixing
// combinations and ensure that nothing fails.
TEST_F(ChannelMixerTest, ConstructAllPossibleLayouts) {
  for (ChannelLayout input_layout = CHANNEL_LAYOUT_MONO;
       input_layout <= CHANNEL_LAYOUT_MAX;
       input_layout = static_cast<ChannelLayout>(input_layout + 1)) {
    for (ChannelLayout output_layout = CHANNEL_LAYOUT_MONO;
         output_layout <= CHANNEL_LAYOUT_MAX;
         output_layout = static_cast<ChannelLayout>(output_layout + 1)) {
      // DISCRETE and BITSTREAM can't be tested here based on the current
      // approach. CHANNEL_LAYOUT_STEREO_AND_KEYBOARD_MIC is not mixable.
      // Stereo down mix should never be the output layout.
      if (input_layout == CHANNEL_LAYOUT_BITSTREAM ||
          input_layout == CHANNEL_LAYOUT_DISCRETE ||
          input_layout == CHANNEL_LAYOUT_STEREO_AND_KEYBOARD_MIC ||
          output_layout == CHANNEL_LAYOUT_BITSTREAM ||
          output_layout == CHANNEL_LAYOUT_DISCRETE ||
          output_layout == CHANNEL_LAYOUT_STEREO_AND_KEYBOARD_MIC ||
          output_layout == CHANNEL_LAYOUT_STEREO_DOWNMIX) {
        continue;
      }

      rtc::StringBuilder ss;
      ss << "Input Layout: " << input_layout
         << ", Output Layout: " << output_layout;
      SCOPED_TRACE(ss.str());

      ChannelMixer mixer(input_layout, output_layout);

      frame_.UpdateFrame(kTimestamp, nullptr, kSamplesPerChannel,
                         kSampleRateHz, AudioFrame::kNormalSpeech,
                         AudioFrame::kVadActive,
                         ChannelLayoutToChannelCount(input_layout));
      EXPECT_TRUE(frame_.muted());
      mixer.Transform(&frame_);
    }
  }
}

// Ensure that the audio frame is untouched when the input and output channel
// layouts are identical, i.e., the transformation should have no effect.
// Exclude invalid mixing combinations.
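// The frame is filled with the value 99 before the transform, so the check
// afterwards is simply that every sample still equals 99 and that the channel
// count is unchanged.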
TEST_F(ChannelMixerTest, NoMixingForIdenticalChannelLayouts) {
  for (ChannelLayout input_layout = CHANNEL_LAYOUT_MONO;
       input_layout <= CHANNEL_LAYOUT_MAX;
       input_layout = static_cast<ChannelLayout>(input_layout + 1)) {
    for (ChannelLayout output_layout = CHANNEL_LAYOUT_MONO;
         output_layout <= CHANNEL_LAYOUT_MAX;
         output_layout = static_cast<ChannelLayout>(output_layout + 1)) {
      if (input_layout != output_layout ||
          input_layout == CHANNEL_LAYOUT_BITSTREAM ||
          input_layout == CHANNEL_LAYOUT_DISCRETE ||
          input_layout == CHANNEL_LAYOUT_STEREO_AND_KEYBOARD_MIC ||
          output_layout == CHANNEL_LAYOUT_STEREO_DOWNMIX) {
        continue;
      }
      ChannelMixer mixer(input_layout, output_layout);
      frame_.num_channels_ = ChannelLayoutToChannelCount(input_layout);
      SetFrameData(99, &frame_);
      mixer.Transform(&frame_);
      EXPECT_EQ(ChannelLayoutToChannelCount(input_layout),
                static_cast<int>(frame_.num_channels()));
      EXPECT_TRUE(AllSamplesEquals(99, &frame_));
    }
  }
}

TEST_F(ChannelMixerTest, StereoToMono) {
  ChannelMixer mixer(CHANNEL_LAYOUT_STEREO, CHANNEL_LAYOUT_MONO);
  //
  //                      Input: stereo
  //                      LEFT  RIGHT
  // Output: mono CENTER  0.5   0.5
  //
  SetStereoData(7, 3, &frame_);
  EXPECT_EQ(2u, frame_.num_channels());
  mixer.Transform(&frame_);
  EXPECT_EQ(1u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_MONO, frame_.channel_layout());

  AudioFrame mono_frame;
  mono_frame.samples_per_channel_ = frame_.samples_per_channel();
  SetMonoData(5, &mono_frame);
  VerifyFramesAreEqual(mono_frame, frame_);

  SetStereoData(-32768, -32768, &frame_);
  EXPECT_EQ(2u, frame_.num_channels());
  mixer.Transform(&frame_);
  EXPECT_EQ(1u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_MONO, frame_.channel_layout());
  SetMonoData(-32768, &mono_frame);
  VerifyFramesAreEqual(mono_frame, frame_);
}

TEST_F(ChannelMixerTest, StereoToMonoMuted) {
  ASSERT_TRUE(frame_.muted());
  ChannelMixer mixer(CHANNEL_LAYOUT_STEREO, CHANNEL_LAYOUT_MONO);
  mixer.Transform(&frame_);
  EXPECT_EQ(1u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_MONO, frame_.channel_layout());
  EXPECT_TRUE(frame_.muted());
}

TEST_F(ChannelMixerTest, FiveOneToSevenOneMuted) {
  ASSERT_TRUE(frame_.muted());
  ChannelMixer mixer(CHANNEL_LAYOUT_5_1, CHANNEL_LAYOUT_7_1);
  mixer.Transform(&frame_);
  EXPECT_EQ(8u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_7_1, frame_.channel_layout());
  EXPECT_TRUE(frame_.muted());
}

TEST_F(ChannelMixerTest, FiveOneToMono) {
  ChannelMixer mixer(CHANNEL_LAYOUT_5_1, CHANNEL_LAYOUT_MONO);
  //
  //                      Input: 5.1
  //                      LEFT   RIGHT  CENTER  LFE    SIDE_LEFT  SIDE_RIGHT
  // Output: mono CENTER  0.707  0.707  1       0.707  0.707      0.707
  //
  // a = [10, 20, 15, 2, 5, 5]
  // b = [1/sqrt(2), 1/sqrt(2), 1.0, 1/sqrt(2), 1/sqrt(2), 1/sqrt(2)] =>
  // a * b (dot product) = 44.69848480983499,
  // which is truncated into 44 using 16-bit representation.
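  //
  // Expanded, that dot product is
  //   0.707 * (10 + 20 + 2 + 5 + 5) + 1.0 * 15 = 29.698... + 15 = 44.698...,
  // i.e. every channel except the center is scaled by 1/sqrt(2) before the
  // sum, and the result is truncated (not rounded) to the int16_t value 44.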
  //
  SetFiveOneData(10, 20, 15, 2, 5, 5, &frame_);
  EXPECT_EQ(6u, frame_.num_channels());
  mixer.Transform(&frame_);
  EXPECT_EQ(1u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_MONO, frame_.channel_layout());

  AudioFrame mono_frame;
  mono_frame.samples_per_channel_ = frame_.samples_per_channel();
  SetMonoData(44, &mono_frame);
  VerifyFramesAreEqual(mono_frame, frame_);

  SetFiveOneData(-32768, -32768, -32768, -32768, -32768, -32768, &frame_);
  EXPECT_EQ(6u, frame_.num_channels());
  mixer.Transform(&frame_);
  EXPECT_EQ(1u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_MONO, frame_.channel_layout());
  SetMonoData(-32768, &mono_frame);
  VerifyFramesAreEqual(mono_frame, frame_);
}

TEST_F(ChannelMixerTest, FiveOneToSevenOne) {
  ChannelMixer mixer(CHANNEL_LAYOUT_5_1, CHANNEL_LAYOUT_7_1);
  //
  //                         Input: 5.1
  //                         LEFT  RIGHT  CENTER  LFE  SIDE_LEFT  SIDE_RIGHT
  // Output: 7.1 LEFT        1     0      0       0    0          0
  //             RIGHT       0     1      0       0    0          0
  //             CENTER      0     0      1       0    0          0
  //             LFE         0     0      0       1    0          0
  //             SIDE_LEFT   0     0      0       0    1          0
  //             SIDE_RIGHT  0     0      0       0    0          1
  //             BACK_LEFT   0     0      0       0    0          0
  //             BACK_RIGHT  0     0      0       0    0          0
  //
  SetFiveOneData(10, 20, 15, 2, 5, 5, &frame_);
  EXPECT_EQ(6u, frame_.num_channels());
  mixer.Transform(&frame_);
  EXPECT_EQ(8u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_7_1, frame_.channel_layout());

  AudioFrame seven_one_frame;
  seven_one_frame.samples_per_channel_ = frame_.samples_per_channel();
  SetSevenOneData(10, 20, 15, 2, 5, 5, 0, 0, &seven_one_frame);
  VerifyFramesAreEqual(seven_one_frame, frame_);

  SetFiveOneData(-32768, 32767, -32768, 32767, -32768, 32767, &frame_);
  EXPECT_EQ(6u, frame_.num_channels());
  mixer.Transform(&frame_);
  EXPECT_EQ(8u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_7_1, frame_.channel_layout());
  SetSevenOneData(-32768, 32767, -32768, 32767, -32768, 32767, 0, 0,
                  &seven_one_frame);
  VerifyFramesAreEqual(seven_one_frame, frame_);
}

TEST_F(ChannelMixerTest, FiveOneBackToStereo) {
  ChannelMixer mixer(CHANNEL_LAYOUT_5_1_BACK, CHANNEL_LAYOUT_STEREO);
  //
  //                       Input: 5.1
  //                       LEFT  RIGHT  CENTER  LFE    BACK_LEFT  BACK_RIGHT
  // Output: stereo LEFT   1     0      0.707   0.707  0.707      0
  //                RIGHT  0     1      0.707   0.707  0          0.707
  //
  SetFiveOneData(20, 30, 15, 2, 5, 5, &frame_);
  EXPECT_EQ(6u, frame_.num_channels());
  mixer.Transform(&frame_);
  EXPECT_EQ(2u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_STEREO, frame_.channel_layout());

  AudioFrame stereo_frame;
  stereo_frame.samples_per_channel_ = frame_.samples_per_channel();
  SetStereoData(35, 45, &stereo_frame);
  VerifyFramesAreEqual(stereo_frame, frame_);

  SetFiveOneData(-32768, -32768, -32768, -32768, -32768, -32768, &frame_);
  EXPECT_EQ(6u, frame_.num_channels());
  mixer.Transform(&frame_);
  EXPECT_EQ(2u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_STEREO, frame_.channel_layout());
  SetStereoData(-32768, -32768, &stereo_frame);
  VerifyFramesAreEqual(stereo_frame, frame_);
}

TEST_F(ChannelMixerTest, MonoToStereo) {
  ChannelMixer mixer(CHANNEL_LAYOUT_MONO, CHANNEL_LAYOUT_STEREO);
  //
  //                       Input: mono
  //                       CENTER
  // Output: stereo LEFT   1
  //                RIGHT  1
  //
  SetMonoData(44, &frame_);
  EXPECT_EQ(1u, frame_.num_channels());
  mixer.Transform(&frame_);
  EXPECT_EQ(2u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_STEREO, frame_.channel_layout());

  AudioFrame stereo_frame;
  stereo_frame.samples_per_channel_ = frame_.samples_per_channel();
  SetStereoData(44, 44, &stereo_frame);
  VerifyFramesAreEqual(stereo_frame, frame_);
}

TEST_F(ChannelMixerTest, StereoToFiveOne) {
  ChannelMixer mixer(CHANNEL_LAYOUT_STEREO, CHANNEL_LAYOUT_5_1);
  //
  //                          Input: Stereo
  //                          LEFT  RIGHT
  // Output: 5.1  LEFT        1     0
  //              RIGHT       0     1
  //              CENTER      0     0
  //              LFE         0     0
  //              SIDE_LEFT   0     0
  //              SIDE_RIGHT  0     0
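  //
  // With this matrix the stereo pair passes straight through to the 5.1
  // LEFT/RIGHT channels, while CENTER, LFE, SIDE_LEFT and SIDE_RIGHT are
  // zero-filled; hence the expected frame below is (50, 60, 0, 0, 0, 0).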
  //
  SetStereoData(50, 60, &frame_);
  EXPECT_EQ(2u, frame_.num_channels());
  mixer.Transform(&frame_);
  EXPECT_EQ(6u, frame_.num_channels());
  EXPECT_EQ(CHANNEL_LAYOUT_5_1, frame_.channel_layout());

  AudioFrame five_one_frame;
  five_one_frame.samples_per_channel_ = frame_.samples_per_channel();
  SetFiveOneData(50, 60, 0, 0, 0, 0, &five_one_frame);
  VerifyFramesAreEqual(five_one_frame, frame_);
}

}  // namespace webrtc