/*
 *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/test/conversational_speech/multiend_call.h"

#include <algorithm>
#include <iterator>

#include "absl/strings/string_view.h"
#include "rtc_base/logging.h"
#include "test/testsupport/file_utils.h"

namespace webrtc {
namespace test {
namespace conversational_speech {

MultiEndCall::MultiEndCall(
    rtc::ArrayView<const Turn> timing,
    absl::string_view audiotracks_path,
    std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory)
    : timing_(timing),
      audiotracks_path_(audiotracks_path),
      wavreader_abstract_factory_(std::move(wavreader_abstract_factory)),
      valid_(false) {
  FindSpeakerNames();
  if (CreateAudioTrackReaders())
    valid_ = CheckTiming();
}

MultiEndCall::~MultiEndCall() = default;

void MultiEndCall::FindSpeakerNames() {
  RTC_DCHECK(speaker_names_.empty());
  for (const Turn& turn : timing_) {
    speaker_names_.emplace(turn.speaker_name);
  }
}

bool MultiEndCall::CreateAudioTrackReaders() {
  RTC_DCHECK(audiotrack_readers_.empty());
  sample_rate_hz_ = 0;  // Sample rate will be set when reading the first track.
  for (const Turn& turn : timing_) {
    auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
    if (it != audiotrack_readers_.end())
      continue;

    const std::string audiotrack_file_path =
        test::JoinFilename(audiotracks_path_, turn.audiotrack_file_name);

    // Map the audiotrack file name to a new instance of WavReaderInterface.
    std::unique_ptr<WavReaderInterface> wavreader =
        wavreader_abstract_factory_->Create(audiotrack_file_path);

    if (sample_rate_hz_ == 0) {
      sample_rate_hz_ = wavreader->SampleRate();
    } else if (sample_rate_hz_ != wavreader->SampleRate()) {
      RTC_LOG(LS_ERROR)
          << "All the audio tracks should have the same sample rate.";
      return false;
    }

    if (wavreader->NumChannels() != 1) {
      RTC_LOG(LS_ERROR) << "Only mono audio tracks supported.";
      return false;
    }

    audiotrack_readers_.emplace(turn.audiotrack_file_name,
                                std::move(wavreader));
  }

  return true;
}

bool MultiEndCall::CheckTiming() {
  struct Interval {
    size_t begin;
    size_t end;
  };
  size_t number_of_turns = timing_.size();
  auto millisecond_to_samples = [](int ms, int sr) -> int {
    // Truncation may happen if the sampling rate is not an integer multiple
    // of 1000 (e.g., 44100): 1 ms at 44100 Hz is 44.1 samples, truncated
    // to 44.
    return ms * sr / 1000;
  };
  auto in_interval = [](size_t value, const Interval& interval) {
    return interval.begin <= value && value < interval.end;
  };
  total_duration_samples_ = 0;
  speaking_turns_.clear();

  // Begin and end timestamps for the last two turns (unit: number of samples).
  Interval second_last_turn = {0, 0};
  Interval last_turn = {0, 0};

  // Initialize map to store speaking turn indices of each speaker (used to
  // detect self cross-talk).
  std::map<std::string, std::vector<size_t>> speaking_turn_indices;
  for (const std::string& speaker_name : speaker_names_) {
    speaking_turn_indices.emplace(std::piecewise_construct,
                                  std::forward_as_tuple(speaker_name),
                                  std::forward_as_tuple());
  }

  // Parse turns.
  for (size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) {
    const Turn& turn = timing_[turn_index];
    auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
    RTC_CHECK(it != audiotrack_readers_.end())
        << "Audio track reader not created";

    // Begin and end timestamps for the current turn.
    int offset_samples =
        millisecond_to_samples(turn.offset, it->second->SampleRate());
    std::size_t begin_timestamp = last_turn.end + offset_samples;
    std::size_t end_timestamp = begin_timestamp + it->second->NumSamples();
    RTC_LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp << "-"
                     << end_timestamp << " samples";

    // The order is invalid if the offset is negative and its absolute value is
    // larger than the duration of the previous turn.
    if (offset_samples < 0 &&
        -offset_samples > static_cast<int>(last_turn.end - last_turn.begin)) {
      RTC_LOG(LS_ERROR) << "invalid order";
      return false;
    }

    // Cross-talk with 3 or more speakers occurs when the beginning of the
    // current interval falls in the last two turns.
    if (turn_index > 1 && in_interval(begin_timestamp, last_turn) &&
        in_interval(begin_timestamp, second_last_turn)) {
      RTC_LOG(LS_ERROR) << "cross-talk with 3+ speakers";
      return false;
    }

    // Append turn.
    speaking_turns_.emplace_back(turn.speaker_name, turn.audiotrack_file_name,
                                 begin_timestamp, end_timestamp, turn.gain);

    // Save speaking turn index for self cross-talk detection.
    RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);
    speaking_turn_indices[turn.speaker_name].push_back(turn_index);

    // Update total duration of the conversational speech.
    if (total_duration_samples_ < end_timestamp)
      total_duration_samples_ = end_timestamp;

    // Update and continue with next turn.
    second_last_turn = last_turn;
    last_turn.begin = begin_timestamp;
    last_turn.end = end_timestamp;
  }

  // Detect self cross-talk.
  for (const std::string& speaker_name : speaker_names_) {
    RTC_LOG(LS_INFO) << "checking self cross-talk for <" << speaker_name
                     << ">";

    // Copy all turns for this speaker to new vector.
    std::vector<SpeakingTurn> speaking_turns_for_name;
    std::copy_if(speaking_turns_.begin(), speaking_turns_.end(),
                 std::back_inserter(speaking_turns_for_name),
                 [&speaker_name](const SpeakingTurn& st) {
                   return st.speaker_name == speaker_name;
                 });

    // Check for overlap between adjacent elements.
    // This is a sufficient condition for self cross-talk since the intervals
    // are sorted by begin timestamp.
    auto overlap = std::adjacent_find(
        speaking_turns_for_name.begin(), speaking_turns_for_name.end(),
        [](const SpeakingTurn& a, const SpeakingTurn& b) {
          return a.end > b.begin;
        });

    if (overlap != speaking_turns_for_name.end()) {
      RTC_LOG(LS_ERROR) << "Self cross-talk detected";
      return false;
    }
  }

  return true;
}

}  // namespace conversational_speech
}  // namespace test
}  // namespace webrtc
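
// A minimal usage sketch (not part of the original file), kept in a comment so
// the translation unit is unchanged. It assumes the Turn constructor from
// timing.h takes (speaker_name, audiotrack_file_name, offset in ms, gain),
// that WavReaderFactory from wavreader_factory.h is a concrete
// WavReaderAbstractFactory, and that multiend_call.h exposes valid(),
// speaking_turns() and total_duration_samples() accessors; the WAV file names
// and path below are placeholders.
//
//   using namespace webrtc::test::conversational_speech;
//   std::vector<Turn> timing = {
//       {"A", "a1.wav", 0, 1.f},     // A starts at sample 0.
//       {"B", "b1.wav", -100, 1.f},  // B starts 100 ms before A ends
//                                    // (cross-talk between two speakers).
//   };
//   MultiEndCall call(timing, "/path/to/audiotracks",
//                     std::make_unique<WavReaderFactory>());
//   if (call.valid()) {
//     // speaking_turns() holds the begin/end sample timestamps of each turn;
//     // total_duration_samples() is the length of the whole conversation.
//   }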