/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "AudioSink.h"

#include "AudioConverter.h"
#include "AudioDeviceInfo.h"
#include "MediaQueue.h"
#include "VideoUtils.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/DebugOnly.h"
#include "mozilla/IntegerPrintfMacros.h"
#include "mozilla/ProfilerMarkerTypes.h"
#include "mozilla/StaticPrefs_media.h"
#include "mozilla/StaticPrefs_dom.h"
#include "nsPrintfCString.h"
#include "Tracing.h"

namespace mozilla {

mozilla::LazyLogModule gAudioSinkLog("AudioSink");
#define SINK_LOG(msg, ...)                  \
  MOZ_LOG(gAudioSinkLog, LogLevel::Debug,   \
          ("AudioSink=%p " msg, this, ##__VA_ARGS__))
#define SINK_LOG_V(msg, ...)                \
  MOZ_LOG(gAudioSinkLog, LogLevel::Verbose, \
          ("AudioSink=%p " msg, this, ##__VA_ARGS__))

// The number of audio frames used to fuzz rounding errors.
static const int64_t AUDIO_FUZZ_FRAMES = 1;

using media::TimeUnit;

AudioSink::AudioSink(AbstractThread* aThread,
                     MediaQueue<AudioData>& aAudioQueue, const AudioInfo& aInfo,
                     bool aShouldResistFingerprinting)
    : mPlaying(true),
      mWritten(0),
      mErrored(false),
      mOwnerThread(aThread),
      mFramesParsed(0),
      mOutputRate(
          DecideAudioPlaybackSampleRate(aInfo, aShouldResistFingerprinting)),
      mOutputChannels(DecideAudioPlaybackChannels(aInfo)),
      mAudibilityMonitor(
          mOutputRate,
          StaticPrefs::dom_media_silence_duration_for_audibility()),
      mIsAudioDataAudible(false),
      mProcessedQueueFinished(false),
      mAudioQueue(aAudioQueue),
      mProcessedQueueThresholdMS(
          StaticPrefs::media_audio_audiosink_threshold_ms()) {
  // Not much to initialize here if there's no audio.
  if (!aInfo.IsValid()) {
    mProcessedSPSCQueue = MakeUnique<SPSCQueue<AudioDataValue>>(0);
    return;
  }
  // Twice the limit that triggers a refill.
  double capacitySeconds = mProcessedQueueThresholdMS / 1000.f * 2;
  // Clamp to correct boundaries, and align on the channel count.
  int elementCount = static_cast<int>(
      std::clamp(capacitySeconds * mOutputChannels * mOutputRate, 0.,
                 std::numeric_limits<int>::max() - 1.));
  elementCount -= elementCount % mOutputChannels;
  mProcessedSPSCQueue = MakeUnique<SPSCQueue<AudioDataValue>>(elementCount);
  SINK_LOG("Ringbuffer has space for %u elements (%lf seconds)",
           mProcessedSPSCQueue->Capacity(),
           static_cast<double>(elementCount) / mOutputChannels / mOutputRate);
  // Determine if the data is likely to be audible when the stream will be
  // ready, if possible.
  RefPtr<AudioData> frontPacket = mAudioQueue.PeekFront();
  if (frontPacket) {
    mAudibilityMonitor.ProcessInterleaved(frontPacket->Data(),
                                          frontPacket->mChannels);
    mIsAudioDataAudible = mAudibilityMonitor.RecentlyAudible();
    SINK_LOG("New AudioSink -- audio is likely to be %s",
             mIsAudioDataAudible ? "audible" : "inaudible");
  } else {
    // If no packets are available, consider the audio audible.
    mIsAudioDataAudible = true;
    SINK_LOG(
        "New AudioSink -- no audio packet available, considering the stream "
        "audible");
  }
}
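
// Worked example of the ring buffer sizing above (illustrative numbers, not
// necessarily the pref's default): with a 500 ms refill threshold and 48 kHz
// stereo output,
//   capacitySeconds = 0.5 * 2 = 1.0
//   elementCount    = 1.0 * 2 * 48000 = 96000 interleaved samples,
// i.e. one second of audio. 96000 is already a multiple of the channel
// count, so the alignment step is a no-op in this case.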
AudioSink::~AudioSink() {
  // Generally, instances of AudioSink should be shut down manually. The only
  // way an AudioSink can be deleted without having been shut down is if the
  // dispatch back to the MDSM thread, after initializing it asynchronously,
  // fails. When that's the case, the stream has been initialized but not
  // started. Manually shut down the AudioStream in this case.
  if (mAudioStream) {
    mAudioStream->Shutdown();
  }
}

nsresult AudioSink::InitializeAudioStream(
    const PlaybackParams& aParams, const RefPtr<AudioDeviceInfo>& aAudioDevice,
    AudioSink::InitializationType aInitializationType) {
  if (aInitializationType == AudioSink::InitializationType::UNMUTING) {
    // Consider the stream to be audible immediately, before initialization
    // finishes when unmuting, in case initialization takes some time and it
    // looked audible when the AudioSink was created.
    mAudibleEvent.Notify(mIsAudioDataAudible);
    SINK_LOG("InitializeAudioStream (Unmuting) notifying that audio is %s",
             mIsAudioDataAudible ? "audible" : "inaudible");
  } else {
    // If not unmuting, the audibility event will be dispatched as usual,
    // inspecting the audio content as it's being played and signaling the
    // audibility event when a change in state is detected.
    SINK_LOG("InitializeAudioStream (initial)");
    mIsAudioDataAudible = false;
  }

  // When AudioQueue is empty, there is no way to know the channel layout of
  // the upcoming audio data, so we use the predefined channel map instead.
  AudioConfig::ChannelLayout::ChannelMap channelMap =
      AudioConfig::ChannelLayout(mOutputChannels).Map();
  // The layout map used here is already processed by mConverter with
  // mOutputChannels into SMPTE format, so there is no need to worry whether
  // StaticPrefs::accessibility_monoaudio_enable() or
  // StaticPrefs::media_forcestereo_enabled() is applied.
  MOZ_ASSERT(!mAudioStream);
  mAudioStream =
      new AudioStream(*this, mOutputRate, mOutputChannels, channelMap);
  nsresult rv = mAudioStream->Init(aAudioDevice);
  if (NS_FAILED(rv)) {
    mAudioStream->Shutdown();
    mAudioStream = nullptr;
    return rv;
  }

  // Set playback params before calling Start() so they can take effect
  // as soon as the 1st DataCallback of the AudioStream fires.
  mAudioStream->SetVolume(aParams.mVolume);
  mAudioStream->SetPlaybackRate(aParams.mPlaybackRate);
  mAudioStream->SetPreservesPitch(aParams.mPreservesPitch);
  return NS_OK;
}

nsresult AudioSink::Start(
    const media::TimeUnit& aStartTime,
    MozPromiseHolder<MediaSink::EndedPromise>& aEndedPromise) {
  MOZ_ASSERT(mOwnerThread->IsCurrentThreadIn());

  mAudioQueueListener = mAudioQueue.PushEvent().Connect(
      mOwnerThread, this, &AudioSink::OnAudioPushed);
  mAudioQueueFinishListener = mAudioQueue.FinishEvent().Connect(
      mOwnerThread, this, &AudioSink::NotifyAudioNeeded);
  mProcessedQueueListener =
      mAudioPopped.Connect(mOwnerThread, this, &AudioSink::OnAudioPopped);

  mStartTime = aStartTime;

  // To ensure at least one audio packet will be popped from AudioQueue and
  // ready to be played.
  NotifyAudioNeeded();

  return mAudioStream->Start(aEndedPromise);
}

TimeUnit AudioSink::GetPosition() {
  int64_t tmp;
  if (mAudioStream && (tmp = mAudioStream->GetPosition()) >= 0) {
    TimeUnit pos = TimeUnit::FromMicroseconds(tmp);
    NS_ASSERTION(pos >= mLastGoodPosition,
                 "AudioStream position shouldn't go backward");
    TimeUnit absolutePos = mStartTime + pos;
    if (!absolutePos.IsValid()) {
      mErrored = true;
      return mStartTime + mLastGoodPosition;
    }
    // Update the last good position when we got a good one.
    if (pos >= mLastGoodPosition) {
      mLastGoodPosition = pos;
    }
  }
  return mStartTime + mLastGoodPosition;
}
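
// Illustrative clock behavior for GetPosition() above (hypothetical values):
// if the stream reports 0.8 s, then 1.0 s, then a spurious 0.9 s,
// mLastGoodPosition stays at 1.0 s and the function keeps returning
// mStartTime + 1.0 s, so the reported position never moves backwards.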
bool AudioSink::HasUnplayedFrames() {
  // Experimentation suggests that GetPositionInFrames() is zero-indexed,
  // so we need to add 1 here before comparing it to mWritten.
  return mProcessedSPSCQueue->AvailableRead() ||
         (mAudioStream && mAudioStream->GetPositionInFrames() + 1 < mWritten);
}

TimeUnit AudioSink::UnplayedDuration() const {
  return TimeUnit::FromMilliseconds(AudioQueuedInRingBufferMS());
}

void AudioSink::ReenqueueUnplayedAudioDataIfNeeded() {
  // This is OK: the AudioStream has been shut down. Shutdown guarantees that
  // the audio callback thread won't call back again.
  mProcessedSPSCQueue->ResetThreadIds();

  // Construct AudioData packets from the samples left in the ring buffer.
  int samplesInRingbuffer = mProcessedSPSCQueue->AvailableRead();

  if (!samplesInRingbuffer) {
    return;
  }

  uint32_t channelCount;
  uint32_t rate;
  if (mConverter) {
    channelCount = mConverter->OutputConfig().Channels();
    rate = mConverter->OutputConfig().Rate();
  } else {
    channelCount = mOutputChannels;
    rate = mOutputRate;
  }

  uint32_t framesRemaining = samplesInRingbuffer / channelCount;

  nsTArray<AlignedAudioBuffer> packetsToReenqueue;
  RefPtr<AudioData> frontPacket = mAudioQueue.PeekFront();
  uint32_t offset;
  TimeUnit time;
  uint32_t typicalPacketFrameCount;
  // Extrapolate mOffset and mTime from the front of the queue.
  // We can't really find a good value for `mOffset`, so we take what we have
  // at the front of the queue.
  // For `mTime`, assume there hasn't been a discontinuity recently.
  if (!frontPacket) {
    // We do our best here, but it's not going to be perfect.
    typicalPacketFrameCount = 1024;  // typical for e.g. AAC
    offset = 0;
    time = GetPosition();
  } else {
    typicalPacketFrameCount = frontPacket->Frames();
    offset = frontPacket->mOffset;
    time = frontPacket->mTime;
  }

  // Extract all the audio data from the ring buffer. It can only be read
  // oldest-first, so the data is packetized into a temporary array first.
  while (framesRemaining) {
    uint32_t packetFrameCount =
        std::min(framesRemaining, typicalPacketFrameCount);
    framesRemaining -= packetFrameCount;
    int packetSampleCount = packetFrameCount * channelCount;
    AlignedAudioBuffer packetData(packetSampleCount);
    DebugOnly<int> samplesRead =
        mProcessedSPSCQueue->Dequeue(packetData.Data(), packetSampleCount);
    MOZ_ASSERT(samplesRead == packetSampleCount);
    packetsToReenqueue.AppendElement(packetData);
  }
  // Re-enqueue the packets in the audio queue in the correct order, starting
  // with the end of the temporary array.
  while (!packetsToReenqueue.IsEmpty()) {
    auto packetData = packetsToReenqueue.PopLastElement();
    uint32_t packetFrameCount = packetData.Length() / channelCount;
    auto duration = TimeUnit(packetFrameCount, rate);
    if (!duration.IsValid()) {
      NS_WARNING("Int overflow in AudioSink");
      mErrored = true;
      return;
    }
    time -= duration;
    RefPtr<AudioData> packet =
        new AudioData(offset, time, std::move(packetData), channelCount, rate);
    MOZ_DIAGNOSTIC_ASSERT(duration == packet->mDuration, "must be equal");
    SINK_LOG(
        "Muting: pushing %u frames (%lf ms) from the ring buffer back into "
        "the audio queue at pts %lf",
        packetFrameCount, 1000 * static_cast<double>(packetFrameCount) / rate,
        time.ToSeconds());
    // The audio data's timestamps have already been adjusted if we're
    // looping, so we don't want to adjust them again.
    mAudioQueue.PushFront(packet,
                          MediaQueue<AudioData>::TimestampAdjustment::Disable);
  }
}
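
// A sketch of the re-enqueue ordering above (hypothetical packets): if the
// ring buffer holds packets A, B, C with A the oldest, Dequeue() fills
// packetsToReenqueue = [A, B, C]; PopLastElement() then pushes C, B and
// finally A to the front of the queue, so the queue front reads A, B, C
// again, with each mTime extrapolated backwards from the front packet's
// timestamp.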
Maybe<MozPromiseHolder<MediaSink::EndedPromise>> AudioSink::Shutdown(
    ShutdownCause aShutdownCause) {
  MOZ_ASSERT(mOwnerThread->IsCurrentThreadIn());

  mAudioQueueListener.DisconnectIfExists();
  mAudioQueueFinishListener.DisconnectIfExists();
  mProcessedQueueListener.DisconnectIfExists();

  Maybe<MozPromiseHolder<MediaSink::EndedPromise>> rv;
  if (mAudioStream) {
    rv = mAudioStream->Shutdown(aShutdownCause);
    mAudioStream = nullptr;
    if (aShutdownCause == ShutdownCause::Muting) {
      ReenqueueUnplayedAudioDataIfNeeded();
    }
  }
  mProcessedQueueFinished = true;
  return rv;
}

void AudioSink::SetVolume(double aVolume) {
  if (mAudioStream) {
    mAudioStream->SetVolume(aVolume);
  }
}

void AudioSink::SetStreamName(const nsAString& aStreamName) {
  if (mAudioStream) {
    mAudioStream->SetStreamName(aStreamName);
  }
}

void AudioSink::SetPlaybackRate(double aPlaybackRate) {
  MOZ_ASSERT(aPlaybackRate != 0,
             "Don't set the playbackRate to 0 on AudioStream");
  if (mAudioStream) {
    mAudioStream->SetPlaybackRate(aPlaybackRate);
  }
}

void AudioSink::SetPreservesPitch(bool aPreservesPitch) {
  if (mAudioStream) {
    mAudioStream->SetPreservesPitch(aPreservesPitch);
  }
}

void AudioSink::SetPlaying(bool aPlaying) {
  if (!mAudioStream || mAudioStream->IsPlaybackCompleted() ||
      mPlaying == aPlaying) {
    return;
  }
  // Pause/resume the AudioStream as necessary.
  if (!aPlaying) {
    mAudioStream->Pause();
  } else {
    mAudioStream->Resume();
  }
  mPlaying = aPlaying;
}

TimeUnit AudioSink::GetEndTime() const {
  uint64_t written = mWritten;
  TimeUnit played = media::TimeUnit(written, mOutputRate) + mStartTime;
  if (!played.IsValid()) {
    NS_WARNING("Int overflow calculating audio end time");
    return TimeUnit::Zero();
  }
  // As we may be resampling, rounding errors may occur. Ensure we never get
  // past the original end time.
  return std::min(mLastEndTime, played);
}

uint32_t AudioSink::PopFrames(AudioDataValue* aBuffer, uint32_t aFrames,
                              bool aAudioThreadChanged) {
  // This is safe because we have the guarantee, from the OS, that audio
  // callbacks are never called concurrently. Audio thread changes can only
  // happen when not using cubeb remoting, and often when changing the audio
  // device at the system level.
  if (aAudioThreadChanged) {
    mProcessedSPSCQueue->ResetThreadIds();
  }
  TRACE_COMMENT("AudioSink::PopFrames", "%u frames (ringbuffer: %u/%u)",
                aFrames, SampleToFrame(mProcessedSPSCQueue->AvailableRead()),
                SampleToFrame(mProcessedSPSCQueue->Capacity()));
  const int samplesToPop = static_cast<int>(aFrames * mOutputChannels);
  const int samplesRead = mProcessedSPSCQueue->Dequeue(aBuffer, samplesToPop);
  auto sampleOut = samplesRead;
  MOZ_ASSERT(samplesRead % mOutputChannels == 0);
  mWritten += SampleToFrame(samplesRead);
  if (samplesRead != samplesToPop) {
    if (Ended()) {
      SINK_LOG("Last PopFrames -- Source ended.");
    } else if (mTreatUnderrunAsSilence) {
      SINK_LOG("Treating underrun frames (%u) as silence frames",
               SampleToFrame(samplesToPop - samplesRead));
      sampleOut = samplesToPop;
    } else {
      NS_WARNING("Underrun when popping samples from audiosink ring buffer.");
      TRACE_COMMENT("AudioSink::PopFrames", "Underrun %u frames missing",
                    SampleToFrame(samplesToPop - samplesRead));
    }
    // Silence the rest.
    PodZero(aBuffer + samplesRead, samplesToPop - samplesRead);
  }
  mAudioPopped.Notify();

  SINK_LOG_V("Popping %u frames. Remaining in ringbuffer %u / %u\n", aFrames,
             SampleToFrame(mProcessedSPSCQueue->AvailableRead()),
             SampleToFrame(mProcessedSPSCQueue->Capacity()));
  CheckIsAudible(Span(aBuffer, sampleOut), mOutputChannels);

  return SampleToFrame(sampleOut);
}
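
// A note on the frame/sample accounting used throughout this file (worked
// example assuming stereo output): one frame is one sample per channel, so a
// callback requesting aFrames = 480 at mOutputChannels = 2 dequeues 960
// interleaved samples, and SampleToFrame(960) = 480 frames are credited to
// mWritten.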
bool AudioSink::Ended() const {
  // Return true when an error is encountered so AudioStream can start
  // draining. Both members are atomic, so we don't need locking.
  return mProcessedQueueFinished || mErrored;
}

void AudioSink::CheckIsAudible(const Span<AudioDataValue>& aInterleaved,
                               size_t aChannel) {
  mAudibilityMonitor.ProcessInterleaved(aInterleaved, aChannel);
  bool isAudible = mAudibilityMonitor.RecentlyAudible();

  if (isAudible != mIsAudioDataAudible) {
    mIsAudioDataAudible = isAudible;
    SINK_LOG("Notifying that audio is now %s",
             mIsAudioDataAudible ? "audible" : "inaudible");
    mAudibleEvent.Notify(mIsAudioDataAudible);
  }
}

void AudioSink::OnAudioPopped() {
  SINK_LOG_V("AudioStream has used an audio packet.");
  NotifyAudioNeeded();
}

void AudioSink::OnAudioPushed(const RefPtr<AudioData>& aSample) {
  SINK_LOG_V("One new audio packet available.");
  NotifyAudioNeeded();
}

uint32_t AudioSink::AudioQueuedInRingBufferMS() const {
  return static_cast<uint32_t>(
      1000 * SampleToFrame(mProcessedSPSCQueue->AvailableRead()) /
      mOutputRate);
}

uint32_t AudioSink::SampleToFrame(uint32_t aSamples) const {
  return aSamples / mOutputChannels;
}

void AudioSink::NotifyAudioNeeded() {
  MOZ_ASSERT(mOwnerThread->IsCurrentThreadIn(),
             "Not called from the owner's thread");

  while (mAudioQueue.GetSize() &&
         AudioQueuedInRingBufferMS() <
             static_cast<uint32_t>(mProcessedQueueThresholdMS)) {
    // Check if there's room in our ring buffer.
    if (mAudioQueue.PeekFront()->Frames() >
        SampleToFrame(mProcessedSPSCQueue->AvailableWrite())) {
      SINK_LOG_V("Can't push %u frames. In ringbuffer %u / %u\n",
                 mAudioQueue.PeekFront()->Frames(),
                 SampleToFrame(mProcessedSPSCQueue->AvailableRead()),
                 SampleToFrame(mProcessedSPSCQueue->Capacity()));
      return;
    }
    SINK_LOG_V("Pushing %u frames. In ringbuffer %u / %u\n",
               mAudioQueue.PeekFront()->Frames(),
               SampleToFrame(mProcessedSPSCQueue->AvailableRead()),
               SampleToFrame(mProcessedSPSCQueue->Capacity()));
    RefPtr<AudioData> data = mAudioQueue.PopFront();

    // Ignore packets with 0 frames and try the next one.
    if (!data->Frames()) {
      continue;
    }

    if (!mConverter ||
        (data->mRate != mConverter->InputConfig().Rate() ||
         data->mChannels != mConverter->InputConfig().Channels())) {
      SINK_LOG_V("Audio format changed from %u@%uHz to %u@%uHz",
                 mConverter ? mConverter->InputConfig().Channels() : 0,
                 mConverter ? mConverter->InputConfig().Rate() : 0,
                 data->mChannels, data->mRate);

      DrainConverter(SampleToFrame(mProcessedSPSCQueue->AvailableWrite()));

      // mFramesParsed indicates the current playtime in frames at the current
      // input sampling rate. Recalculate it per the new sampling rate.
      if (mFramesParsed) {
        // Use SaferMultDiv to minimize the risk of overflow.
        uint32_t oldRate = mConverter->InputConfig().Rate();
        uint32_t newRate = data->mRate;
        CheckedInt64 result = SaferMultDiv(mFramesParsed, newRate, oldRate);
        if (!result.isValid()) {
          NS_WARNING("Int overflow in AudioSink");
          mErrored = true;
          return;
        }
        mFramesParsed = result.value();
      }
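
      // Illustrative rescale (hypothetical rates): if mFramesParsed is 44100
      // (one second parsed at 44.1 kHz) and the input switches to 48 kHz,
      // the new value is SaferMultDiv(44100, 48000, 44100) = 48000, i.e.
      // still one second of playtime expressed at the new rate.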
      const AudioConfig::ChannelLayout inputLayout =
          data->mChannelMap
              ? AudioConfig::ChannelLayout::SMPTEDefault(data->mChannelMap)
              : AudioConfig::ChannelLayout(data->mChannels);
      const AudioConfig::ChannelLayout outputLayout =
          mOutputChannels == data->mChannels
              ? inputLayout
              : AudioConfig::ChannelLayout(mOutputChannels);
      AudioConfig inConfig =
          AudioConfig(inputLayout, data->mChannels, data->mRate);
      AudioConfig outConfig =
          AudioConfig(outputLayout, mOutputChannels, mOutputRate);
      if (!AudioConverter::CanConvert(inConfig, outConfig)) {
        mErrored = true;
        return;
      }
      mConverter = MakeUnique<AudioConverter>(inConfig, outConfig);
    }

    // See if there's a gap in the audio. If there is, push silence into the
    // audio hardware, so we can play across the gap.
    // Calculate the timestamp of the next chunk of audio, in frames.
    CheckedInt64 sampleTime =
        TimeUnitToFrames(data->mTime - mStartTime, data->mRate);
    // Calculate how many frames of silence are needed between the frames
    // pushed so far (mFramesParsed) and the next packet.
    CheckedInt64 missingFrames = sampleTime - mFramesParsed;

    if (!missingFrames.isValid() || !sampleTime.isValid()) {
      NS_WARNING("Int overflow in AudioSink");
      mErrored = true;
      return;
    }

    if (missingFrames.value() > AUDIO_FUZZ_FRAMES) {
      // The next audio packet begins some time after the end of the last
      // packet we pushed to the audio hardware. We must push silence into
      // the audio hardware so that the next audio packet begins playback at
      // the correct time. But don't push more than the ring buffer can
      // receive.
      missingFrames = std::min<int64_t>(
          std::min<int64_t>(INT32_MAX, missingFrames.value()),
          SampleToFrame(mProcessedSPSCQueue->AvailableWrite()));
      mFramesParsed += missingFrames.value();
      SINK_LOG("Gap in the audio input, push %" PRId64 " frames of silence",
               missingFrames.value());

      RefPtr<AudioData> silenceData;
      AlignedAudioBuffer silenceBuffer(missingFrames.value() *
                                       data->mChannels);
      if (!silenceBuffer) {
        NS_WARNING("OOM in AudioSink");
        mErrored = true;
        return;
      }
      if (mConverter->InputConfig() != mConverter->OutputConfig()) {
        AlignedAudioBuffer convertedData =
            mConverter->Process(AudioSampleBuffer(std::move(silenceBuffer)))
                .Forget();
        silenceData = CreateAudioFromBuffer(std::move(convertedData), data);
      } else {
        silenceData = CreateAudioFromBuffer(std::move(silenceBuffer), data);
      }
      TRACE("Pushing silence");
      PushProcessedAudio(silenceData);
    }

    mLastEndTime = data->GetEndTime();
    mFramesParsed += data->Frames();

    if (mConverter->InputConfig() != mConverter->OutputConfig()) {
      AlignedAudioBuffer buffer(data->MoveableData());
      AlignedAudioBuffer convertedData =
          mConverter->Process(AudioSampleBuffer(std::move(buffer))).Forget();
      data = CreateAudioFromBuffer(std::move(convertedData), data);
    }
    if (PushProcessedAudio(data)) {
      mLastProcessedPacket = Some(data);
    }
  }

  if (mAudioQueue.IsFinished() && mAudioQueue.GetSize() == 0) {
    // We have reached the end of the data, drain the resampler.
    DrainConverter(SampleToFrame(mProcessedSPSCQueue->AvailableWrite()));
    mProcessedQueueFinished = true;
  }
}
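
// Worked example of the gap handling in NotifyAudioNeeded (hypothetical
// numbers): with mStartTime = 0 and mFramesParsed = 48000, a packet whose
// mTime corresponds to frame 48480 at 48 kHz yields missingFrames = 480
// (10 ms). That exceeds AUDIO_FUZZ_FRAMES, so 480 frames of silence are
// pushed first and the packet starts playback at its intended timestamp.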
uint32_t AudioSink::PushProcessedAudio(AudioData* aData) {
  if (!aData || !aData->Frames()) {
    return 0;
  }
  // The ring buffer stores interleaved samples: frame count times channel
  // count.
  const int samplesToEnqueue =
      static_cast<int>(aData->Frames() * aData->mChannels);
  TRACE_COMMENT("AudioSink::PushProcessedAudio", "%u frames (%u/%u)",
                SampleToFrame(samplesToEnqueue),
                SampleToFrame(mProcessedSPSCQueue->AvailableWrite()),
                SampleToFrame(mProcessedSPSCQueue->Capacity()));
  DebugOnly<int> rv =
      mProcessedSPSCQueue->Enqueue(aData->Data().Elements(), samplesToEnqueue);
  NS_WARNING_ASSERTION(rv == samplesToEnqueue,
                       "AudioSink ring buffer over-run, can't push new data");
  return aData->Frames();
}

already_AddRefed<AudioData> AudioSink::CreateAudioFromBuffer(
    AlignedAudioBuffer&& aBuffer, AudioData* aReference) {
  uint32_t frames = SampleToFrame(aBuffer.Length());
  if (!frames) {
    return nullptr;
  }
  auto duration = media::TimeUnit(frames, mOutputRate);
  if (!duration.IsValid()) {
    NS_WARNING("Int overflow in AudioSink");
    mErrored = true;
    return nullptr;
  }
  RefPtr<AudioData> data =
      new AudioData(aReference->mOffset, aReference->mTime, std::move(aBuffer),
                    mOutputChannels, mOutputRate);
  MOZ_DIAGNOSTIC_ASSERT(duration == data->mDuration, "must be equal");
  return data.forget();
}

uint32_t AudioSink::DrainConverter(uint32_t aMaxFrames) {
  MOZ_ASSERT(mOwnerThread->IsCurrentThreadIn());

  if (!mConverter || !mLastProcessedPacket || !aMaxFrames) {
    // Nothing to drain.
    return 0;
  }

  RefPtr<AudioData> lastPacket = mLastProcessedPacket.ref();
  mLastProcessedPacket.reset();

  // To drain we simply provide an empty packet to the audio converter.
  AlignedAudioBuffer convertedData =
      mConverter->Process(AudioSampleBuffer(AlignedAudioBuffer())).Forget();

  uint32_t frames = SampleToFrame(convertedData.Length());
  if (!convertedData.SetLength(std::min(frames, aMaxFrames) *
                               mOutputChannels)) {
    // This can never happen as we were reducing the length of convertedData.
    mErrored = true;
    return 0;
  }

  RefPtr<AudioData> data =
      CreateAudioFromBuffer(std::move(convertedData), lastPacket);
  return PushProcessedAudio(data);
}

void AudioSink::GetDebugInfo(dom::MediaSinkDebugInfo& aInfo) {
  MOZ_ASSERT(mOwnerThread->IsCurrentThreadIn());
  aInfo.mAudioSinkWrapper.mAudioSink.mStartTime = mStartTime.ToMicroseconds();
  aInfo.mAudioSinkWrapper.mAudioSink.mLastGoodPosition =
      mLastGoodPosition.ToMicroseconds();
  aInfo.mAudioSinkWrapper.mAudioSink.mIsPlaying = mPlaying;
  aInfo.mAudioSinkWrapper.mAudioSink.mOutputRate = mOutputRate;
  aInfo.mAudioSinkWrapper.mAudioSink.mWritten = mWritten;
  aInfo.mAudioSinkWrapper.mAudioSink.mHasErrored = bool(mErrored);
  aInfo.mAudioSinkWrapper.mAudioSink.mPlaybackComplete =
      mAudioStream ? mAudioStream->IsPlaybackCompleted() : false;
}

void AudioSink::EnableTreatAudioUnderrunAsSilence(bool aEnabled) {
  SINK_LOG("set mTreatUnderrunAsSilence=%d", aEnabled);
  mTreatUnderrunAsSilence = aEnabled;
}

}  // namespace mozilla