/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-*/
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "OpusTrackEncoder.h"
#include "nsString.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/ProfilerLabels.h"
#include "VideoUtils.h"

// NOTE(review): the target of this include was lost in the mangled source;
// restored as the libopus public header since the file calls opus_encoder_*
// and opus_encode* -- confirm the exact path.
#include <opus/opus.h>

// Logging is compiled out in this file: LOG() expands to nothing.
#define LOG(args, ...)

namespace mozilla {

// The Opus format supports up to 8 channels, and supports multitrack audio up
// to 255 channels, but the current implementation supports only mono and
// stereo, and downmixes any more than that.
constexpr int MAX_SUPPORTED_AUDIO_CHANNELS = 8;

// http://www.opus-codec.org/docs/html_api-1.0.2/group__opus__encoder.html
// In section "opus_encoder_init", the channel count of the input signal must
// be 1 or 2.
constexpr int MAX_CHANNELS = 2;

// The maximum number of bytes of encoded data handed to the Opus encoder as
// output space for a single packet.
constexpr int MAX_DATA_BYTES = 4096;

// http://tools.ietf.org/html/draft-ietf-codec-oggopus-00#section-4
// Second paragraph, " The granule position of an audio data page is in units
// of PCM audio samples at a fixed rate of 48 kHz."
constexpr int kOpusSamplingRate = 48000;

// The duration of an Opus frame; it must be 2.5, 5, 10, 20, 40 or 60 ms.
constexpr int kFrameDurationMs = 20;

// The supported sampling rates of the input signal (Hz). Input at any other
// rate will be resampled to 48kHz.
constexpr int kOpusSupportedInputSamplingRates[] = {8000, 12000, 16000, 24000,
                                                    48000};

namespace {

// An endian-neutral serialization of integers. Serializing T in little endian
// format to aOutput, where T is a 16 bits or 32 bits integer.
template static void SerializeToBuffer(T aValue, nsTArray* aOutput) { for (uint32_t i = 0; i < sizeof(T); i++) { aOutput->AppendElement((uint8_t)(0x000000ff & (aValue >> (i * 8)))); } } static inline void SerializeToBuffer(const nsCString& aComment, nsTArray* aOutput) { // Format of serializing a string to buffer is, the length of string (32 bits, // little endian), and the string. SerializeToBuffer((uint32_t)(aComment.Length()), aOutput); aOutput->AppendElements(aComment.get(), aComment.Length()); } static void SerializeOpusIdHeader(uint8_t aChannelCount, uint16_t aPreskip, uint32_t aInputSampleRate, nsTArray* aOutput) { // The magic signature, null terminator has to be stripped off from strings. constexpr uint8_t magic[] = "OpusHead"; aOutput->AppendElements(magic, sizeof(magic) - 1); // The version must always be 1 (8 bits, unsigned). aOutput->AppendElement(1); // Number of output channels (8 bits, unsigned). aOutput->AppendElement(aChannelCount); // Number of samples (at 48 kHz) to discard from the decoder output when // starting playback (16 bits, unsigned, little endian). SerializeToBuffer(aPreskip, aOutput); // The sampling rate of input source (32 bits, unsigned, little endian). SerializeToBuffer(aInputSampleRate, aOutput); // Output gain, an encoder should set this field to zero (16 bits, signed, // little endian). SerializeToBuffer((int16_t)0, aOutput); // Channel mapping family. Family 0 allows only 1 or 2 channels (8 bits, // unsigned). aOutput->AppendElement(0); } static void SerializeOpusCommentHeader(const nsCString& aVendor, const nsTArray& aComments, nsTArray* aOutput) { // The magic signature, null terminator has to be stripped off. constexpr uint8_t magic[] = "OpusTags"; aOutput->AppendElements(magic, sizeof(magic) - 1); // The vendor; Should append in the following order: // vendor string length (32 bits, unsigned, little endian) // vendor string. 
SerializeToBuffer(aVendor, aOutput); // Add comments; Should append in the following order: // comment list length (32 bits, unsigned, little endian) // comment #0 string length (32 bits, unsigned, little endian) // comment #0 string // comment #1 string length (32 bits, unsigned, little endian) // comment #1 string ... SerializeToBuffer((uint32_t)aComments.Length(), aOutput); for (uint32_t i = 0; i < aComments.Length(); ++i) { SerializeToBuffer(aComments[i], aOutput); } } bool IsSampleRateSupported(TrackRate aSampleRate) { // According to www.opus-codec.org, creating an opus encoder requires the // sampling rate of source signal be one of 8000, 12000, 16000, 24000, or // 48000. If this constraint is not satisfied, we resample the input to 48kHz. AutoTArray supportedSamplingRates; supportedSamplingRates.AppendElements( kOpusSupportedInputSamplingRates, ArrayLength(kOpusSupportedInputSamplingRates)); return supportedSamplingRates.Contains(aSampleRate); } } // Anonymous namespace. OpusTrackEncoder::OpusTrackEncoder(TrackRate aTrackRate, MediaQueue& aEncodedDataQueue) : AudioTrackEncoder(aTrackRate, aEncodedDataQueue), mOutputSampleRate(IsSampleRateSupported(aTrackRate) ? aTrackRate : kOpusSamplingRate), mEncoder(nullptr), mLookahead(0), mLookaheadWritten(0), mResampler(nullptr), mNumOutputFrames(0) {} OpusTrackEncoder::~OpusTrackEncoder() { if (mEncoder) { opus_encoder_destroy(mEncoder); } if (mResampler) { speex_resampler_destroy(mResampler); mResampler = nullptr; } } nsresult OpusTrackEncoder::Init(int aChannels) { NS_ENSURE_TRUE((aChannels <= MAX_SUPPORTED_AUDIO_CHANNELS) && (aChannels > 0), NS_ERROR_FAILURE); // This version of encoder API only support 1 or 2 channels, // So set the mChannels less or equal 2 and // let InterleaveTrackData downmix pcm data. mChannels = aChannels > MAX_CHANNELS ? MAX_CHANNELS : aChannels; // Reject non-audio sample rates. 
NS_ENSURE_TRUE(mTrackRate >= 8000, NS_ERROR_INVALID_ARG); NS_ENSURE_TRUE(mTrackRate <= 192000, NS_ERROR_INVALID_ARG); if (NeedsResampler()) { int error; mResampler = speex_resampler_init(mChannels, mTrackRate, kOpusSamplingRate, SPEEX_RESAMPLER_QUALITY_DEFAULT, &error); if (error != RESAMPLER_ERR_SUCCESS) { return NS_ERROR_FAILURE; } } int error = 0; mEncoder = opus_encoder_create(mOutputSampleRate, mChannels, OPUS_APPLICATION_AUDIO, &error); if (error != OPUS_OK) { return NS_ERROR_FAILURE; } if (mAudioBitrate) { int bps = static_cast( std::min(mAudioBitrate, std::numeric_limits::max())); error = opus_encoder_ctl(mEncoder, OPUS_SET_BITRATE(bps)); if (error != OPUS_OK) { return NS_ERROR_FAILURE; } } // In the case of Opus we need to calculate the codec delay based on the // pre-skip. For more information see: // https://tools.ietf.org/html/rfc7845#section-4.2 error = opus_encoder_ctl(mEncoder, OPUS_GET_LOOKAHEAD(&mLookahead)); if (error != OPUS_OK) { mLookahead = 0; return NS_ERROR_FAILURE; } SetInitialized(); return NS_OK; } int OpusTrackEncoder::GetLookahead() const { return mLookahead * kOpusSamplingRate / mOutputSampleRate; } int OpusTrackEncoder::NumInputFramesPerPacket() const { return mTrackRate * kFrameDurationMs / 1000; } int OpusTrackEncoder::NumOutputFramesPerPacket() const { return mOutputSampleRate * kFrameDurationMs / 1000; } bool OpusTrackEncoder::NeedsResampler() const { // A resampler is needed when mTrackRate is not supported by the opus encoder. // This is equivalent to !IsSampleRateSupported(mTrackRate) but less cycles. return mTrackRate != mOutputSampleRate && mOutputSampleRate == kOpusSamplingRate; } already_AddRefed OpusTrackEncoder::GetMetadata() { AUTO_PROFILER_LABEL("OpusTrackEncoder::GetMetadata", OTHER); MOZ_ASSERT(mInitialized); if (!mInitialized) { return nullptr; } RefPtr meta = new OpusMetadata(); meta->mChannels = mChannels; meta->mSamplingFrequency = mTrackRate; // Ogg and Webm timestamps are always sampled at 48k for Opus. 
SerializeOpusIdHeader(mChannels, mLookahead * (kOpusSamplingRate / mOutputSampleRate), mTrackRate, &meta->mIdHeader); nsCString vendor; vendor.AppendASCII(opus_get_version_string()); nsTArray comments; comments.AppendElement( nsLiteralCString("ENCODER=Mozilla" MOZ_APP_UA_VERSION)); SerializeOpusCommentHeader(vendor, comments, &meta->mCommentHeader); return meta.forget(); } nsresult OpusTrackEncoder::Encode(AudioSegment* aSegment) { AUTO_PROFILER_LABEL("OpusTrackEncoder::Encode", OTHER); MOZ_ASSERT(aSegment); MOZ_ASSERT(mInitialized || mCanceled); if (mCanceled || IsEncodingComplete()) { return NS_ERROR_FAILURE; } if (!mInitialized) { // calculation below depends on the truth that mInitialized is true. return NS_ERROR_FAILURE; } int result = 0; // Loop until we run out of packets of input data while (result >= 0 && !IsEncodingComplete()) { // re-sampled frames left last time which didn't fit into an Opus packet // duration. const int framesLeft = mResampledLeftover.Length() / mChannels; MOZ_ASSERT(NumOutputFramesPerPacket() >= framesLeft); // Fetch input frames such that there will be n frames where (n + // framesLeft) >= NumOutputFramesPerPacket() after re-sampling. const int framesToFetch = NumInputFramesPerPacket() - (framesLeft * mTrackRate / kOpusSamplingRate) + (NeedsResampler() ? 1 : 0); if (!mEndOfStream && aSegment->GetDuration() < framesToFetch) { // Not enough raw data return NS_OK; } // Start encoding data. AutoTArray pcm; pcm.SetLength(NumOutputFramesPerPacket() * mChannels); int frameCopied = 0; for (AudioSegment::ChunkIterator iter(*aSegment); !iter.IsEnded() && frameCopied < framesToFetch; iter.Next()) { AudioChunk chunk = *iter; // Chunk to the required frame size. TrackTime frameToCopy = std::min(chunk.GetDuration(), static_cast(framesToFetch - frameCopied)); // Possible greatest value of framesToFetch = 3844: see // https://bugzilla.mozilla.org/show_bug.cgi?id=1349421#c8. frameToCopy // should not be able to exceed this value. 
MOZ_ASSERT(frameToCopy <= 3844, "frameToCopy exceeded expected range"); if (!chunk.IsNull()) { // Append the interleaved data to the end of pcm buffer. AudioTrackEncoder::InterleaveTrackData( chunk, frameToCopy, mChannels, pcm.Elements() + frameCopied * mChannels); } else { CheckedInt memsetLength = CheckedInt(frameToCopy) * mChannels * sizeof(AudioDataValue); if (!memsetLength.isValid()) { // This should never happen, but we use a defensive check because // we really don't want a bad memset MOZ_ASSERT_UNREACHABLE("memsetLength invalid!"); return NS_ERROR_FAILURE; } memset(pcm.Elements() + frameCopied * mChannels, 0, memsetLength.value()); } frameCopied += frameToCopy; } // Possible greatest value of framesToFetch = 3844: see // https://bugzilla.mozilla.org/show_bug.cgi?id=1349421#c8. frameCopied // should not be able to exceed this value. MOZ_ASSERT(frameCopied <= 3844, "frameCopied exceeded expected range"); int framesInPCM = frameCopied; if (mResampler) { AutoTArray resamplingDest; uint32_t inframes = frameCopied; uint32_t outframes = inframes * kOpusSamplingRate / mTrackRate + 1; // We want to consume all the input data, so we slightly oversize the // resampled data buffer so we can fit the output data in. We cannot // really predict the output frame count at each call. 
resamplingDest.SetLength(outframes * mChannels); #if MOZ_SAMPLE_TYPE_S16 short* in = reinterpret_cast(pcm.Elements()); short* out = reinterpret_cast(resamplingDest.Elements()); speex_resampler_process_interleaved_int(mResampler, in, &inframes, out, &outframes); #else float* in = reinterpret_cast(pcm.Elements()); float* out = reinterpret_cast(resamplingDest.Elements()); speex_resampler_process_interleaved_float(mResampler, in, &inframes, out, &outframes); #endif MOZ_ASSERT(pcm.Length() >= mResampledLeftover.Length()); PodCopy(pcm.Elements(), mResampledLeftover.Elements(), mResampledLeftover.Length()); uint32_t outframesToCopy = std::min( outframes, static_cast(NumOutputFramesPerPacket() - framesLeft)); MOZ_ASSERT(pcm.Length() - mResampledLeftover.Length() >= outframesToCopy * mChannels); PodCopy(pcm.Elements() + mResampledLeftover.Length(), resamplingDest.Elements(), outframesToCopy * mChannels); int frameLeftover = outframes - outframesToCopy; mResampledLeftover.SetLength(frameLeftover * mChannels); PodCopy(mResampledLeftover.Elements(), resamplingDest.Elements() + outframesToCopy * mChannels, mResampledLeftover.Length()); // This is always at 48000Hz. framesInPCM = framesLeft + outframesToCopy; } // Remove the raw data which has been pulled to pcm buffer. // The value of frameCopied should be equal to (or smaller than, if eos) // NumOutputFramesPerPacket(). aSegment->RemoveLeading(frameCopied); // Has reached the end of input stream and all queued data has pulled for // encoding. bool isFinalPacket = false; if (aSegment->GetDuration() == 0 && mEndOfStream && framesInPCM < NumOutputFramesPerPacket()) { // Pad |mLookahead| samples to the end of the track to prevent loss of // original data. 
const int toWrite = std::min(mLookahead - mLookaheadWritten, NumOutputFramesPerPacket() - framesInPCM); PodZero(pcm.Elements() + framesInPCM * mChannels, toWrite * mChannels); mLookaheadWritten += toWrite; framesInPCM += toWrite; if (mLookaheadWritten == mLookahead) { isFinalPacket = true; } } MOZ_ASSERT_IF(!isFinalPacket, framesInPCM == NumOutputFramesPerPacket()); // Append null data to pcm buffer if the leftover data is not enough for // opus encoder. if (framesInPCM < NumOutputFramesPerPacket() && isFinalPacket) { PodZero(pcm.Elements() + framesInPCM * mChannels, (NumOutputFramesPerPacket() - framesInPCM) * mChannels); } auto frameData = MakeRefPtr(); // Encode the data with Opus Encoder. frameData->SetLength(MAX_DATA_BYTES); // result is returned as opus error code if it is negative. result = 0; #ifdef MOZ_SAMPLE_TYPE_S16 const opus_int16* pcmBuf = static_cast(pcm.Elements()); result = opus_encode(mEncoder, pcmBuf, NumOutputFramesPerPacket(), frameData->Elements(), MAX_DATA_BYTES); #else const float* pcmBuf = static_cast(pcm.Elements()); result = opus_encode_float(mEncoder, pcmBuf, NumOutputFramesPerPacket(), frameData->Elements(), MAX_DATA_BYTES); #endif frameData->SetLength(result >= 0 ? result : 0); if (result < 0) { LOG("[Opus] Fail to encode data! 
Result: %s.", opus_strerror(result)); } if (isFinalPacket) { if (mResampler) { speex_resampler_destroy(mResampler); mResampler = nullptr; } mResampledLeftover.SetLength(0); } // timestamp should be the time of the first sample mEncodedDataQueue.Push(MakeAndAddRef( media::TimeUnit(mNumOutputFrames + mLookahead, mOutputSampleRate), static_cast(framesInPCM) * kOpusSamplingRate / mOutputSampleRate, kOpusSamplingRate, EncodedFrame::OPUS_AUDIO_FRAME, std::move(frameData))); mNumOutputFrames += NumOutputFramesPerPacket(); LOG("[Opus] mOutputTimeStamp %.3f.", media::TimeUnit(mNumOutputFrames, mOutputSampleRate).ToSeconds()); if (isFinalPacket) { LOG("[Opus] Done encoding."); mEncodedDataQueue.Finish(); } } return result >= 0 ? NS_OK : NS_ERROR_FAILURE; } } // namespace mozilla #undef LOG