firefox/dom/media/platforms/wmf/WMFAudioMFTManager.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "WMFAudioMFTManager.h"
#include "MediaInfo.h"
#include "TimeUnits.h"
#include "VideoUtils.h"
#include "WMFUtils.h"
#include "mozilla/AbstractThread.h"
#include "mozilla/Logging.h"
#include "nsTArray.h"
#include "BufferReader.h"
#include "mozilla/ScopeExit.h"

#define LOG(...) MOZ_LOG(sPDMLog, mozilla::LogLevel::Debug, (__VA_ARGS__))

namespace mozilla {

using media::TimeUnit;

// Builds a manager for the audio stream described by aConfig.
//
// The channel count, sample rate and stream type (derived from the MIME type)
// are taken from aConfig. For AAC streams the constructor additionally:
//   - locates the codec specific configuration bytes,
//   - records the encoder delay and total media frame count when the config
//     carries AacCodecSpecificData,
//   - decides whether the stream is ADTS (no configuration bytes) or raw AAC,
//   - fills mUserData through AACAudioSpecificConfigToUserData().
WMFAudioMFTManager::WMFAudioMFTManager(const AudioInfo& aConfig)
    : mAudioChannels(aConfig.mChannels),
      mChannelsMap(AudioConfig::ChannelLayout::UNKNOWN_MAP),
      mAudioRate(aConfig.mRate),
      mStreamType(GetStreamTypeFromMimeType(aConfig.mMimeType)) {
  MOZ_COUNT_CTOR(WMFAudioMFTManager);

  if (mStreamType != WMFStreamType::AAC) {
    // Only AAC needs the extra configuration handled below.
    return;
  }

  const uint8_t* audioSpecConfig = nullptr;
  uint32_t configLength = 0;
  // Owner of the fallback blob. It is declared at this scope on purpose:
  // audioSpecConfig may point into this buffer, and the pointer is consumed
  // by AACAudioSpecificConfigToUserData() below, so the buffer must stay alive
  // until then. (This reference may be the only one keeping it alive.)
  RefPtr<MediaByteBuffer> fallbackBlob;

  if (aConfig.mCodecSpecificConfig.is<AacCodecSpecificData>()) {
    const AacCodecSpecificData& aacCodecSpecificData =
        aConfig.mCodecSpecificConfig.as<AacCodecSpecificData>();
    audioSpecConfig =
        aacCodecSpecificData.mDecoderConfigDescriptorBinaryBlob->Elements();
    configLength =
        aacCodecSpecificData.mDecoderConfigDescriptorBinaryBlob->Length();

    mRemainingEncoderDelay = mEncoderDelay =
        aacCodecSpecificData.mEncoderDelayFrames;
    mTotalMediaFrames = aacCodecSpecificData.mMediaFrameCount;
    LOG("AudioMFT decoder: Found AAC decoder delay (%" PRIu32
        " frames) and total media frames (%" PRIu64 " frames)\n",
        mEncoderDelay, mTotalMediaFrames);
  } else {
    // Gracefully handle configurations that do not carry
    // AacCodecSpecificData. Once we're confident nothing falls through to
    // this path, it should be removed.
    fallbackBlob = GetAudioCodecSpecificBlob(aConfig.mCodecSpecificConfig);
    audioSpecConfig = fallbackBlob->Elements();
    configLength = fallbackBlob->Length();
  }

  // If no extradata has been provided, assume this is ADTS. Otherwise,
  // assume raw AAC packets.
  mIsADTS = !configLength;
  AACAudioSpecificConfigToUserData(aConfig.mExtendedProfile, audioSpecConfig,
                                   configLength, mUserData);
}

// Destructor. Its only explicit work is the MOZ_COUNT_DTOR bookkeeping that
// pairs with the MOZ_COUNT_CTOR call in the constructor.
WMFAudioMFTManager::~WMFAudioMFTManager() {
  MOZ_COUNT_DTOR(WMFAudioMFTManager);
}

// Maps mStreamType to the Media Foundation audio subtype GUID used for the
// decoder's input type. Returns GUID_NULL for any stream type other than AAC
// or MP3.
const GUID& WMFAudioMFTManager::GetMediaSubtypeGUID() {
  MOZ_ASSERT(StreamTypeIsAudio(mStreamType));
  if (mStreamType == WMFStreamType::AAC) {
    return MFAudioFormat_AAC;
  }
  if (mStreamType == WMFStreamType::MP3) {
    return MFAudioFormat_MP3;
  }
  return GUID_NULL;
}

bool WMFAudioMFTManager::Init() {
  NS_ENSURE_TRUE(StreamTypeIsAudio(mStreamType), false);

  RefPtr<MFTDecoder> decoder(new MFTDecoder());
  // Note: MP3 MFT isn't registered as supporting Float output, but it works.
  // Find PCM output MFTs as this is the common type.
  HRESULT hr = WMFDecoderModule::CreateMFTDecoder(mStreamType, decoder);
  NS_ENSURE_TRUE(SUCCEEDED(hr), false);

  // Setup input/output media types
  RefPtr<IMFMediaType> inputType;

  hr = wmf::MFCreateMediaType(getter_AddRefs(inputType));
  NS_ENSURE_TRUE(SUCCEEDED(hr), false);

  hr = inputType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Audio);
  NS_ENSURE_TRUE(SUCCEEDED(hr), false);

  hr = inputType->SetGUID(MF_MT_SUBTYPE, GetMediaSubtypeGUID());
  NS_ENSURE_TRUE(SUCCEEDED(hr), false);

  hr = inputType->SetUINT32(MF_MT_AUDIO_SAMPLES_PER_SECOND, mAudioRate);
  NS_ENSURE_TRUE(SUCCEEDED(hr), false);

  hr = inputType->SetUINT32(MF_MT_AUDIO_NUM_CHANNELS, mAudioChannels);
  NS_ENSURE_TRUE(SUCCEEDED(hr), false);

  if (mStreamType == WMFStreamType::AAC) {
    UINT32 payloadType = mIsADTS ? 1 : 0;
    hr = inputType->SetUINT32(MF_MT_AAC_PAYLOAD_TYPE, payloadType);
    NS_ENSURE_TRUE(SUCCEEDED(hr), false);

    hr = inputType->SetBlob(MF_MT_USER_DATA, mUserData.Elements(),
                            mUserData.Length());
    NS_ENSURE_TRUE(SUCCEEDED(hr), false);
  }

  RefPtr<IMFMediaType> outputType;
  hr = wmf::MFCreateMediaType(getter_AddRefs(outputType));
  NS_ENSURE_TRUE(SUCCEEDED(hr), false);

  hr = outputType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Audio);
  NS_ENSURE_TRUE(SUCCEEDED(hr), false);

  hr = outputType->SetGUID(MF_MT_SUBTYPE, MFAudioFormat_Float);
  NS_ENSURE_TRUE(SUCCEEDED(hr), false);

  hr = outputType->SetUINT32(MF_MT_AUDIO_BITS_PER_SAMPLE, 32);
  NS_ENSURE_TRUE(SUCCEEDED(hr), false);

  hr = decoder->SetMediaTypes(inputType, outputType);
  NS_ENSURE_TRUE(SUCCEEDED(hr), false);

  mDecoder = decoder;

  return true;
}

// Feeds one compressed sample to the decoder.
//
// Records the sample's timestamp in mLastInputTime, then forwards the sample
// bytes together with its timestamp and duration (both in microseconds) to
// mDecoder->Input() and returns that call's HRESULT.
HRESULT
WMFAudioMFTManager::Input(MediaRawData* aSample) {
  mLastInputTime = aSample->mTime;

  const auto* payload = aSample->Data();
  const auto payloadSize = static_cast<uint32_t>(aSample->Size());
  const auto timestampUs = aSample->mTime.ToMicroseconds();
  const auto durationUs = aSample->mDuration.ToMicroseconds();

  return mDecoder->Input(payload, payloadSize, timestampUs, durationUs);
}

// Returns a short codec name for mStreamType: "aac", "mp3", or "unknown" for
// anything else.
nsCString WMFAudioMFTManager::GetCodecName() const {
  switch (mStreamType) {
    case WMFStreamType::AAC:
      return "aac"_ns;
    case WMFStreamType::MP3:
      return "mp3"_ns;
    default:
      return "unknown"_ns;
  }
}

// Refreshes mAudioRate, mAudioChannels and mChannelsMap from the decoder's
// current output media type.
//
// Failing to obtain the output type, the sample rate or the channel count is
// an error and the failing HRESULT is returned. A missing channel mask is
// tolerated: the layout falls back to UNKNOWN_MAP and S_OK is returned.
HRESULT
WMFAudioMFTManager::UpdateOutputType() {
  RefPtr<IMFMediaType> outputType;
  HRESULT hr = mDecoder->GetOutputMediaType(outputType);
  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  hr = outputType->GetUINT32(MF_MT_AUDIO_SAMPLES_PER_SECOND, &mAudioRate);
  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  hr = outputType->GetUINT32(MF_MT_AUDIO_NUM_CHANNELS, &mAudioChannels);
  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  uint32_t channelMask;
  hr = outputType->GetUINT32(MF_MT_AUDIO_CHANNEL_MASK, &channelMask);
  if (FAILED(hr)) {
    // The mask is optional for our purposes; carry on without a layout.
    LOG("Unable to retrieve channel layout. Ignoring");
    mChannelsMap = AudioConfig::ChannelLayout::UNKNOWN_MAP;
    return S_OK;
  }

  mChannelsMap = channelMask;
  return S_OK;
}

// Pulls one decoded sample from the decoder and wraps it in an AudioData.
//
// aStreamOffset is passed through to the AudioData that is created.
// aOutput is reset to null on entry and only set when a sample is produced.
//
// Returns:
//   - S_OK with aOutput set when a sample was decoded.
//   - S_OK with aOutput null when the decoded buffer held no complete frame.
//   - MF_E_TRANSFORM_NEED_MORE_INPUT when the decoder asks for more input, or
//     when the output is judged to be a partial frame (see IsPartialOutput).
//   - MF_E_TRANSFORM_STREAM_CHANGE when the decoder keeps reporting stream
//     changes past the loop guard.
//   - E_OUTOFMEMORY when the sample buffer cannot be allocated.
//   - E_FAIL when the decoder reports success without a sample, the sample
//     time is unavailable, the timestamp or duration is invalid, or the
//     channel count is zero.
//   - Any other failure HRESULT reported by the decoder or buffer calls.
HRESULT
WMFAudioMFTManager::Output(int64_t aStreamOffset, RefPtr<MediaData>& aOutput) {
  aOutput = nullptr;
  RefPtr<IMFSample> sample;
  HRESULT hr;
  int typeChangeCount = 0;
  // Remembered so a rate change picked up below can be logged and taken into
  // account by the partial-output check.
  const auto oldAudioRate = mAudioRate;
  while (true) {
    hr = mDecoder->Output(&sample);
    if (hr == MF_E_TRANSFORM_NEED_MORE_INPUT) {
      return hr;
    }
    if (hr == MF_E_TRANSFORM_STREAM_CHANGE) {
      hr = mDecoder->FindDecoderOutputType();
      NS_ENSURE_TRUE(SUCCEEDED(hr), hr);
      hr = UpdateOutputType();
      NS_ENSURE_TRUE(SUCCEEDED(hr), hr);
      // Catch infinite loops, but some decoders perform at least 2 stream
      // changes on consecutive calls, so be permissive.
      // 100 is arbitrarily > 2.
      NS_ENSURE_TRUE(typeChangeCount < 100, MF_E_TRANSFORM_STREAM_CHANGE);
      ++typeChangeCount;
      continue;
    }
    break;
  }

  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  if (!sample) {
    LOG("Audio MFTDecoder returned success but null output.");
    return E_FAIL;
  }

  // The attribute may be absent; in that case the value stays false.
  UINT32 discontinuity = false;
  sample->GetUINT32(MFSampleExtension_Discontinuity, &discontinuity);
  if (mFirstFrame || discontinuity) {
    // Update the output type, in case this segment has a different
    // rate. This also triggers on the first sample, which can have a
    // different rate than is advertised in the container, and sometimes we
    // don't get a MF_E_TRANSFORM_STREAM_CHANGE when the rate changes.
    hr = UpdateOutputType();
    NS_ENSURE_TRUE(SUCCEEDED(hr), hr);
    mFirstFrame = false;
  }

  LONGLONG hns;
  hr = sample->GetSampleTime(&hns);
  if (FAILED(hr)) {
    return E_FAIL;
  }
  TimeUnit pts = TimeUnit::FromHns(hns, mAudioRate);
  NS_ENSURE_TRUE(pts.IsValid(), E_FAIL);

  RefPtr<IMFMediaBuffer> buffer;
  hr = sample->ConvertToContiguousBuffer(getter_AddRefs(buffer));
  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  BYTE* data = nullptr;  // Note: *data will be owned by the IMFMediaBuffer, we
                         // don't need to free it.
  DWORD maxLength = 0, currentLength = 0;
  hr = buffer->Lock(&data, &maxLength, &currentLength);
  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);
  // Arm the unlock only once the lock is known to be held, so that a failed
  // Lock() is never followed by an unmatched Unlock().
  ScopeExit exit([buffer] { buffer->Unlock(); });

  // The channel count comes from the stream configuration or from the
  // decoder's output type; refuse to divide by zero if it is bogus.
  NS_ENSURE_TRUE(mAudioChannels > 0, E_FAIL);

  // Output is made of floats.
  uint32_t numSamples = currentLength / sizeof(float);
  uint32_t numFrames = numSamples / mAudioChannels;
  if (numFrames == 0) {
    // Nothing usable in this chunk: report success with a null aOutput so the
    // next frame can be tried, if possible.
    return S_OK;
  }

  if (oldAudioRate != mAudioRate) {
    LOG("Audio rate changed from %" PRIu32 " to %" PRIu32, oldAudioRate,
        mAudioRate);
  }

  AlignedAudioBuffer audioData(numSamples);
  if (!audioData) {
    return E_OUTOFMEMORY;
  }

  float* floatData = reinterpret_cast<float*>(data);
  PodCopy(audioData.Data(), floatData, numSamples);

  TimeUnit duration(numFrames, mAudioRate);
  NS_ENSURE_TRUE(duration.IsValid(), E_FAIL);

  const bool isAudioRateChangedToHigher = oldAudioRate < mAudioRate;
  if (IsPartialOutput(duration, isAudioRateChangedToHigher)) {
    LOG("Encounter a partial frame?! duration shrinks from %s to %s",
        mLastOutputDuration.ToString().get(), duration.ToString().get());
    return MF_E_TRANSFORM_NEED_MORE_INPUT;
  }

  aOutput = new AudioData(aStreamOffset, pts, std::move(audioData),
                          mAudioChannels, mAudioRate, mChannelsMap);
  MOZ_DIAGNOSTIC_ASSERT(duration == aOutput->mDuration, "must be equal");
  mLastOutputDuration = aOutput->mDuration;

#ifdef LOG_SAMPLE_DECODE
  LOG("Decoded audio sample! timestamp=%lld duration=%lld currentLength=%u",
      pts.ToMicroseconds(), duration.ToMicroseconds(), currentLength);
#endif

  return S_OK;
}

// Heuristic for detecting a bogus partial output from the AAC decoder.
//
// On Windows 11 the AAC MFT decoder was found to hand back partial output
// samples even though Microsoft's documentation says that won't happen [1];
// bug 1731430 comment 26 has the details. A switch to a higher audio rate
// legitimately shortens the duration, so that case is excluded. Otherwise, an
// output shorter than the previous one is treated as a likely partial output.
//
// Always returns false for streams other than AAC.
//
// [1]
// https://docs.microsoft.com/en-us/windows/win32/medfound/mft-message-command-drain
bool WMFAudioMFTManager::IsPartialOutput(
    const media::TimeUnit& aNewOutputDuration,
    const bool aIsRateChangedToHigher) const {
  if (mStreamType != WMFStreamType::AAC) {
    return false;
  }
  const bool durationShrank = mLastOutputDuration > aNewOutputDuration;
  return durationShrank && !aIsRateChangedToHigher;
}

// Drops this manager's reference to the decoder.
void WMFAudioMFTManager::Shutdown() {
  mDecoder = nullptr;
}

}  // namespace mozilla

#undef LOG