From fbaf0bb26397aa498eb9156f06d5a6fe34dd7dd8 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 03:14:29 +0200 Subject: Merging upstream version 125.0.1. Signed-off-by: Daniel Baumann --- dom/media/platforms/PDMFactory.cpp | 41 +--- dom/media/platforms/PEMFactory.cpp | 10 +- dom/media/platforms/agnostic/bytestreams/Adts.cpp | 234 +++++++++++++++++-- dom/media/platforms/agnostic/bytestreams/Adts.h | 117 +++++++++- .../platforms/agnostic/bytestreams/AnnexB.cpp | 12 +- .../agnostic/eme/ChromiumCDMVideoDecoder.cpp | 39 +++- .../platforms/agnostic/eme/EMEDecoderModule.cpp | 18 +- .../platforms/agnostic/gmp/GMPDecoderModule.cpp | 10 + dom/media/platforms/apple/AppleATDecoder.cpp | 90 ++++++-- dom/media/platforms/apple/AppleATDecoder.h | 5 +- dom/media/platforms/apple/AppleDecoderModule.cpp | 7 +- dom/media/platforms/apple/AppleVTDecoder.cpp | 10 +- dom/media/platforms/ffmpeg/FFmpegAudioDecoder.cpp | 12 +- dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp | 7 +- dom/media/platforms/ffmpeg/FFmpegLibWrapper.h | 11 +- dom/media/platforms/ffmpeg/FFmpegRDFTTypes.h | 34 --- dom/media/platforms/ffmpeg/FFmpegVideoEncoder.cpp | 255 ++++++++++++--------- dom/media/platforms/ffmpeg/FFmpegVideoEncoder.h | 13 ++ .../platforms/ffmpeg/ffvpx/FFVPXRuntimeLinker.cpp | 14 +- .../platforms/ffmpeg/ffvpx/FFVPXRuntimeLinker.h | 8 +- dom/media/platforms/moz.build | 18 +- dom/media/platforms/wmf/MFCDMSession.cpp | 3 +- .../platforms/wmf/MFMediaEngineAudioStream.cpp | 2 +- .../platforms/wmf/MFMediaEngineDecoderModule.cpp | 16 +- .../platforms/wmf/MFMediaEngineDecoderModule.h | 2 + dom/media/platforms/wmf/MFMediaEngineStream.cpp | 33 +-- dom/media/platforms/wmf/MFMediaEngineStream.h | 8 +- .../platforms/wmf/MFMediaEngineVideoStream.cpp | 108 +++++++-- dom/media/platforms/wmf/MFMediaEngineVideoStream.h | 27 ++- dom/media/platforms/wmf/MFMediaSource.h | 2 - dom/media/platforms/wmf/WMFAudioMFTManager.cpp | 25 +- dom/media/platforms/wmf/WMFAudioMFTManager.h | 1 + dom/media/platforms/wmf/WMFMediaDataEncoder.h | 5 +- dom/media/platforms/wmf/WMFUtils.cpp | 34 +-- 34 files changed, 865 insertions(+), 366 deletions(-) delete mode 100644 dom/media/platforms/ffmpeg/FFmpegRDFTTypes.h (limited to 'dom/media/platforms') diff --git a/dom/media/platforms/PDMFactory.cpp b/dom/media/platforms/PDMFactory.cpp index 2964527e07..00f46385e2 100644 --- a/dom/media/platforms/PDMFactory.cpp +++ b/dom/media/platforms/PDMFactory.cpp @@ -45,9 +45,6 @@ # include "mozilla/CDMProxy.h" # endif #endif -#ifdef MOZ_FFVPX -# include "FFVPXRuntimeLinker.h" -#endif #ifdef MOZ_FFMPEG # include "FFmpegRuntimeLinker.h" #endif @@ -60,6 +57,7 @@ #ifdef MOZ_OMX # include "OmxDecoderModule.h" #endif +#include "FFVPXRuntimeLinker.h" #include @@ -99,14 +97,12 @@ class PDMInitializer final { #ifdef MOZ_APPLEMEDIA AppleDecoderModule::Init(); #endif -#ifdef MOZ_FFVPX - FFVPXRuntimeLinker::Init(); -#endif #ifdef MOZ_FFMPEG if (StaticPrefs::media_rdd_ffmpeg_enabled()) { FFmpegRuntimeLinker::Init(); } #endif + FFVPXRuntimeLinker::Init(); } static void InitUtilityPDMs() { @@ -127,11 +123,9 @@ class PDMInitializer final { AppleDecoderModule::Init(); } #endif -#ifdef MOZ_FFVPX if (kind == ipc::SandboxingKind::GENERIC_UTILITY) { FFVPXRuntimeLinker::Init(); } -#endif #ifdef MOZ_FFMPEG if (StaticPrefs::media_utility_ffmpeg_enabled() && kind == ipc::SandboxingKind::GENERIC_UTILITY) { @@ -160,9 +154,7 @@ class PDMInitializer final { #ifdef MOZ_OMX OmxDecoderModule::Init(); #endif -#ifdef MOZ_FFVPX FFVPXRuntimeLinker::Init(); -#endif #ifdef MOZ_FFMPEG 
FFmpegRuntimeLinker::Init(); #endif @@ -183,9 +175,7 @@ class PDMInitializer final { #ifdef MOZ_OMX OmxDecoderModule::Init(); #endif -#ifdef MOZ_FFVPX FFVPXRuntimeLinker::Init(); -#endif #ifdef MOZ_FFMPEG FFmpegRuntimeLinker::Init(); #endif @@ -547,12 +537,7 @@ void PDMFactory::CreateRddPDMs() { CreateAndStartupPDM(); } #endif -#ifdef MOZ_FFVPX - if (StaticPrefs::media_ffvpx_enabled() && - StaticPrefs::media_rdd_ffvpx_enabled()) { - StartupPDM(FFVPXRuntimeLinker::CreateDecoder()); - } -#endif + StartupPDM(FFVPXRuntimeLinker::CreateDecoder()); #ifdef MOZ_FFMPEG if (StaticPrefs::media_ffmpeg_enabled() && StaticPrefs::media_rdd_ffmpeg_enabled() && @@ -580,12 +565,9 @@ void PDMFactory::CreateUtilityPDMs() { } #endif if (aKind == ipc::SandboxingKind::GENERIC_UTILITY) { -#ifdef MOZ_FFVPX - if (StaticPrefs::media_ffvpx_enabled() && - StaticPrefs::media_utility_ffvpx_enabled()) { + if (StaticPrefs::media_utility_ffvpx_enabled()) { StartupPDM(FFVPXRuntimeLinker::CreateDecoder()); } -#endif #ifdef MOZ_FFMPEG if (StaticPrefs::media_ffmpeg_enabled() && StaticPrefs::media_utility_ffmpeg_enabled() && @@ -667,11 +649,7 @@ void PDMFactory::CreateContentPDMs() { CreateAndStartupPDM(); } #endif -#ifdef MOZ_FFVPX - if (StaticPrefs::media_ffvpx_enabled()) { - StartupPDM(FFVPXRuntimeLinker::CreateDecoder()); - } -#endif + StartupPDM(FFVPXRuntimeLinker::CreateDecoder()); #ifdef MOZ_FFMPEG if (StaticPrefs::media_ffmpeg_enabled() && !StartupPDM(FFmpegRuntimeLinker::CreateDecoder())) { @@ -719,11 +697,7 @@ void PDMFactory::CreateDefaultPDMs() { CreateAndStartupPDM(); } #endif -#ifdef MOZ_FFVPX - if (StaticPrefs::media_ffvpx_enabled()) { - StartupPDM(FFVPXRuntimeLinker::CreateDecoder()); - } -#endif + StartupPDM(FFVPXRuntimeLinker::CreateDecoder()); #ifdef MOZ_FFMPEG if (StaticPrefs::media_ffmpeg_enabled() && !StartupPDM(FFmpegRuntimeLinker::CreateDecoder())) { @@ -898,9 +872,6 @@ DecodeSupportSet PDMFactory::SupportsMimeType( /* static */ bool PDMFactory::AllDecodersAreRemote() { return StaticPrefs::media_rdd_process_enabled() && -#if defined(MOZ_FFVPX) - StaticPrefs::media_rdd_ffvpx_enabled() && -#endif StaticPrefs::media_rdd_opus_enabled() && StaticPrefs::media_rdd_theora_enabled() && StaticPrefs::media_rdd_vorbis_enabled() && diff --git a/dom/media/platforms/PEMFactory.cpp b/dom/media/platforms/PEMFactory.cpp index 9647c5b079..a5b42914eb 100644 --- a/dom/media/platforms/PEMFactory.cpp +++ b/dom/media/platforms/PEMFactory.cpp @@ -20,13 +20,12 @@ # include "WMFEncoderModule.h" #endif -#ifdef MOZ_FFVPX -# include "FFVPXRuntimeLinker.h" -#endif #ifdef MOZ_FFMPEG # include "FFmpegRuntimeLinker.h" #endif +#include "FFVPXRuntimeLinker.h" + #include "mozilla/StaticPrefs_media.h" #include "mozilla/gfx/gfxVars.h" @@ -56,15 +55,12 @@ PEMFactory::PEMFactory() { mCurrentPEMs.AppendElement(new WMFEncoderModule()); #endif -#ifdef MOZ_FFVPX - if (StaticPrefs::media_ffvpx_enabled() && - StaticPrefs::media_ffmpeg_encoder_enabled()) { + if (StaticPrefs::media_ffmpeg_encoder_enabled()) { if (RefPtr pem = FFVPXRuntimeLinker::CreateEncoder()) { mCurrentPEMs.AppendElement(pem); } } -#endif #ifdef MOZ_FFMPEG if (StaticPrefs::media_ffmpeg_enabled() && diff --git a/dom/media/platforms/agnostic/bytestreams/Adts.cpp b/dom/media/platforms/agnostic/bytestreams/Adts.cpp index 5f31904d9c..71c9f15308 100644 --- a/dom/media/platforms/agnostic/bytestreams/Adts.cpp +++ b/dom/media/platforms/agnostic/bytestreams/Adts.cpp @@ -4,37 +4,56 @@ #include "Adts.h" #include "MediaData.h" +#include "PlatformDecoderModule.h" #include "mozilla/Array.h" 
#include "mozilla/ArrayUtils.h" +#include "mozilla/Logging.h" +#include "ADTSDemuxer.h" + +extern mozilla::LazyLogModule gMediaDemuxerLog; +#define LOG(msg, ...) \ + MOZ_LOG(gMediaDemuxerLog, LogLevel::Debug, msg, ##__VA_ARGS__) +#define ADTSLOG(msg, ...) \ + DDMOZ_LOG(gMediaDemuxerLog, LogLevel::Debug, msg, ##__VA_ARGS__) +#define ADTSLOGV(msg, ...) \ + DDMOZ_LOG(gMediaDemuxerLog, LogLevel::Verbose, msg, ##__VA_ARGS__) namespace mozilla { +namespace ADTS { static const int kADTSHeaderSize = 7; -int8_t Adts::GetFrequencyIndex(uint32_t aSamplesPerSecond) { - static const uint32_t freq_lookup[] = {96000, 88200, 64000, 48000, 44100, - 32000, 24000, 22050, 16000, 12000, - 11025, 8000, 7350, 0}; +constexpr std::array FREQ_LOOKUP{96000, 88200, 64000, 48000, 44100, + 32000, 24000, 22050, 16000, 12000, + 11025, 8000, 7350, 0}; - int8_t i = 0; - while (freq_lookup[i] && aSamplesPerSecond < freq_lookup[i]) { - i++; - } +Result GetFrequencyIndex(uint32_t aSamplesPerSecond) { + auto found = + std::find(FREQ_LOOKUP.begin(), FREQ_LOOKUP.end(), aSamplesPerSecond); - if (!freq_lookup[i]) { - return -1; + if (found == FREQ_LOOKUP.end()) { + return Err(false); } - return i; + return std::distance(FREQ_LOOKUP.begin(), found); } -bool Adts::ConvertSample(uint16_t aChannelCount, int8_t aFrequencyIndex, - int8_t aProfile, MediaRawData* aSample) { +bool ConvertSample(uint16_t aChannelCount, uint8_t aFrequencyIndex, + uint8_t aProfile, MediaRawData* aSample) { size_t newSize = aSample->Size() + kADTSHeaderSize; + MOZ_LOG(sPDMLog, LogLevel::Debug, + ("Converting sample to ADTS format: newSize: %zu, ch: %u, " + "profile: %u, freq index: %d", + newSize, aChannelCount, aProfile, aFrequencyIndex)); + // ADTS header uses 13 bits for packet size. - if (newSize >= (1 << 13) || aChannelCount > 15 || aFrequencyIndex < 0 || - aProfile < 1 || aProfile > 4) { + if (newSize >= (1 << 13) || aChannelCount > 15 || aProfile < 1 || + aProfile > 4 || aFrequencyIndex >= FREQ_LOOKUP.size()) { + MOZ_LOG(sPDMLog, LogLevel::Debug, + ("Couldn't convert sample to ADTS format: newSize: %zu, ch: %u, " + "profile: %u, freq index: %d", + newSize, aChannelCount, aProfile, aFrequencyIndex)); return false; } @@ -66,7 +85,36 @@ bool Adts::ConvertSample(uint16_t aChannelCount, int8_t aFrequencyIndex, return true; } -bool Adts::RevertSample(MediaRawData* aSample) { +bool StripHeader(MediaRawData* aSample) { + if (aSample->Size() < kADTSHeaderSize) { + return false; + } + + FrameHeader header; + auto data = Span{aSample->Data(), aSample->Size()}; + MOZ_ASSERT(FrameHeader::MatchesSync(data), + "Don't attempt to strip the ADTS header of a raw AAC packet."); + + bool crcPresent = header.mHaveCrc; + + LOG(("Stripping ADTS, crc %spresent", crcPresent ? "" : "not ")); + + size_t toStrip = crcPresent ? 
kADTSHeaderSize + 2 : kADTSHeaderSize; + + UniquePtr writer(aSample->CreateWriter()); + writer->PopFront(toStrip); + + if (aSample->mCrypto.IsEncrypted()) { + if (aSample->mCrypto.mPlainSizes.Length() > 0 && + writer->mCrypto.mPlainSizes[0] >= kADTSHeaderSize) { + writer->mCrypto.mPlainSizes[0] -= kADTSHeaderSize; + } + } + + return true; +} + +bool RevertSample(MediaRawData* aSample) { if (aSample->Size() < kADTSHeaderSize) { return false; } @@ -91,4 +139,156 @@ bool Adts::RevertSample(MediaRawData* aSample) { return true; } -} // namespace mozilla + +bool FrameHeader::MatchesSync(const Span& aData) { + return aData.Length() >= 2 && aData[0] == 0xFF && (aData[1] & 0xF6) == 0xF0; +} + +FrameHeader::FrameHeader() { Reset(); } + +// Header size +uint64_t FrameHeader::HeaderSize() const { return (mHaveCrc) ? 9 : 7; } + +bool FrameHeader::IsValid() const { return mFrameLength > 0; } + +// Resets the state to allow for a new parsing session. +void FrameHeader::Reset() { PodZero(this); } + +// Returns whether the byte creates a valid sequence up to this point. +bool FrameHeader::Parse(const Span& aData) { + if (!MatchesSync(aData)) { + return false; + } + + // AAC has 1024 samples per frame per channel. + mSamples = 1024; + + mHaveCrc = !(aData[1] & 0x01); + mObjectType = ((aData[2] & 0xC0) >> 6) + 1; + mSamplingIndex = (aData[2] & 0x3C) >> 2; + mChannelConfig = (aData[2] & 0x01) << 2 | (aData[3] & 0xC0) >> 6; + mFrameLength = + static_cast((aData[3] & 0x03) << 11 | (aData[4] & 0xFF) << 3 | + (aData[5] & 0xE0) >> 5); + mNumAACFrames = (aData[6] & 0x03) + 1; + + static const uint32_t SAMPLE_RATES[] = {96000, 88200, 64000, 48000, 44100, + 32000, 24000, 22050, 16000, 12000, + 11025, 8000, 7350}; + if (mSamplingIndex >= ArrayLength(SAMPLE_RATES)) { + LOG(("ADTS: Init() failure: invalid sample-rate index value: %" PRIu32 ".", + mSamplingIndex)); + // This marks the header as invalid. + mFrameLength = 0; + return false; + } + mSampleRate = SAMPLE_RATES[mSamplingIndex]; + + MOZ_ASSERT(mChannelConfig < 8); + mChannels = (mChannelConfig == 7) ? 8 : mChannelConfig; + + return true; +} + +Frame::Frame() : mOffset(0), mHeader() {} +uint64_t Frame::Offset() const { return mOffset; } +size_t Frame::Length() const { + // TODO: If fields are zero'd when invalid, this check wouldn't be + // necessary. + if (!mHeader.IsValid()) { + return 0; + } + + return mHeader.mFrameLength; +} + +// Returns the offset to the start of frame's raw data. +uint64_t Frame::PayloadOffset() const { return mOffset + mHeader.HeaderSize(); } + +// Returns the length of the frame's raw data (excluding the header) in bytes. +size_t Frame::PayloadLength() const { + // TODO: If fields are zero'd when invalid, this check wouldn't be + // necessary. + if (!mHeader.IsValid()) { + return 0; + } + + return mHeader.mFrameLength - mHeader.HeaderSize(); +} + +// Returns the parsed frame header. +const FrameHeader& Frame::Header() const { return mHeader; } + +bool Frame::IsValid() const { return mHeader.IsValid(); } + +// Resets the frame header and data. +void Frame::Reset() { + mHeader.Reset(); + mOffset = 0; +} + +// Returns whether the valid +bool Frame::Parse(uint64_t aOffset, const uint8_t* aStart, + const uint8_t* aEnd) { + MOZ_ASSERT(aStart && aEnd && aStart <= aEnd); + + bool found = false; + const uint8_t* ptr = aStart; + // Require at least 7 bytes of data at the end of the buffer for the minimum + // ADTS frame header. 
+ while (ptr < aEnd - 7 && !found) { + found = mHeader.Parse(Span(ptr, aEnd)); + ptr++; + } + + mOffset = aOffset + (static_cast(ptr - aStart)) - 1u; + + return found; +} + +const Frame& FrameParser::CurrentFrame() { return mFrame; } + +const Frame& FrameParser::FirstFrame() const { return mFirstFrame; } + +void FrameParser::Reset() { + EndFrameSession(); + mFirstFrame.Reset(); +} + +void FrameParser::EndFrameSession() { mFrame.Reset(); } + +bool FrameParser::Parse(uint64_t aOffset, const uint8_t* aStart, + const uint8_t* aEnd) { + const bool found = mFrame.Parse(aOffset, aStart, aEnd); + + if (mFrame.Length() && !mFirstFrame.Length()) { + mFirstFrame = mFrame; + } + + return found; +} + +// Initialize the AAC AudioSpecificConfig. +// Only handles two-byte version for AAC-LC. +void InitAudioSpecificConfig(const ADTS::Frame& frame, + MediaByteBuffer* aBuffer) { + const ADTS::FrameHeader& header = frame.Header(); + MOZ_ASSERT(header.IsValid()); + + int audioObjectType = header.mObjectType; + int samplingFrequencyIndex = header.mSamplingIndex; + int channelConfig = header.mChannelConfig; + + uint8_t asc[2]; + asc[0] = (audioObjectType & 0x1F) << 3 | (samplingFrequencyIndex & 0x0E) >> 1; + asc[1] = (samplingFrequencyIndex & 0x01) << 7 | (channelConfig & 0x0F) << 3; + + aBuffer->AppendElements(asc, 2); +} + +}; // namespace ADTS +}; // namespace mozilla + +#undef LOG +#undef ADTSLOG +#undef ADTSLOGV diff --git a/dom/media/platforms/agnostic/bytestreams/Adts.h b/dom/media/platforms/agnostic/bytestreams/Adts.h index c2b6b558b6..e6d20806ab 100644 --- a/dom/media/platforms/agnostic/bytestreams/Adts.h +++ b/dom/media/platforms/agnostic/bytestreams/Adts.h @@ -6,17 +6,124 @@ #define ADTS_H_ #include +#include "MediaData.h" +#include "mozilla/Result.h" namespace mozilla { class MediaRawData; -class Adts { +namespace ADTS { + +// adts::FrameHeader - Holds the ADTS frame header and its parsing +// state. +// +// ADTS Frame Structure +// +// 11111111 1111BCCD EEFFFFGH HHIJKLMM MMMMMMMM MMMOOOOO OOOOOOPP(QQQQQQQQ +// QQQQQQQQ) +// +// Header consists of 7 or 9 bytes(without or with CRC). +// Letter Length(bits) Description +// { sync } 12 syncword 0xFFF, all bits must be 1 +// B 1 MPEG Version: 0 for MPEG-4, 1 for MPEG-2 +// C 2 Layer: always 0 +// D 1 protection absent, Warning, set to 1 if there is no +// CRC and 0 if there is CRC +// E 2 profile, the MPEG-4 Audio Object Type minus 1 +// F 4 MPEG-4 Sampling Frequency Index (15 is forbidden) +// H 3 MPEG-4 Channel Configuration (in the case of 0, the +// channel configuration is sent via an in-band PCE) +// M 13 frame length, this value must include 7 or 9 bytes of +// header length: FrameLength = +// (ProtectionAbsent == 1 ? 
7 : 9) + size(AACFrame) +// O 11 Buffer fullness +// P 2 Number of AAC frames(RDBs) in ADTS frame minus 1, for +// maximum compatibility always use 1 AAC frame per ADTS +// frame +// Q 16 CRC if protection absent is 0 +class FrameHeader { public: - static int8_t GetFrequencyIndex(uint32_t aSamplesPerSecond); - static bool ConvertSample(uint16_t aChannelCount, int8_t aFrequencyIndex, - int8_t aProfile, mozilla::MediaRawData* aSample); - static bool RevertSample(MediaRawData* aSample); + uint32_t mFrameLength{}; + uint32_t mSampleRate{}; + uint32_t mSamples{}; + uint32_t mChannels{}; + uint8_t mObjectType{}; + uint8_t mSamplingIndex{}; + uint8_t mChannelConfig{}; + uint8_t mNumAACFrames{}; + bool mHaveCrc{}; + + // Returns whether aPtr matches a valid ADTS header sync marker + static bool MatchesSync(const Span& aData); + FrameHeader(); + // Header size + uint64_t HeaderSize() const; + bool IsValid() const; + // Resets the state to allow for a new parsing session. + void Reset(); + + // Returns whether the byte creates a valid sequence up to this point. + bool Parse(const Span& aData); }; +class Frame { + public: + Frame(); + + uint64_t Offset() const; + size_t Length() const; + // Returns the offset to the start of frame's raw data. + uint64_t PayloadOffset() const; + + size_t PayloadLength() const; + // Returns the parsed frame header. + const FrameHeader& Header() const; + bool IsValid() const; + // Resets the frame header and data. + void Reset(); + // Returns whether the valid + bool Parse(uint64_t aOffset, const uint8_t* aStart, const uint8_t* aEnd); + + private: + // The offset to the start of the header. + uint64_t mOffset; + // The currently parsed frame header. + FrameHeader mHeader; +}; + +class FrameParser { + public: + // Returns the currently parsed frame. Reset via Reset or EndFrameSession. + const Frame& CurrentFrame(); + // Returns the first parsed frame. Reset via Reset. + const Frame& FirstFrame() const; + // Resets the parser. Don't use between frames as first frame data is reset. + void Reset(); + // Clear the last parsed frame to allow for next frame parsing, i.e.: + // - sets PrevFrame to CurrentFrame + // - resets the CurrentFrame + // - resets ID3Header if no valid header was parsed yet + void EndFrameSession(); + // Parses contents of given ByteReader for a valid frame header and returns + // true if one was found. After returning, the variable passed to + // 'aBytesToSkip' holds the amount of bytes to be skipped (if any) in order to + // jump across a large ID3v2 tag spanning multiple buffers. + bool Parse(uint64_t aOffset, const uint8_t* aStart, const uint8_t* aEnd); + + private: + // We keep the first parsed frame around for static info access, the + // previously parsed frame for debugging and the currently parsed frame. 
+ Frame mFirstFrame; + Frame mFrame; +}; + +// Extract the audiospecificconfig from an ADTS header +void InitAudioSpecificConfig(const Frame& aFrame, MediaByteBuffer* aBuffer); +bool StripHeader(MediaRawData* aSample); +Result GetFrequencyIndex(uint32_t aSamplesPerSecond); +bool ConvertSample(uint16_t aChannelCount, uint8_t aFrequencyIndex, + uint8_t aProfile, mozilla::MediaRawData* aSample); +bool RevertSample(MediaRawData* aSample); +} // namespace ADTS } // namespace mozilla #endif diff --git a/dom/media/platforms/agnostic/bytestreams/AnnexB.cpp b/dom/media/platforms/agnostic/bytestreams/AnnexB.cpp index 086936dcc6..4721ddefc3 100644 --- a/dom/media/platforms/agnostic/bytestreams/AnnexB.cpp +++ b/dom/media/platforms/agnostic/bytestreams/AnnexB.cpp @@ -256,21 +256,21 @@ static Result FindStartCodeInternal(BufferReader& aBr) { while (aBr.Remaining() >= 6) { uint32_t x32; MOZ_TRY_VAR(x32, aBr.PeekU32()); - if ((x32 - 0x01010101) & (~x32) & 0x80808080) { - if ((x32 >> 8) == 0x000001) { + if ((x32 - 0x01010101) & (~x32) & 0x80808080) { // Has 0x00 byte(s). + if ((x32 >> 8) == 0x000001) { // 0x000001?? return Ok(); } - if (x32 == 0x000001) { + if ((x32 & 0xffffff) == 0x000001) { // 0x??000001 mozilla::Unused << aBr.Read(1); return Ok(); } - if ((x32 & 0xff) == 0) { + if ((x32 & 0xff) == 0) { // 0x??????00 const uint8_t* p = aBr.Peek(1); - if ((x32 & 0xff00) == 0 && p[4] == 1) { + if ((x32 & 0xff00) == 0 && p[4] == 1) { // 0x????0000,01 mozilla::Unused << aBr.Read(2); return Ok(); } - if (p[4] == 0 && p[5] == 1) { + if (p[4] == 0 && p[5] == 1) { // 0x??????00,00,01 mozilla::Unused << aBr.Read(3); return Ok(); } diff --git a/dom/media/platforms/agnostic/eme/ChromiumCDMVideoDecoder.cpp b/dom/media/platforms/agnostic/eme/ChromiumCDMVideoDecoder.cpp index e71632e6d3..4c74fa8723 100644 --- a/dom/media/platforms/agnostic/eme/ChromiumCDMVideoDecoder.cpp +++ b/dom/media/platforms/agnostic/eme/ChromiumCDMVideoDecoder.cpp @@ -11,6 +11,9 @@ #include "GMPVideoDecoder.h" #include "MP4Decoder.h" #include "VPXDecoder.h" +#ifdef MOZ_AV1 +# include "AOMDecoder.h" +#endif namespace mozilla { @@ -45,6 +48,21 @@ static uint32_t ToCDMH264Profile(uint8_t aProfile) { return cdm::VideoCodecProfile::kUnknownVideoCodecProfile; } +#ifdef MOZ_AV1 +static uint32_t ToCDMAV1Profile(uint8_t aProfile) { + switch (aProfile) { + case 0: + return cdm::VideoCodecProfile::kAv1ProfileMain; + case 1: + return cdm::VideoCodecProfile::kAv1ProfileHigh; + case 2: + return cdm::VideoCodecProfile::kAv1ProfilePro; + default: + return cdm::VideoCodecProfile::kUnknownVideoCodecProfile; + } +} +#endif + RefPtr ChromiumCDMVideoDecoder::Init() { if (!mCDMParent) { // Must have failed to get the CDMParent from the ChromiumCDMProxy @@ -60,6 +78,16 @@ RefPtr ChromiumCDMVideoDecoder::Init() { ToCDMH264Profile(mConfig.mExtraData->SafeElementAt(1, 0)); config.mExtraData() = mConfig.mExtraData->Clone(); mConvertToAnnexB = true; +#ifdef MOZ_AV1 + } else if (AOMDecoder::IsAV1(mConfig.mMimeType)) { + AOMDecoder::AV1SequenceInfo seqInfo; + MediaResult seqHdrResult; + AOMDecoder::TryReadAV1CBox(mConfig.mExtraData, seqInfo, seqHdrResult); + config.mCodec() = cdm::VideoCodec::kCodecAv1; + config.mProfile() = NS_SUCCEEDED(seqHdrResult.Code()) + ? 
ToCDMAV1Profile(seqInfo.mProfile) + : cdm::VideoCodecProfile::kUnknownVideoCodecProfile; +#endif } else if (VPXDecoder::IsVP8(mConfig.mMimeType)) { config.mCodec() = cdm::VideoCodec::kCodecVp8; config.mProfile() = cdm::VideoCodecProfile::kProfileNotNeeded; @@ -105,9 +133,16 @@ nsCString ChromiumCDMVideoDecoder::GetDescriptionName() const { nsCString ChromiumCDMVideoDecoder::GetCodecName() const { if (MP4Decoder::IsH264(mConfig.mMimeType)) { return "h264"_ns; - } else if (VPXDecoder::IsVP8(mConfig.mMimeType)) { + } +#ifdef MOZ_AV1 + if (AOMDecoder::IsAV1(mConfig.mMimeType)) { + return "av1"_ns; + } +#endif + if (VPXDecoder::IsVP8(mConfig.mMimeType)) { return "vp8"_ns; - } else if (VPXDecoder::IsVP9(mConfig.mMimeType)) { + } + if (VPXDecoder::IsVP9(mConfig.mMimeType)) { return "vp9"_ns; } return "unknown"_ns; diff --git a/dom/media/platforms/agnostic/eme/EMEDecoderModule.cpp b/dom/media/platforms/agnostic/eme/EMEDecoderModule.cpp index c143172073..a06dd30f89 100644 --- a/dom/media/platforms/agnostic/eme/EMEDecoderModule.cpp +++ b/dom/media/platforms/agnostic/eme/EMEDecoderModule.cpp @@ -28,7 +28,7 @@ namespace mozilla { -typedef MozPromiseRequestHolder DecryptPromiseRequestHolder; +using DecryptPromiseRequestHolder = MozPromiseRequestHolder; DDLoggedTypeDeclNameAndBase(EMEDecryptor, MediaDataDecoder); @@ -45,7 +45,7 @@ class ADTSSampleConverter { // doesn't care what is set. , mProfile(aInfo.mProfile < 1 || aInfo.mProfile > 4 ? 2 : aInfo.mProfile), - mFrequencyIndex(Adts::GetFrequencyIndex(aInfo.mRate)) { + mFrequencyIndex(ADTS::GetFrequencyIndex(aInfo.mRate).unwrapOr(255)) { EME_LOG("ADTSSampleConvertor(): aInfo.mProfile=%" PRIi8 " aInfo.mExtendedProfile=%" PRIi8, aInfo.mProfile, aInfo.mExtendedProfile); @@ -56,17 +56,17 @@ class ADTSSampleConverter { } } bool Convert(MediaRawData* aSample) const { - return Adts::ConvertSample(mNumChannels, mFrequencyIndex, mProfile, + return ADTS::ConvertSample(mNumChannels, mFrequencyIndex, mProfile, aSample); } bool Revert(MediaRawData* aSample) const { - return Adts::RevertSample(aSample); + return ADTS::RevertSample(aSample); } private: const uint32_t mNumChannels; const uint8_t mProfile; - const uint8_t mFrequencyIndex; + const uint8_t mFrequencyIndex{}; }; class EMEDecryptor final : public MediaDataDecoder, @@ -124,7 +124,7 @@ class EMEDecryptor final : public MediaDataDecoder, mThroughputLimiter->Throttle(aSample) ->Then( mThread, __func__, - [self](RefPtr aSample) { + [self](const RefPtr& aSample) { self->mThrottleRequest.Complete(); self->AttemptDecode(aSample); }, @@ -223,7 +223,7 @@ class EMEDecryptor final : public MediaDataDecoder, mDecodePromise.RejectIfExists(NS_ERROR_DOM_MEDIA_CANCELED, __func__); mThroughputLimiter->Flush(); for (auto iter = mDecrypts.Iter(); !iter.Done(); iter.Next()) { - auto holder = iter.UserData(); + auto* holder = iter.UserData(); holder->DisconnectIfExists(); iter.Remove(); } @@ -240,7 +240,7 @@ class EMEDecryptor final : public MediaDataDecoder, MOZ_ASSERT(mDecodePromise.IsEmpty() && !mDecodeRequest.Exists(), "Must wait for decoding to complete"); for (auto iter = mDecrypts.Iter(); !iter.Done(); iter.Next()) { - auto holder = iter.UserData(); + auto* holder = iter.UserData(); holder->DisconnectIfExists(); iter.Remove(); } @@ -323,7 +323,7 @@ RefPtr EMEMediaDataDecoderProxy::Decode( mSamplesWaitingForKey->WaitIfKeyNotUsable(sample) ->Then( mThread, __func__, - [self, this](RefPtr aSample) { + [self, this](const RefPtr& aSample) { mKeyRequest.Complete(); MediaDataDecoderProxy::Decode(aSample) diff --git 
a/dom/media/platforms/agnostic/gmp/GMPDecoderModule.cpp b/dom/media/platforms/agnostic/gmp/GMPDecoderModule.cpp index f01c7e94e4..e9c41be1f0 100644 --- a/dom/media/platforms/agnostic/gmp/GMPDecoderModule.cpp +++ b/dom/media/platforms/agnostic/gmp/GMPDecoderModule.cpp @@ -6,6 +6,9 @@ #include "GMPDecoderModule.h" +#ifdef MOZ_AV1 +# include "AOMDecoder.h" +#endif #include "DecoderDoctorDiagnostics.h" #include "GMPService.h" #include "GMPUtils.h" @@ -43,6 +46,9 @@ static already_AddRefed CreateDecoderWrapper( already_AddRefed GMPDecoderModule::CreateVideoDecoder( const CreateDecoderParams& aParams) { if (!MP4Decoder::IsH264(aParams.mConfig.mMimeType) && +#ifdef MOZ_AV1 + !AOMDecoder::IsAV1(aParams.mConfig.mMimeType) && +#endif !VPXDecoder::IsVP8(aParams.mConfig.mMimeType) && !VPXDecoder::IsVP9(aParams.mConfig.mMimeType)) { return nullptr; @@ -63,6 +69,10 @@ media::DecodeSupportSet GMPDecoderModule::SupportsMimeType( AutoTArray tags; if (MP4Decoder::IsH264(aMimeType)) { tags.AppendElement("h264"_ns); +#ifdef MOZ_AV1 + } else if (AOMDecoder::IsAV1(aMimeType)) { + tags.AppendElement("av1"_ns); +#endif } else if (VPXDecoder::IsVP9(aMimeType)) { tags.AppendElement("vp9"_ns); } else if (VPXDecoder::IsVP8(aMimeType)) { diff --git a/dom/media/platforms/apple/AppleATDecoder.cpp b/dom/media/platforms/apple/AppleATDecoder.cpp index ed64b62d60..3065ac0c27 100644 --- a/dom/media/platforms/apple/AppleATDecoder.cpp +++ b/dom/media/platforms/apple/AppleATDecoder.cpp @@ -14,6 +14,9 @@ #include "mozilla/SyncRunnable.h" #include "mozilla/UniquePtr.h" #include "nsTArray.h" +#include "ADTSDemuxer.h" + +#include #define LOG(...) DDMOZ_LOG(sPDMLog, mozilla::LogLevel::Debug, __VA_ARGS__) #define LOGEX(_this, ...) \ @@ -62,6 +65,7 @@ AppleATDecoder::~AppleATDecoder() { RefPtr AppleATDecoder::Init() { if (!mFormatID) { + LOG("AppleATDecoder::Init failure: unknown format ID"); return InitPromise::CreateAndReject( MediaResult(NS_ERROR_DOM_MEDIA_FATAL_ERR, RESULT_DETAIL("Non recognised format")), @@ -85,6 +89,7 @@ RefPtr AppleATDecoder::Flush() { } } if (mErrored) { + LOG("Flush error"); mParsedFramesForAACMagicCookie = 0; mMagicCookie.Clear(); ProcessShutdown(); @@ -188,18 +193,28 @@ RefPtr AppleATDecoder::Decode( MediaResult rv = NS_OK; if (!mConverter) { + LOG("Lazily initing the decoder"); rv = SetupDecoder(aSample); if (rv != NS_OK && rv != NS_ERROR_NOT_INITIALIZED) { + LOG("Decoder not initialized"); return DecodePromise::CreateAndReject(rv, __func__); } } + if (mIsADTS) { + bool rv = ADTS::StripHeader(aSample); + if (!rv) { + LOG("Stripping the ADTS header in AppleATDecoder failed"); + } + } + mQueuedSamples.AppendElement(aSample); if (rv == NS_OK) { for (size_t i = 0; i < mQueuedSamples.Length(); i++) { rv = DecodeSample(mQueuedSamples[i]); if (NS_FAILED(rv)) { + LOG("Decoding error"); mErrored = true; return DecodePromise::CreateAndReject(rv, __func__); } @@ -277,7 +292,7 @@ MediaResult AppleATDecoder::DecodeSample(MediaRawData* aSample) { } size_t numFrames = outputData.Length() / channels; - int rate = mOutputFormat.mSampleRate; + int rate = AssertedCast(mOutputFormat.mSampleRate); media::TimeUnit duration(numFrames, rate); if (!duration.IsValid()) { NS_WARNING("Invalid count of accumulated audio samples"); @@ -340,8 +355,8 @@ MediaResult AppleATDecoder::GetInputAudioDescription( aDesc.mChannelsPerFrame = mConfig.mChannels; aDesc.mSampleRate = mConfig.mRate; UInt32 inputFormatSize = sizeof(aDesc); - OSStatus rv = AudioFormatGetProperty(kAudioFormatProperty_FormatInfo, 0, NULL, - &inputFormatSize, &aDesc); + 
OSStatus rv = AudioFormatGetProperty(kAudioFormatProperty_FormatInfo, 0, + nullptr, &inputFormatSize, &aDesc); if (NS_WARN_IF(rv)) { return MediaResult( NS_ERROR_FAILURE, @@ -419,7 +434,7 @@ nsresult AppleATDecoder::SetupChannelLayout() { UInt32 propertySize; UInt32 size; OSStatus status = AudioConverterGetPropertyInfo( - mConverter, kAudioConverterOutputChannelLayout, &propertySize, NULL); + mConverter, kAudioConverterOutputChannelLayout, &propertySize, nullptr); if (status || !propertySize) { LOG("Couldn't get channel layout property (%s)", FourCC2Str(status)); return NS_ERROR_FAILURE; @@ -504,15 +519,36 @@ MediaResult AppleATDecoder::SetupDecoder(MediaRawData* aSample) { MOZ_ASSERT(mThread->IsOnCurrentThread()); static const uint32_t MAX_FRAMES = 2; + bool isADTS = + ADTS::FrameHeader::MatchesSync(Span{aSample->Data(), aSample->Size()}); + + if (isADTS) { + ADTS::FrameParser parser; + if (!parser.Parse(0, aSample->Data(), aSample->Data() + aSample->Size())) { + LOG("ADTS frame parsing error"); + return NS_ERROR_NOT_INITIALIZED; + } + + AudioCodecSpecificBinaryBlob blob; + ADTS::InitAudioSpecificConfig(parser.FirstFrame(), blob.mBinaryBlob); + mConfig.mCodecSpecificConfig = AudioCodecSpecificVariant{std::move(blob)}; + mConfig.mProfile = mConfig.mExtendedProfile = + parser.FirstFrame().Header().mObjectType; + mIsADTS = true; + } + if (mFormatID == kAudioFormatMPEG4AAC && mConfig.mExtendedProfile == 2 && mParsedFramesForAACMagicCookie < MAX_FRAMES) { + LOG("Attempting to get implicit AAC magic cookie"); // Check for implicit SBR signalling if stream is AAC-LC // This will provide us with an updated magic cookie for use with // GetInputAudioDescription. if (NS_SUCCEEDED(GetImplicitAACMagicCookie(aSample)) && - !mMagicCookie.Length()) { + !mMagicCookie.Length() && !isADTS) { // nothing found yet, will try again later + LOG("Getting implicit AAC magic cookie failed"); mParsedFramesForAACMagicCookie++; + LOG("Not initialized -- need magic cookie"); return NS_ERROR_NOT_INITIALIZED; } // An error occurred, fallback to using default stream description @@ -538,6 +574,7 @@ MediaResult AppleATDecoder::SetupDecoder(MediaRawData* aSample) { MediaResult rv = GetInputAudioDescription(inputFormat, magicCookie); if (NS_FAILED(rv)) { + LOG("GetInputAudioDescription failure"); return rv; } // Fill in the output format manually. @@ -617,28 +654,41 @@ static void _SampleCallback(void* aSBR, UInt32 aNumBytes, UInt32 aNumPackets, const void* aData, AudioStreamPacketDescription* aPackets) {} -nsresult AppleATDecoder::GetImplicitAACMagicCookie( - const MediaRawData* aSample) { +nsresult AppleATDecoder::GetImplicitAACMagicCookie(MediaRawData* aSample) { MOZ_ASSERT(mThread->IsOnCurrentThread()); - // Prepend ADTS header to AAC audio. - RefPtr adtssample(aSample->Clone()); - if (!adtssample) { - return NS_ERROR_OUT_OF_MEMORY; - } - int8_t frequency_index = Adts::GetFrequencyIndex(mConfig.mRate); + bool isADTS = + ADTS::FrameHeader::MatchesSync(Span{aSample->Data(), aSample->Size()}); - bool rv = Adts::ConvertSample(mConfig.mChannels, frequency_index, - mConfig.mProfile, adtssample); - if (!rv) { - NS_WARNING("Failed to apply ADTS header"); - return NS_ERROR_FAILURE; + RefPtr adtssample = aSample; + + if (!isADTS) { + // Prepend ADTS header to AAC audio. 
+ adtssample = aSample->Clone(); + if (!adtssample) { + return NS_ERROR_OUT_OF_MEMORY; + } + auto frequency_index = ADTS::GetFrequencyIndex(mConfig.mRate); + + if (frequency_index.isErr()) { + LOG("%d isn't a valid rate for AAC", mConfig.mRate); + return NS_ERROR_FAILURE; + } + + // Arbitrarily pick main profile if not specified + int profile = mConfig.mProfile ? mConfig.mProfile : 1; + bool rv = ADTS::ConvertSample(mConfig.mChannels, frequency_index.unwrap(), + profile, adtssample); + if (!rv) { + LOG("Failed to apply ADTS header"); + return NS_ERROR_FAILURE; + } } if (!mStream) { OSStatus rv = AudioFileStreamOpen(this, _MetadataCallback, _SampleCallback, kAudioFileAAC_ADTSType, &mStream); if (rv) { - NS_WARNING("Couldn't open AudioFileStream"); + LOG("Couldn't open AudioFileStream"); return NS_ERROR_FAILURE; } } @@ -646,7 +696,7 @@ nsresult AppleATDecoder::GetImplicitAACMagicCookie( OSStatus status = AudioFileStreamParseBytes( mStream, adtssample->Size(), adtssample->Data(), 0 /* discontinuity */); if (status) { - NS_WARNING("Couldn't parse sample"); + LOG("Couldn't parse sample"); } if (status || mFileStreamError || mMagicCookie.Length()) { diff --git a/dom/media/platforms/apple/AppleATDecoder.h b/dom/media/platforms/apple/AppleATDecoder.h index d7aba2aacb..392b39993f 100644 --- a/dom/media/platforms/apple/AppleATDecoder.h +++ b/dom/media/platforms/apple/AppleATDecoder.h @@ -38,7 +38,7 @@ class AppleATDecoder final : public MediaDataDecoder, nsCString GetCodecName() const override; // Callbacks also need access to the config. - const AudioInfo mConfig; + AudioInfo mConfig; // Use to extract magic cookie for HE-AAC detection. nsTArray mMagicCookie; @@ -67,11 +67,12 @@ class AppleATDecoder final : public MediaDataDecoder, // Setup AudioConverter once all information required has been gathered. // Will return NS_ERROR_NOT_INITIALIZED if more data is required. 
MediaResult SetupDecoder(MediaRawData* aSample); - nsresult GetImplicitAACMagicCookie(const MediaRawData* aSample); + nsresult GetImplicitAACMagicCookie(MediaRawData* aSample); nsresult SetupChannelLayout(); uint32_t mParsedFramesForAACMagicCookie; uint32_t mEncoderDelay = 0; uint64_t mTotalMediaFrames = 0; + bool mIsADTS = false; bool mErrored; }; diff --git a/dom/media/platforms/apple/AppleDecoderModule.cpp b/dom/media/platforms/apple/AppleDecoderModule.cpp index 520685fff6..c54593a495 100644 --- a/dom/media/platforms/apple/AppleDecoderModule.cpp +++ b/dom/media/platforms/apple/AppleDecoderModule.cpp @@ -124,8 +124,7 @@ DecodeSupportSet AppleDecoderModule::Supports( case MediaCodec::VP8: [[fallthrough]]; case MediaCodec::VP9: - if (StaticPrefs::media_ffvpx_enabled() && - StaticPrefs::media_rdd_vpx_enabled() && + if (StaticPrefs::media_rdd_vpx_enabled() && StaticPrefs::media_utility_ffvpx_enabled()) { dss += DecodeSupport::SoftwareDecode; } @@ -233,6 +232,7 @@ bool AppleDecoderModule::CanCreateHWDecoder(MediaCodec aCodec) { /* static */ bool AppleDecoderModule::RegisterSupplementalVP9Decoder() { +#ifdef XP_MACOSX static bool sRegisterIfAvailable = []() { if (__builtin_available(macos 11.0, *)) { VTRegisterSupplementalVideoDecoderIfAvailable(kCMVideoCodecType_VP9); @@ -241,6 +241,9 @@ bool AppleDecoderModule::RegisterSupplementalVP9Decoder() { return false; }(); return sRegisterIfAvailable; +#else // iOS + return false; +#endif } /* static */ diff --git a/dom/media/platforms/apple/AppleVTDecoder.cpp b/dom/media/platforms/apple/AppleVTDecoder.cpp index aae9c1fc9b..ae34c2d142 100644 --- a/dom/media/platforms/apple/AppleVTDecoder.cpp +++ b/dom/media/platforms/apple/AppleVTDecoder.cpp @@ -7,7 +7,7 @@ #include "AppleVTDecoder.h" #include -#include +#include #include #include "AppleDecoderModule.h" @@ -486,7 +486,6 @@ void AppleVTDecoder::OutputFrame(CVPixelBufferRef aImage, // Unlock the returned image data. CVPixelBufferUnlockBaseAddress(aImage, kCVPixelBufferLock_ReadOnly); } else { -#ifndef MOZ_WIDGET_UIKIT // Set pixel buffer properties on aImage before we extract its surface. // This ensures that we can use defined enums to set values instead // of later setting magic CFSTR values on the surface itself. 
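The ADTS handling added above (Adts.cpp and AppleATDecoder.cpp) keys off two properties of the 7- or 9-byte ADTS header: the sync pattern at the start of a frame and the protection-absent bit that decides whether a 2-byte CRC follows the fixed header. A minimal standalone sketch of those two checks, mirroring ADTS::FrameHeader::MatchesSync and the StripHeader size computation from this patch (the sketch itself is illustrative and not part of the patch):

#include <cstddef>
#include <cstdint>

// Sync check: first byte 0xFF, top nibble of the second byte 0xF (12-bit
// 0xFFF sync word) and the two layer bits zero; the protection-absent bit
// is left free, matching ADTS::FrameHeader::MatchesSync.
static bool MatchesAdtsSync(const uint8_t* aData, size_t aLength) {
  return aLength >= 2 && aData[0] == 0xFF && (aData[1] & 0xF6) == 0xF0;
}

// Header size: 7 bytes, or 9 when protection-absent == 0 (CRC present),
// the same 7/9 split StripHeader uses when removing the header.
static size_t AdtsHeaderSize(const uint8_t* aData) {
  const bool haveCrc = !(aData[1] & 0x01);
  return haveCrc ? 9 : 7;
}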
@@ -535,9 +534,6 @@ void AppleVTDecoder::OutputFrame(CVPixelBufferRef aImage, info.mDisplay, aFrameRef.byte_offset, aFrameRef.composition_timestamp, aFrameRef.duration, image.forget(), aFrameRef.is_sync_point, aFrameRef.decode_timestamp); -#else - MOZ_ASSERT_UNREACHABLE("No MacIOSurface on iOS"); -#endif } if (!data) { @@ -719,7 +715,6 @@ CFDictionaryRef AppleVTDecoder::CreateOutputConfiguration() { &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks); } -#ifndef MOZ_WIDGET_UIKIT // Output format type: bool is10Bit = (gfx::BitDepthForColorDepth(mColorDepth) == 10); @@ -754,9 +749,6 @@ CFDictionaryRef AppleVTDecoder::CreateOutputConfiguration() { return CFDictionaryCreate( kCFAllocatorDefault, outputKeys, outputValues, ArrayLength(outputKeys), &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks); -#else - MOZ_ASSERT_UNREACHABLE("No MacIOSurface on iOS"); -#endif } } // namespace mozilla diff --git a/dom/media/platforms/ffmpeg/FFmpegAudioDecoder.cpp b/dom/media/platforms/ffmpeg/FFmpegAudioDecoder.cpp index 43041f81ea..1e8e488e25 100644 --- a/dom/media/platforms/ffmpeg/FFmpegAudioDecoder.cpp +++ b/dom/media/platforms/ffmpeg/FFmpegAudioDecoder.cpp @@ -164,7 +164,7 @@ static AlignedAudioBuffer CopyAndPackAudio(AVFrame* aFrame, int16_t* data = reinterpret_cast(aFrame->data)[0]; for (uint32_t frame = 0; frame < aNumAFrames; frame++) { for (uint32_t channel = 0; channel < aNumChannels; channel++) { - *tmp++ = AudioSampleToFloat(*data++); + *tmp++ = ConvertAudioSample(*data++); } } } else if (aFrame->format == AV_SAMPLE_FMT_S16P) { @@ -174,7 +174,7 @@ static AlignedAudioBuffer CopyAndPackAudio(AVFrame* aFrame, int16_t** data = reinterpret_cast(aFrame->data); for (uint32_t frame = 0; frame < aNumAFrames; frame++) { for (uint32_t channel = 0; channel < aNumChannels; channel++) { - *tmp++ = AudioSampleToFloat(data[channel][frame]); + *tmp++ = ConvertAudioSample(data[channel][frame]); } } } else if (aFrame->format == AV_SAMPLE_FMT_S32) { @@ -183,7 +183,7 @@ static AlignedAudioBuffer CopyAndPackAudio(AVFrame* aFrame, int32_t* data = reinterpret_cast(aFrame->data)[0]; for (uint32_t frame = 0; frame < aNumAFrames; frame++) { for (uint32_t channel = 0; channel < aNumChannels; channel++) { - *tmp++ = AudioSampleToFloat(*data++); + *tmp++ = ConvertAudioSample(*data++); } } } else if (aFrame->format == AV_SAMPLE_FMT_S32P) { @@ -193,7 +193,7 @@ static AlignedAudioBuffer CopyAndPackAudio(AVFrame* aFrame, int32_t** data = reinterpret_cast(aFrame->data); for (uint32_t frame = 0; frame < aNumAFrames; frame++) { for (uint32_t channel = 0; channel < aNumChannels; channel++) { - *tmp++ = AudioSampleToFloat(data[channel][frame]); + *tmp++ = ConvertAudioSample(data[channel][frame]); } } } else if (aFrame->format == AV_SAMPLE_FMT_U8) { @@ -202,7 +202,7 @@ static AlignedAudioBuffer CopyAndPackAudio(AVFrame* aFrame, uint8_t* data = reinterpret_cast(aFrame->data)[0]; for (uint32_t frame = 0; frame < aNumAFrames; frame++) { for (uint32_t channel = 0; channel < aNumChannels; channel++) { - *tmp++ = UInt8bitToAudioSample(*data++); + *tmp++ = ConvertAudioSample(*data++); } } } else if (aFrame->format == AV_SAMPLE_FMT_U8P) { @@ -212,7 +212,7 @@ static AlignedAudioBuffer CopyAndPackAudio(AVFrame* aFrame, uint8_t** data = reinterpret_cast(aFrame->data); for (uint32_t frame = 0; frame < aNumAFrames; frame++) { for (uint32_t channel = 0; channel < aNumChannels; channel++) { - *tmp++ = UInt8bitToAudioSample(data[channel][frame]); + *tmp++ = ConvertAudioSample(data[channel][frame]); } } } diff --git 
a/dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp b/dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp index 4a30f2dd2d..bfb3105a57 100644 --- a/dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp +++ b/dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp @@ -177,9 +177,6 @@ FFmpegLibWrapper::LinkResult FFmpegLibWrapper::Link() { AV_FUNC(av_packet_alloc, (AV_FUNC_57 | AV_FUNC_58 | AV_FUNC_59 | AV_FUNC_60)) AV_FUNC(av_packet_unref, (AV_FUNC_57 | AV_FUNC_58 | AV_FUNC_59 | AV_FUNC_60)) AV_FUNC(av_packet_free, (AV_FUNC_57 | AV_FUNC_58 | AV_FUNC_59 | AV_FUNC_60)) - AV_FUNC_OPTION(av_rdft_init, AV_FUNC_AVCODEC_ALL) - AV_FUNC_OPTION(av_rdft_calc, AV_FUNC_AVCODEC_ALL) - AV_FUNC_OPTION(av_rdft_end, AV_FUNC_AVCODEC_ALL) AV_FUNC(avcodec_descriptor_get, AV_FUNC_AVCODEC_ALL) AV_FUNC(av_log_set_level, AV_FUNC_AVUTIL_ALL) AV_FUNC(av_malloc, AV_FUNC_AVUTIL_ALL) @@ -254,6 +251,10 @@ FFmpegLibWrapper::LinkResult FFmpegLibWrapper::Link() { AV_FUNC_AVUTIL_59 | AV_FUNC_AVUTIL_60) #endif + + AV_FUNC_OPTION(av_tx_init, AV_FUNC_AVUTIL_ALL) + AV_FUNC_OPTION(av_tx_uninit, AV_FUNC_AVUTIL_ALL) + #undef AV_FUNC #undef AV_FUNC_OPTION diff --git a/dom/media/platforms/ffmpeg/FFmpegLibWrapper.h b/dom/media/platforms/ffmpeg/FFmpegLibWrapper.h index 98ab2f7930..eacbba286a 100644 --- a/dom/media/platforms/ffmpeg/FFmpegLibWrapper.h +++ b/dom/media/platforms/ffmpeg/FFmpegLibWrapper.h @@ -5,9 +5,9 @@ #ifndef __FFmpegLibWrapper_h__ #define __FFmpegLibWrapper_h__ -#include "FFmpegRDFTTypes.h" // for AvRdftInitFn, etc. #include "mozilla/Attributes.h" #include "mozilla/Types.h" +#include "ffvpx/tx.h" struct AVCodec; struct AVCodecContext; @@ -148,11 +148,6 @@ struct MOZ_ONLY_USED_TO_AVOID_STATIC_CONSTRUCTORS FFmpegLibWrapper { int (*avcodec_send_frame)(AVCodecContext* avctx, const AVFrame* frame); int (*avcodec_receive_frame)(AVCodecContext* avctx, AVFrame* frame); - // libavcodec optional - AvRdftInitFn av_rdft_init; - AvRdftCalcFn av_rdft_calc; - AvRdftEndFn av_rdft_end; - // libavutil void (*av_log_set_level)(int level); void* (*av_malloc)(size_t size); @@ -216,6 +211,10 @@ struct MOZ_ONLY_USED_TO_AVOID_STATIC_CONSTRUCTORS FFmpegLibWrapper { void* (*vaGetDisplayDRM)(int fd); #endif + // Only ever used with ffvpx + decltype(::av_tx_init)* av_tx_init; + decltype(::av_tx_uninit)* av_tx_uninit; + PRLibrary* mAVCodecLib; PRLibrary* mAVUtilLib; #ifdef MOZ_WIDGET_GTK diff --git a/dom/media/platforms/ffmpeg/FFmpegRDFTTypes.h b/dom/media/platforms/ffmpeg/FFmpegRDFTTypes.h deleted file mode 100644 index cb3e2476fb..0000000000 --- a/dom/media/platforms/ffmpeg/FFmpegRDFTTypes.h +++ /dev/null @@ -1,34 +0,0 @@ -/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ -/* vim: set ts=8 sts=2 et sw=2 tw=80: */ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at https://mozilla.org/MPL/2.0/. 
*/ - -#ifndef FFmpegRDFTTypes_h -#define FFmpegRDFTTypes_h - -struct RDFTContext; - -typedef float FFTSample; - -enum RDFTransformType { - DFT_R2C, - IDFT_C2R, - IDFT_R2C, - DFT_C2R, -}; - -extern "C" { - -typedef RDFTContext* (*AvRdftInitFn)(int nbits, enum RDFTransformType trans); -typedef void (*AvRdftCalcFn)(RDFTContext* s, FFTSample* data); -typedef void (*AvRdftEndFn)(RDFTContext* s); -} - -struct FFmpegRDFTFuncs { - AvRdftInitFn init; - AvRdftCalcFn calc; - AvRdftEndFn end; -}; - -#endif // FFmpegRDFTTypes_h diff --git a/dom/media/platforms/ffmpeg/FFmpegVideoEncoder.cpp b/dom/media/platforms/ffmpeg/FFmpegVideoEncoder.cpp index dcc3d9a88d..a3cfdf1b1d 100644 --- a/dom/media/platforms/ffmpeg/FFmpegVideoEncoder.cpp +++ b/dom/media/platforms/ffmpeg/FFmpegVideoEncoder.cpp @@ -504,128 +504,64 @@ MediaResult FFmpegVideoEncoder::InitInternal() { mCodecContext->flags |= AV_CODEC_FLAG_FRAME_DURATION; #endif mCodecContext->gop_size = static_cast(mConfig.mKeyframeInterval); - // TODO (bug 1872871): Move the following extra settings to some helpers - // instead. + if (mConfig.mUsage == MediaDataEncoder::Usage::Realtime) { mLib->av_opt_set(mCodecContext->priv_data, "deadline", "realtime", 0); // Explicitly ask encoder do not keep in flight at any one time for // lookahead purposes. mLib->av_opt_set(mCodecContext->priv_data, "lag-in-frames", "0", 0); } - // Apply SVC settings. - if (Maybe svc = - GetVPXSVCSetting(mConfig.mScalabilityMode, mConfig.mBitrate)) { - // For libvpx. - if (mCodecName == "libvpx" || mCodecName == "libvpx-vp9") { - // Show a warning if mScalabilityMode mismatches mNumTemporalLayers - if (mConfig.mCodecSpecific) { - if (mConfig.mCodecSpecific->is() || - mConfig.mCodecSpecific->is()) { - const uint8_t numTemporalLayers = - mConfig.mCodecSpecific->is() - ? mConfig.mCodecSpecific->as().mNumTemporalLayers - : mConfig.mCodecSpecific->as() - .mNumTemporalLayers; - if (numTemporalLayers != svc->mNumberLayers) { - FFMPEGV_LOG( - "Force using %zu layers defined in scalability mode instead of " - "the %u layers defined in VP8/9Specific", - svc->mNumberLayers, numTemporalLayers); - } - } - } - // Set ts_layering_mode. - nsPrintfCString parameters("ts_layering_mode=%u", svc->mLayeringMode); - // Set ts_target_bitrate. - parameters.Append(":ts_target_bitrate="); - for (size_t i = 0; i < svc->mTargetBitrates.Length(); ++i) { - if (i > 0) { - parameters.Append(","); - } - parameters.AppendPrintf("%d", svc->mTargetBitrates[i]); - } - // TODO: Set ts_number_layers, ts_periodicity, ts_layer_id and - // ts_rate_decimator if they are different from the preset values in - // ts_layering_mode. - - // Set parameters into ts-parameters. - mLib->av_opt_set(mCodecContext->priv_data, "ts-parameters", - parameters.get(), 0); - - // FFmpegVideoEncoder would be reset after Drain(), so mSVCInfo should be - // reset() before emplace(). - mSVCInfo.reset(); - mSVCInfo.emplace(std::move(svc->mLayerIds)); - - // TODO: layer settings should be changed dynamically when the frame's - // color space changed. - } else { - FFMPEGV_LOG("SVC setting is not implemented for %s codec", - mCodecName.get()); - } + if (Maybe settings = GetSVCSettings()) { + SVCSettings s = settings.extract(); + mLib->av_opt_set(mCodecContext->priv_data, s.mSettingKeyValue.first.get(), + s.mSettingKeyValue.second.get(), 0); + + // FFmpegVideoEncoder is reset after Drain(), so mSVCInfo should be reset() + // before emplace(). 
+ mSVCInfo.reset(); + mSVCInfo.emplace(std::move(s.mTemporalLayerIds)); + + // TODO: layer settings should be changed dynamically when the frame's + // color space changed. } - // Apply codec specific settings. - nsAutoCString codecSpecificLog; - if (mConfig.mCodecSpecific) { - if (mConfig.mCodecSpecific->is()) { - // For libx264. - if (mCodecName == "libx264") { - codecSpecificLog.Append(", H264:"); - - const H264Specific& specific = - mConfig.mCodecSpecific->as(); - - // Set profile. - Maybe profile = GetH264Profile(specific.mProfile); - if (!profile) { - FFMPEGV_LOG("failed to get h264 profile"); - return MediaResult(NS_ERROR_DOM_MEDIA_NOT_SUPPORTED_ERR, - RESULT_DETAIL("H264 profile is unknown")); - } - codecSpecificLog.Append( - nsPrintfCString(" profile - %d", profile->mValue)); - mCodecContext->profile = profile->mValue; - if (!profile->mString.IsEmpty()) { - codecSpecificLog.AppendPrintf(" (%s)", profile->mString.get()); - mLib->av_opt_set(mCodecContext->priv_data, "profile", - profile->mString.get(), 0); - } - - // Set level. - Maybe level = GetH264Level(specific.mLevel); - if (!level) { - FFMPEGV_LOG("failed to get h264 level"); - return MediaResult(NS_ERROR_DOM_MEDIA_NOT_SUPPORTED_ERR, - RESULT_DETAIL("H264 level is unknown")); - } - codecSpecificLog.AppendPrintf(", level %d (%s)", level->mValue, - level->mString.get()); - mCodecContext->level = level->mValue; - MOZ_ASSERT(!level->mString.IsEmpty()); - mLib->av_opt_set(mCodecContext->priv_data, "level", - level->mString.get(), 0); - - // Set format: libx264's default format is annexb - if (specific.mFormat == H264BitStreamFormat::AVC) { - codecSpecificLog.Append(", AVCC"); - mLib->av_opt_set(mCodecContext->priv_data, "x264-params", "annexb=0", - 0); - // mCodecContext->flags |= AV_CODEC_FLAG_GLOBAL_HEADER - // if we don't want to append SPS/PPS data in all keyframe - // (LIBAVCODEC_VERSION_MAJOR >= 57 only). - } else { - codecSpecificLog.Append(", AnnexB"); - // Set annexb explicitly even if it's default format. - mLib->av_opt_set(mCodecContext->priv_data, "x264-params", "annexb=1", - 0); - } - } else { - FFMPEGV_LOG("H264 settings is not implemented for codec %s ", - mCodecName.get()); + + nsAutoCString h264Log; + if (mConfig.mCodecSpecific && mConfig.mCodecSpecific->is()) { + // TODO: Set profile, level, avcc/annexb for openh264 and others. + if (mCodecName == "libx264") { + const H264Specific& h264Specific = + mConfig.mCodecSpecific->as(); + H264Settings s = GetH264Settings(h264Specific); + mCodecContext->profile = s.mProfile; + mCodecContext->level = s.mLevel; + for (const auto& pair : s.mSettingKeyValuePairs) { + mLib->av_opt_set(mCodecContext->priv_data, pair.first.get(), + pair.second.get(), 0); } + + // Log the settings. + // When using profile other than EXTENDED, the profile string is in the + // first element of mSettingKeyValuePairs, while EXTENDED profile has no + // profile string. + + MOZ_ASSERT_IF( + s.mSettingKeyValuePairs.Length() != 3, + h264Specific.mProfile == H264_PROFILE::H264_PROFILE_EXTENDED); + const char* profileStr = s.mSettingKeyValuePairs.Length() == 3 + ? s.mSettingKeyValuePairs[0].second.get() + : "extended"; + const char* levelStr = s.mSettingKeyValuePairs.Length() == 3 + ? s.mSettingKeyValuePairs[1].second.get() + : s.mSettingKeyValuePairs[0].second.get(); + const char* formatStr = + h264Specific.mFormat == H264BitStreamFormat::AVC ? 
"AVCC" : "AnnexB"; + h264Log.AppendPrintf(", H264: profile - %d (%s), level %d (%s), %s", + mCodecContext->profile, profileStr, + mCodecContext->level, levelStr, formatStr); } } + // TODO: keyint_min, max_b_frame? // - if mConfig.mDenoising is set: av_opt_set_int(mCodecContext->priv_data, // "noise_sensitivity", x, 0), where the x is from 0(disabled) to 6. @@ -657,7 +593,7 @@ MediaResult FFmpegVideoEncoder::InitInternal() { static_cast(mCodecContext->bit_rate), mCodecContext->width, mCodecContext->height, mCodecContext->time_base.num, mCodecContext->time_base.den, - codecSpecificLog.IsEmpty() ? "" : codecSpecificLog.get()); + h264Log.IsEmpty() ? "" : h264Log.get()); return MediaResult(NS_OK); } @@ -1152,4 +1088,99 @@ void FFmpegVideoEncoder::ForceEnablingFFmpegDebugLogs() { #endif // DEBUG } +Maybe::SVCSettings> +FFmpegVideoEncoder::GetSVCSettings() { + MOZ_ASSERT(!mCodecName.IsEmpty()); + + // TODO: Add support for AV1 and H264. + if (mCodecName != "libvpx" && mCodecName != "libvpx-vp9") { + FFMPEGV_LOG("SVC setting is not implemented for %s codec", + mCodecName.get()); + return Nothing(); + } + + Maybe svc = + GetVPXSVCSetting(mConfig.mScalabilityMode, mConfig.mBitrate); + if (!svc) { + FFMPEGV_LOG("No SVC settings obtained. Skip"); + return Nothing(); + } + + // Check if the number of temporal layers in codec specific settings matches + // the number of layers for the given scalability mode. + + auto GetNumTemporalLayers = [&]() -> uint8_t { + uint8_t layers = 0; + if (mConfig.mCodecSpecific) { + if (mConfig.mCodecSpecific->is()) { + layers = mConfig.mCodecSpecific->as().mNumTemporalLayers; + MOZ_ASSERT(layers > 0); + } else if (mConfig.mCodecSpecific->is()) { + layers = mConfig.mCodecSpecific->as().mNumTemporalLayers; + MOZ_ASSERT(layers > 0); + } + } + return layers; + }; + + DebugOnly numTemporalLayers = GetNumTemporalLayers(); + MOZ_ASSERT_IF(numTemporalLayers > 0, numTemporalLayers == svc->mNumberLayers); + + // Form an SVC setting string for libvpx. + + nsPrintfCString parameters("ts_layering_mode=%u", svc->mLayeringMode); + parameters.Append(":ts_target_bitrate="); + for (size_t i = 0; i < svc->mTargetBitrates.Length(); ++i) { + if (i > 0) { + parameters.Append(","); + } + parameters.AppendPrintf("%d", svc->mTargetBitrates[i]); + } + + // TODO: Set ts_number_layers, ts_periodicity, ts_layer_id and + // ts_rate_decimator if they are different from the preset values in + // ts_layering_mode. + + return Some( + SVCSettings{std::move(svc->mLayerIds), + std::make_pair("ts-parameters"_ns, std::move(parameters))}); +} + +FFmpegVideoEncoder::H264Settings FFmpegVideoEncoder< + LIBAV_VER>::GetH264Settings(const H264Specific& aH264Specific) { + MOZ_ASSERT(mCodecName == "libx264", + "GetH264Settings is libx264-only for now"); + + nsTArray> keyValuePairs; + + Maybe profile = GetH264Profile(aH264Specific.mProfile); + MOZ_RELEASE_ASSERT(profile.isSome()); + if (!profile->mString.IsEmpty()) { + keyValuePairs.AppendElement(std::make_pair("profile"_ns, profile->mString)); + } else { + MOZ_RELEASE_ASSERT(aH264Specific.mProfile == + H264_PROFILE::H264_PROFILE_EXTENDED); + } + + Maybe level = GetH264Level(aH264Specific.mLevel); + MOZ_RELEASE_ASSERT(level.isSome()); + MOZ_RELEASE_ASSERT(!level->mString.IsEmpty()); + keyValuePairs.AppendElement(std::make_pair("level"_ns, level->mString)); + + // Set format: libx264's default format is annexb. 
+ if (aH264Specific.mFormat == H264BitStreamFormat::AVC) { + keyValuePairs.AppendElement(std::make_pair("x264-params"_ns, "annexb=0")); + // mCodecContext->flags |= AV_CODEC_FLAG_GLOBAL_HEADER + // if we don't want to append SPS/PPS data in all keyframe + // (LIBAVCODEC_VERSION_MAJOR >= 57 only). + } else { + // Set annexb explicitly even if it's default format. + keyValuePairs.AppendElement(std::make_pair("x264-params"_ns, "annexb=1")); + } + + return H264Settings{.mProfile = profile->mValue, + .mLevel = level->mValue, + .mSettingKeyValuePairs = std::move(keyValuePairs)}; +} + } // namespace mozilla diff --git a/dom/media/platforms/ffmpeg/FFmpegVideoEncoder.h b/dom/media/platforms/ffmpeg/FFmpegVideoEncoder.h index 1bcdd3eaf9..07c433ddd7 100644 --- a/dom/media/platforms/ffmpeg/FFmpegVideoEncoder.h +++ b/dom/media/platforms/ffmpeg/FFmpegVideoEncoder.h @@ -75,6 +75,19 @@ class FFmpegVideoEncoder final : public MediaDataEncoder { Result, nsresult> GetExtraData( AVPacket* aPacket); void ForceEnablingFFmpegDebugLogs(); + struct SVCSettings { + nsTArray mTemporalLayerIds; + // A key-value pair for av_opt_set. + std::pair mSettingKeyValue; + }; + Maybe GetSVCSettings(); + struct H264Settings { + int mProfile; + int mLevel; + // A list of key-value pairs for av_opt_set. + nsTArray> mSettingKeyValuePairs; + }; + H264Settings GetH264Settings(const H264Specific& aH264Specific); // This refers to a static FFmpegLibWrapper, so raw pointer is adequate. const FFmpegLibWrapper* mLib; diff --git a/dom/media/platforms/ffmpeg/ffvpx/FFVPXRuntimeLinker.cpp b/dom/media/platforms/ffmpeg/ffvpx/FFVPXRuntimeLinker.cpp index ba9ca4834e..dfc8244f1d 100644 --- a/dom/media/platforms/ffmpeg/ffvpx/FFVPXRuntimeLinker.cpp +++ b/dom/media/platforms/ffmpeg/ffvpx/FFVPXRuntimeLinker.cpp @@ -145,19 +145,13 @@ already_AddRefed FFVPXRuntimeLinker::CreateEncoder() { } /* static */ -void FFVPXRuntimeLinker::GetRDFTFuncs(FFmpegRDFTFuncs* aOutFuncs) { +void FFVPXRuntimeLinker::GetFFTFuncs(FFmpegFFTFuncs* aOutFuncs) { []() MOZ_NO_THREAD_SAFETY_ANALYSIS { MOZ_ASSERT(sLinkStatus != LinkStatus_INIT); }(); - if (sFFVPXLib.av_rdft_init && sFFVPXLib.av_rdft_calc && - sFFVPXLib.av_rdft_end) { - aOutFuncs->init = sFFVPXLib.av_rdft_init; - aOutFuncs->calc = sFFVPXLib.av_rdft_calc; - aOutFuncs->end = sFFVPXLib.av_rdft_end; - } else { - NS_WARNING("RDFT functions expected but not found"); - *aOutFuncs = FFmpegRDFTFuncs(); // zero - } + MOZ_ASSERT(sFFVPXLib.av_tx_init && sFFVPXLib.av_tx_uninit); + aOutFuncs->init = sFFVPXLib.av_tx_init; + aOutFuncs->uninit = sFFVPXLib.av_tx_uninit; } } // namespace mozilla diff --git a/dom/media/platforms/ffmpeg/ffvpx/FFVPXRuntimeLinker.h b/dom/media/platforms/ffmpeg/ffvpx/FFVPXRuntimeLinker.h index e52f108272..dccd37c7da 100644 --- a/dom/media/platforms/ffmpeg/ffvpx/FFVPXRuntimeLinker.h +++ b/dom/media/platforms/ffmpeg/ffvpx/FFVPXRuntimeLinker.h @@ -11,8 +11,12 @@ #include "PlatformEncoderModule.h" #include "mozilla/StaticMutex.h" #include "mozilla/ThreadSafety.h" +#include "ffvpx/tx.h" -struct FFmpegRDFTFuncs; +struct FFmpegFFTFuncs { + decltype(av_tx_init)* init; + decltype(av_tx_uninit)* uninit; +}; namespace mozilla { @@ -23,7 +27,7 @@ class FFVPXRuntimeLinker { static already_AddRefed CreateEncoder(); // Call (on any thread) after Init(). - static void GetRDFTFuncs(FFmpegRDFTFuncs* aOutFuncs); + static void GetFFTFuncs(FFmpegFFTFuncs* aOutFuncs); private: // Provide critical-section for Init() and sLinkStatus. 
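With the av_rdft_* entry points removed from FFmpegLibWrapper, FFVPX consumers obtain the replacement transform API through the new GetFFTFuncs() shown above. A hedged usage sketch (not taken from the patch) of how a caller might fetch the function pointers once the linker is up, using only the names this patch introduces (FFmpegFFTFuncs, FFVPXRuntimeLinker::GetFFTFuncs):

#include "FFVPXRuntimeLinker.h"  // declares FFmpegFFTFuncs and pulls in ffvpx/tx.h

// Assumes FFVPXRuntimeLinker::Init() has already been called, as the header
// comment requires ("Call (on any thread) after Init()").
static FFmpegFFTFuncs LoadFftFuncs() {
  FFmpegFFTFuncs fftFuncs{};
  FFVPXRuntimeLinker::GetFFTFuncs(&fftFuncs);
  // fftFuncs.init / fftFuncs.uninit now point at the bundled av_tx_init /
  // av_tx_uninit, which replace the removed av_rdft_init/calc/end trio.
  return fftFuncs;
}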
diff --git a/dom/media/platforms/moz.build b/dom/media/platforms/moz.build index 34acf4e8d1..6f71c5cc12 100644 --- a/dom/media/platforms/moz.build +++ b/dom/media/platforms/moz.build @@ -47,19 +47,13 @@ DIRS += ["agnostic/bytestreams", "agnostic/eme", "agnostic/gmp", "omx"] if CONFIG["MOZ_WMF"]: DIRS += ["wmf"] -if CONFIG["MOZ_FFVPX"] or CONFIG["MOZ_FFMPEG"]: - # common code to either FFmpeg or FFVPX - EXPORTS += [ - "ffmpeg/FFmpegRDFTTypes.h", - ] - UNIFIED_SOURCES += [ - "ffmpeg/FFmpegLibWrapper.cpp", - ] +UNIFIED_SOURCES += [ + "ffmpeg/FFmpegLibWrapper.cpp", +] -if CONFIG["MOZ_FFVPX"]: - DIRS += [ - "ffmpeg/ffvpx", - ] +DIRS += [ + "ffmpeg/ffvpx", +] if CONFIG["MOZ_FFMPEG"]: DIRS += [ diff --git a/dom/media/platforms/wmf/MFCDMSession.cpp b/dom/media/platforms/wmf/MFCDMSession.cpp index cec783cbc6..b797898abb 100644 --- a/dom/media/platforms/wmf/MFCDMSession.cpp +++ b/dom/media/platforms/wmf/MFCDMSession.cpp @@ -304,8 +304,7 @@ void MFCDMSession::OnSessionKeyMessage( case MF_MEDIAKEYSESSION_MESSAGETYPE_INDIVIDUALIZATION_REQUEST: return dom::MediaKeyMessageType::Individualization_request; default: - MOZ_ASSERT_UNREACHABLE("Unknown session message type"); - return dom::MediaKeyMessageType::EndGuard_; + MOZ_CRASH("Unknown session message type"); } }; LOG("Notify 'keymessage' for %s", NS_ConvertUTF16toUTF8(*mSessionId).get()); diff --git a/dom/media/platforms/wmf/MFMediaEngineAudioStream.cpp b/dom/media/platforms/wmf/MFMediaEngineAudioStream.cpp index 4acf26e041..969f817882 100644 --- a/dom/media/platforms/wmf/MFMediaEngineAudioStream.cpp +++ b/dom/media/platforms/wmf/MFMediaEngineAudioStream.cpp @@ -93,7 +93,7 @@ HRESULT MFMediaEngineAudioStream::CreateMediaType(const TrackInfo& aInfo, bool MFMediaEngineAudioStream::HasEnoughRawData() const { // If more than this much raw audio is queued, we'll hold off request more // audio. - return mRawDataQueueForFeedingEngine.Duration() >= + return mRawDataQueueForFeedingEngine.PreciseDuration() >= StaticPrefs::media_wmf_media_engine_raw_data_threshold_audio(); } diff --git a/dom/media/platforms/wmf/MFMediaEngineDecoderModule.cpp b/dom/media/platforms/wmf/MFMediaEngineDecoderModule.cpp index 5b99fb0f2c..e291ab6a54 100644 --- a/dom/media/platforms/wmf/MFMediaEngineDecoderModule.cpp +++ b/dom/media/platforms/wmf/MFMediaEngineDecoderModule.cpp @@ -6,6 +6,7 @@ #include "MFTDecoder.h" #include "VideoUtils.h" +#include "mozilla/gfx/gfxVars.h" #include "mozilla/MFMediaEngineParent.h" #include "mozilla/MFMediaEngineUtils.h" #include "mozilla/RemoteDecoderManagerChild.h" @@ -99,6 +100,11 @@ media::DecodeSupportSet MFMediaEngineDecoderModule::SupportInternal( if (!StaticPrefs::media_wmf_media_engine_enabled()) { return media::DecodeSupportSet{}; } + // Only support hardware decoding. + if (!gfx::gfxVars::CanUseHardwareVideoDecoding() && + !StaticPrefs::media_wmf_media_engine_bypass_gfx_blocklist()) { + return media::DecodeSupportSet{}; + } bool supports = false; WMFStreamType type = GetStreamTypeFromMimeType(aParams.MimeType()); if (type != WMFStreamType::Unknown) { @@ -107,13 +113,11 @@ media::DecodeSupportSet MFMediaEngineDecoderModule::SupportInternal( MOZ_LOG(sPDMLog, LogLevel::Debug, ("MFMediaEngine decoder %s requested type '%s'", supports ? "supports" : "rejects", aParams.MimeType().get())); - // We only support HEVC hardware decoding. - if (supports && type == WMFStreamType::HEVC) { - return media::DecodeSupport::HardwareDecode; + if (!supports) { + return media::DecodeSupportSet{}; } - // TODO : find a way to report accurate result. - return supports ? 
media::DecodeSupport::SoftwareDecode - : media::DecodeSupportSet{}; + return StreamTypeIsVideo(type) ? media::DecodeSupport::HardwareDecode + : media::DecodeSupport::SoftwareDecode; } static bool CreateMFTDecoderOnMTA(const WMFStreamType& aType) { diff --git a/dom/media/platforms/wmf/MFMediaEngineDecoderModule.h b/dom/media/platforms/wmf/MFMediaEngineDecoderModule.h index c23b9010cc..1c8de5a161 100644 --- a/dom/media/platforms/wmf/MFMediaEngineDecoderModule.h +++ b/dom/media/platforms/wmf/MFMediaEngineDecoderModule.h @@ -10,6 +10,8 @@ namespace mozilla { +// MFMediaEngineDecoderModule is used for the media engine playback, which only +// supports hardware decoding. class MFMediaEngineDecoderModule final : public PlatformDecoderModule { public: static void Init(); diff --git a/dom/media/platforms/wmf/MFMediaEngineStream.cpp b/dom/media/platforms/wmf/MFMediaEngineStream.cpp index 6dce37ee35..70ffa50142 100644 --- a/dom/media/platforms/wmf/MFMediaEngineStream.cpp +++ b/dom/media/platforms/wmf/MFMediaEngineStream.cpp @@ -107,7 +107,11 @@ MFMediaEngineStreamWrapper::NeedsConversion() const { } MFMediaEngineStream::MFMediaEngineStream() - : mIsShutdown(false), mIsSelected(false), mReceivedEOS(false) { + : mIsShutdown(false), + mIsSelected(false), + mRawDataQueueForFeedingEngine(true /* aEnablePreciseDuration */), + mRawDataQueueForGeneratingOutput(true /* aEnablePreciseDuration */), + mReceivedEOS(false) { MOZ_COUNT_CTOR(MFMediaEngineStream); } @@ -282,17 +286,8 @@ void MFMediaEngineStream::ReplySampleRequestIfPossible() { while (!mSampleRequestTokens.empty()) { mSampleRequestTokens.pop(); } - - SLOG("Notify end events"); - MOZ_ASSERT(mRawDataQueueForFeedingEngine.GetSize() == 0); MOZ_ASSERT(mSampleRequestTokens.empty()); - RETURN_VOID_IF_FAILED(mMediaEventQueue->QueueEventParamUnk( - MEEndOfStream, GUID_NULL, S_OK, nullptr)); - mEndedEvent.Notify(TrackType()); - PROFILER_MARKER_TEXT( - "MFMediaEngineStream:NotifyEnd", MEDIA_PLAYBACK, {}, - nsPrintfCString("stream=%s, id=%" PRIu64, GetDescriptionName().get(), - mStreamId)); + NotifyEndEvent(); return; } @@ -318,6 +313,18 @@ void MFMediaEngineStream::ReplySampleRequestIfPossible() { MEMediaSample, GUID_NULL, S_OK, inputSample.Get())); } +void MFMediaEngineStream::NotifyEndEvent() { + AssertOnTaskQueue(); + SLOG("Notify end event"); + MOZ_ASSERT(mRawDataQueueForFeedingEngine.GetSize() == 0); + RETURN_VOID_IF_FAILED(mMediaEventQueue->QueueEventParamUnk( + MEEndOfStream, GUID_NULL, S_OK, nullptr)); + mEndedEvent.Notify(TrackType()); + PROFILER_MARKER_TEXT("MFMediaEngineStream:NotifyEnd", MEDIA_PLAYBACK, {}, + nsPrintfCString("stream=%s, id=%" PRIu64, + GetDescriptionName().get(), mStreamId)); +} + bool MFMediaEngineStream::ShouldServeSamples() const { AssertOnTaskQueue(); return mParentSource && @@ -486,7 +493,7 @@ void MFMediaEngineStream::NotifyNewData(MediaRawData* aSample) { "], queue size=%zu, queue duration=%" PRId64, aSample->mTime.ToMicroseconds(), aSample->GetEndTime().ToMicroseconds(), mRawDataQueueForFeedingEngine.GetSize(), - mRawDataQueueForFeedingEngine.Duration()); + mRawDataQueueForFeedingEngine.PreciseDuration()); if (mReceivedEOS) { SLOG("Receive a new data, cancel old EOS flag"); mReceivedEOS = false; @@ -501,7 +508,7 @@ void MFMediaEngineStream::SendRequestSampleEvent(bool aIsEnough) { AssertOnTaskQueue(); SLOGV("data is %s, queue duration=%" PRId64, aIsEnough ? 
"enough" : "not enough", - mRawDataQueueForFeedingEngine.Duration()); + mRawDataQueueForFeedingEngine.PreciseDuration()); mParentSource->mRequestSampleEvent.Notify( SampleRequest{TrackType(), aIsEnough}); } diff --git a/dom/media/platforms/wmf/MFMediaEngineStream.h b/dom/media/platforms/wmf/MFMediaEngineStream.h index aa3bf7e65d..e11d900498 100644 --- a/dom/media/platforms/wmf/MFMediaEngineStream.h +++ b/dom/media/platforms/wmf/MFMediaEngineStream.h @@ -84,7 +84,7 @@ class MFMediaEngineStream // Return the type of the track, the result should be either audio or video. virtual TrackInfo::TrackType TrackType() = 0; - RefPtr Flush(); + virtual RefPtr Flush(); MediaEventProducer& EndedEvent() { return mEndedEvent; } @@ -93,7 +93,7 @@ class MFMediaEngineStream virtual MFMediaEngineVideoStream* AsVideoStream() { return nullptr; } - RefPtr OutputData( + virtual RefPtr OutputData( RefPtr aSample); virtual RefPtr Drain(); @@ -133,11 +133,13 @@ class MFMediaEngineStream // should uses `mRawDataQueueForGeneratingOutput` to generate output. virtual already_AddRefed OutputDataInternal() = 0; - void SendRequestSampleEvent(bool aIsEnough); + virtual void SendRequestSampleEvent(bool aIsEnough); HRESULT AddEncryptAttributes(IMFSample* aSample, const CryptoSample& aCryptoConfig); + void NotifyEndEvent(); + void AssertOnTaskQueue() const; void AssertOnMFThreadPool() const; diff --git a/dom/media/platforms/wmf/MFMediaEngineVideoStream.cpp b/dom/media/platforms/wmf/MFMediaEngineVideoStream.cpp index ca043478f0..0fedcd31b9 100644 --- a/dom/media/platforms/wmf/MFMediaEngineVideoStream.cpp +++ b/dom/media/platforms/wmf/MFMediaEngineVideoStream.cpp @@ -49,7 +49,7 @@ void MFMediaEngineVideoStream::SetKnowsCompositor( this]() { mKnowsCompositor = knowCompositor; LOG("Set SetKnowsCompositor=%p", mKnowsCompositor.get()); - ResolvePendingDrainPromiseIfNeeded(); + ResolvePendingPromisesIfNeeded(); })); } @@ -74,7 +74,7 @@ void MFMediaEngineVideoStream::SetDCompSurfaceHandle(HANDLE aDCompSurfaceHandle, } } LOG("Set DCompSurfaceHandle, handle=%p", mDCompSurfaceHandle); - ResolvePendingDrainPromiseIfNeeded(); + ResolvePendingPromisesIfNeeded(); })); } @@ -209,7 +209,7 @@ HRESULT MFMediaEngineVideoStream::CreateMediaType(const TrackInfo& aInfo, bool MFMediaEngineVideoStream::HasEnoughRawData() const { // If more than this much raw video is queued, we'll hold off request more // video. - return mRawDataQueueForFeedingEngine.Duration() >= + return mRawDataQueueForFeedingEngine.PreciseDuration() >= StaticPrefs::media_wmf_media_engine_raw_data_threshold_video(); } @@ -240,6 +240,32 @@ bool MFMediaEngineVideoStream::IsDCompImageReady() { return true; } +RefPtr MFMediaEngineVideoStream::OutputData( + RefPtr aSample) { + if (IsShutdown()) { + return MediaDataDecoder::DecodePromise::CreateAndReject( + MediaResult(NS_ERROR_FAILURE, + RESULT_DETAIL("MFMediaEngineStream is shutdown")), + __func__); + } + AssertOnTaskQueue(); + NotifyNewData(aSample); + MediaDataDecoder::DecodedData outputs; + if (RefPtr outputData = OutputDataInternal()) { + outputs.AppendElement(outputData); + LOGV("Output data [%" PRId64 ",%" PRId64 "]", + outputData->mTime.ToMicroseconds(), + outputData->GetEndTime().ToMicroseconds()); + } + if (ShouldDelayVideoDecodeBeforeDcompReady()) { + LOG("Dcomp isn't ready and we already have enough video data. 
We will send " + "them back together at one when Dcomp is ready"); + return mVideoDecodeBeforeDcompPromise.Ensure(__func__); + } + return MediaDataDecoder::DecodePromise::CreateAndResolve(std::move(outputs), + __func__); +} + already_AddRefed MFMediaEngineVideoStream::OutputDataInternal() { AssertOnTaskQueue(); if (mRawDataQueueForGeneratingOutput.GetSize() == 0 || !IsDCompImageReady()) { @@ -261,28 +287,62 @@ RefPtr MFMediaEngineVideoStream::Drain() { MediaDataDecoder::DecodedData outputs; if (!IsDCompImageReady()) { LOGV("Waiting for dcomp image for draining"); + // A workaround for a special case where we have sent all input data to the + // media engine, and waiting for an output. Sometime media engine would + // never return the first frame to us, unless we notify it the end event, + // which happens on the case where the video only contains one frame. If we + // don't send end event to the media engine, the drain promise would be + // pending forever. + if (!mSampleRequestTokens.empty() && + mRawDataQueueForFeedingEngine.GetSize() == 0) { + NotifyEndEvent(); + } return mPendingDrainPromise.Ensure(__func__); } return MFMediaEngineStream::Drain(); } -void MFMediaEngineVideoStream::ResolvePendingDrainPromiseIfNeeded() { +RefPtr MFMediaEngineVideoStream::Flush() { + AssertOnTaskQueue(); + auto promise = MFMediaEngineStream::Flush(); + mPendingDrainPromise.RejectIfExists(NS_ERROR_DOM_MEDIA_CANCELED, __func__); + mVideoDecodeBeforeDcompPromise.RejectIfExists(NS_ERROR_DOM_MEDIA_CANCELED, + __func__); + return promise; +} + +void MFMediaEngineVideoStream::ResolvePendingPromisesIfNeeded() { AssertOnTaskQueue(); - if (mPendingDrainPromise.IsEmpty()) { - return; - } if (!IsDCompImageReady()) { return; } - MediaDataDecoder::DecodedData outputs; - while (RefPtr outputData = OutputDataInternal()) { - outputs.AppendElement(outputData); - LOGV("Output data [%" PRId64 ",%" PRId64 "]", - outputData->mTime.ToMicroseconds(), - outputData->GetEndTime().ToMicroseconds()); + + // Resolve decoding promise first, then drain promise + if (!mVideoDecodeBeforeDcompPromise.IsEmpty()) { + MediaDataDecoder::DecodedData outputs; + while (RefPtr outputData = OutputDataInternal()) { + outputs.AppendElement(outputData); + LOGV("Output data [%" PRId64 ",%" PRId64 "]", + outputData->mTime.ToMicroseconds(), + outputData->GetEndTime().ToMicroseconds()); + } + mVideoDecodeBeforeDcompPromise.Resolve(std::move(outputs), __func__); + LOG("Resolved video decode before Dcomp promise"); + } + + // This drain promise could return no data, if all data has been processed in + // the decoding promise. 
+ if (!mPendingDrainPromise.IsEmpty()) { + MediaDataDecoder::DecodedData outputs; + while (RefPtr outputData = OutputDataInternal()) { + outputs.AppendElement(outputData); + LOGV("Output data [%" PRId64 ",%" PRId64 "]", + outputData->mTime.ToMicroseconds(), + outputData->GetEndTime().ToMicroseconds()); + } + mPendingDrainPromise.Resolve(std::move(outputs), __func__); + LOG("Resolved pending drain promise"); } - mPendingDrainPromise.Resolve(std::move(outputs), __func__); - LOG("Resolved pending drain promise"); } MediaDataDecoder::ConversionRequired MFMediaEngineVideoStream::NeedsConversion() @@ -336,6 +396,20 @@ void MFMediaEngineVideoStream::UpdateConfig(const VideoInfo& aInfo) { void MFMediaEngineVideoStream::ShutdownCleanUpOnTaskQueue() { AssertOnTaskQueue(); mPendingDrainPromise.RejectIfExists(NS_ERROR_DOM_MEDIA_CANCELED, __func__); + mVideoDecodeBeforeDcompPromise.RejectIfExists(NS_ERROR_DOM_MEDIA_CANCELED, + __func__); +} + +void MFMediaEngineVideoStream::SendRequestSampleEvent(bool aIsEnough) { + AssertOnTaskQueue(); + MFMediaEngineStream::SendRequestSampleEvent(aIsEnough); + // We need more data to be sent in, we should resolve the promise to allow + // more input data to be sent. + if (!aIsEnough && !mVideoDecodeBeforeDcompPromise.IsEmpty()) { + LOG("Resolved pending input promise to allow more input be sent in"); + mVideoDecodeBeforeDcompPromise.Resolve(MediaDataDecoder::DecodedData{}, + __func__); + } } bool MFMediaEngineVideoStream::IsEnded() const { @@ -352,6 +426,10 @@ bool MFMediaEngineVideoStream::IsEnded() const { bool MFMediaEngineVideoStream::IsEncrypted() const { return mIsEncrypted; } +bool MFMediaEngineVideoStream::ShouldDelayVideoDecodeBeforeDcompReady() { + return HasEnoughRawData() && !IsDCompImageReady(); +} + nsCString MFMediaEngineVideoStream::GetCodecName() const { switch (mStreamType) { case WMFStreamType::H264: diff --git a/dom/media/platforms/wmf/MFMediaEngineVideoStream.h b/dom/media/platforms/wmf/MFMediaEngineVideoStream.h index df17c264e4..51fbe4876b 100644 --- a/dom/media/platforms/wmf/MFMediaEngineVideoStream.h +++ b/dom/media/platforms/wmf/MFMediaEngineVideoStream.h @@ -19,6 +19,7 @@ class DcompSurfaceImage; } // namespace layers class MFMediaSource; +class MediaRawData; class MFMediaEngineVideoStream final : public MFMediaEngineStream { public: @@ -50,8 +51,13 @@ class MFMediaEngineVideoStream final : public MFMediaEngineStream { // change happens during playback. void SetConfig(const TrackInfo& aConfig); + RefPtr OutputData( + RefPtr aSample) override; + RefPtr Drain() override; + RefPtr Flush() override; + bool IsEncrypted() const override; private: @@ -66,12 +72,25 @@ class MFMediaEngineVideoStream final : public MFMediaEngineStream { bool IsDCompImageReady(); - void ResolvePendingDrainPromiseIfNeeded(); + // Those promises are used to handle decode/drain which happens before the + // Dcomp surface is ready. + void ResolvePendingPromisesIfNeeded(); void ShutdownCleanUpOnTaskQueue() override; bool IsEnded() const override; + // Before Dcomp surface is ready, we can't return any video data due to + // lacking of the image, which should only happen on the beginning of the + // video playback. In that situation, once we have enough video raw data, we + // can stop delaying the decode promise by waiting the Dcomp surface and + // resolveing the promise when Dcomp surface is ready. Doing so helps to keep + // the decode promise pending, so that the MFR won't keep sending more input + // data, which we actually don't need that many. 
+ bool ShouldDelayVideoDecodeBeforeDcompReady(); + + void SendRequestSampleEvent(bool aIsEnough) override; + // Task queue only members. HANDLE mDCompSurfaceHandle; bool mNeedRecreateImage; @@ -98,6 +117,12 @@ class MFMediaEngineVideoStream final : public MFMediaEngineStream { // have dcomp image. MozPromiseHolder mPendingDrainPromise; + // The promise used to return all video output which are requested before the + // Dcomp surface is ready. This should only be used once in entire playback, + // typically happening around the beginning of the playback. + MozPromiseHolder + mVideoDecodeBeforeDcompPromise; + // Set when `CreateMediaType()` is called. bool mIsEncrypted = false; }; diff --git a/dom/media/platforms/wmf/MFMediaSource.h b/dom/media/platforms/wmf/MFMediaSource.h index 735d53579e..0e44ef12aa 100644 --- a/dom/media/platforms/wmf/MFMediaSource.h +++ b/dom/media/platforms/wmf/MFMediaSource.h @@ -132,8 +132,6 @@ class MFMediaSource : public Microsoft::WRL::RuntimeClass< void AssertOnManagerThread() const; void AssertOnMFThreadPool() const; - void NotifyEndOfStreamInternal(TrackInfo::TrackType aType); - bool IsSeekable() const; // A thread-safe event queue. diff --git a/dom/media/platforms/wmf/WMFAudioMFTManager.cpp b/dom/media/platforms/wmf/WMFAudioMFTManager.cpp index 6ebcf9a80a..63db5efae8 100644 --- a/dom/media/platforms/wmf/WMFAudioMFTManager.cpp +++ b/dom/media/platforms/wmf/WMFAudioMFTManager.cpp @@ -55,6 +55,9 @@ WMFAudioMFTManager::WMFAudioMFTManager(const AudioInfo& aConfig) audioSpecConfig = audioCodecSpecificBinaryBlob->Elements(); configLength = audioCodecSpecificBinaryBlob->Length(); } + // If no extradata has been provided, assume this is ADTS. Otherwise, + // assume raw AAC packets. + mIsADTS = !configLength; AACAudioSpecificConfigToUserData(aConfig.mExtendedProfile, audioSpecConfig, configLength, mUserData); } @@ -104,7 +107,8 @@ bool WMFAudioMFTManager::Init() { NS_ENSURE_TRUE(SUCCEEDED(hr), false); if (mStreamType == WMFStreamType::AAC) { - hr = inputType->SetUINT32(MF_MT_AAC_PAYLOAD_TYPE, 0x0); // Raw AAC packet + UINT32 payloadType = mIsADTS ? 1 : 0; + hr = inputType->SetUINT32(MF_MT_AAC_PAYLOAD_TYPE, payloadType); NS_ENSURE_TRUE(SUCCEEDED(hr), false); hr = inputType->SetBlob(MF_MT_USER_DATA, mUserData.Elements(), @@ -144,7 +148,8 @@ WMFAudioMFTManager::Input(MediaRawData* aSample) { nsCString WMFAudioMFTManager::GetCodecName() const { if (mStreamType == WMFStreamType::AAC) { return "aac"_ns; - } else if (mStreamType == WMFStreamType::MP3) { + } + if (mStreamType == WMFStreamType::MP3) { return "mp3"_ns; } return "unknown"_ns; @@ -177,8 +182,8 @@ WMFAudioMFTManager::UpdateOutputType() { } HRESULT -WMFAudioMFTManager::Output(int64_t aStreamOffset, RefPtr& aOutData) { - aOutData = nullptr; +WMFAudioMFTManager::Output(int64_t aStreamOffset, RefPtr& aOutput) { + aOutput = nullptr; RefPtr sample; HRESULT hr; int typeChangeCount = 0; @@ -242,8 +247,8 @@ WMFAudioMFTManager::Output(int64_t aStreamOffset, RefPtr& aOutData) { NS_ENSURE_TRUE(SUCCEEDED(hr), hr); // Output is made of floats. 
- int32_t numSamples = currentLength / sizeof(float); - int32_t numFrames = numSamples / mAudioChannels; + uint32_t numSamples = currentLength / sizeof(float); + uint32_t numFrames = numSamples / mAudioChannels; MOZ_ASSERT(numFrames >= 0); MOZ_ASSERT(numSamples >= 0); if (numFrames == 0) { @@ -275,10 +280,10 @@ WMFAudioMFTManager::Output(int64_t aStreamOffset, RefPtr& aOutData) { return MF_E_TRANSFORM_NEED_MORE_INPUT; } - aOutData = new AudioData(aStreamOffset, pts, std::move(audioData), - mAudioChannels, mAudioRate, mChannelsMap); - MOZ_DIAGNOSTIC_ASSERT(duration == aOutData->mDuration, "must be equal"); - mLastOutputDuration = aOutData->mDuration; + aOutput = new AudioData(aStreamOffset, pts, std::move(audioData), + mAudioChannels, mAudioRate, mChannelsMap); + MOZ_DIAGNOSTIC_ASSERT(duration == aOutput->mDuration, "must be equal"); + mLastOutputDuration = aOutput->mDuration; #ifdef LOG_SAMPLE_DECODE LOG("Decoded audio sample! timestamp=%lld duration=%lld currentLength=%u", diff --git a/dom/media/platforms/wmf/WMFAudioMFTManager.h b/dom/media/platforms/wmf/WMFAudioMFTManager.h index b5dc379396..f772593545 100644 --- a/dom/media/platforms/wmf/WMFAudioMFTManager.h +++ b/dom/media/platforms/wmf/WMFAudioMFTManager.h @@ -58,6 +58,7 @@ class WMFAudioMFTManager : public MFTManager { media::TimeUnit mLastOutputDuration = media::TimeUnit::Zero(); bool mFirstFrame = true; + bool mIsADTS = false; uint64_t mTotalMediaFrames = 0; uint32_t mEncoderDelay = 0; diff --git a/dom/media/platforms/wmf/WMFMediaDataEncoder.h b/dom/media/platforms/wmf/WMFMediaDataEncoder.h index 13848b47ad..31a63c8347 100644 --- a/dom/media/platforms/wmf/WMFMediaDataEncoder.h +++ b/dom/media/platforms/wmf/WMFMediaDataEncoder.h @@ -202,7 +202,9 @@ class WMFMediaDataEncoder final : public MediaDataEncoder { MOZ_ASSERT(mEncoder); const layers::PlanarYCbCrImage* image = aData->mImage->AsPlanarYCbCrImage(); - MOZ_ASSERT(image); + // TODO: Take care non planar Y-Cb-Cr image (Bug 1881647). + NS_ENSURE_TRUE(image, nullptr); + const layers::PlanarYCbCrData* yuv = image->GetData(); auto ySize = yuv->YDataSize(); auto cbcrSize = yuv->CbCrDataSize(); @@ -223,6 +225,7 @@ class WMFMediaDataEncoder final : public MediaDataEncoder { LockBuffer lockBuffer(buffer); NS_ENSURE_TRUE(SUCCEEDED(lockBuffer.Result()), nullptr); + // TODO: Take care non I420 image (Bug 1881647). bool ok = libyuv::I420ToNV12( yuv->mYChannel, yuv->mYStride, yuv->mCbChannel, yuv->mCbCrStride, yuv->mCrChannel, yuv->mCbCrStride, diff --git a/dom/media/platforms/wmf/WMFUtils.cpp b/dom/media/platforms/wmf/WMFUtils.cpp index d096979919..dda9df808e 100644 --- a/dom/media/platforms/wmf/WMFUtils.cpp +++ b/dom/media/platforms/wmf/WMFUtils.cpp @@ -177,7 +177,8 @@ Maybe GetYUVColorSpace(IMFMediaType* aType) { } int32_t MFOffsetToInt32(const MFOffset& aOffset) { - return int32_t(aOffset.value + (aOffset.fract / 65536.0f)); + return AssertedCast(AssertedCast(aOffset.value) + + (AssertedCast(aOffset.fract) / 65536.0f)); } TimeUnit GetSampleDuration(IMFSample* aSample) { @@ -204,7 +205,7 @@ GetPictureRegion(IMFMediaType* aMediaType, gfx::IntRect& aOutPictureRegion) { // Determine if "pan and scan" is enabled for this media. If it is, we // only display a region of the video frame, not the entire frame. BOOL panScan = - MFGetAttributeUINT32(aMediaType, MF_MT_PAN_SCAN_ENABLED, FALSE); + !!MFGetAttributeUINT32(aMediaType, MF_MT_PAN_SCAN_ENABLED, FALSE); // If pan and scan mode is enabled. Try to get the display region. 
HRESULT hr = E_FAIL; @@ -300,11 +301,14 @@ const char* MFTMessageTypeToStr(MFT_MESSAGE_TYPE aMsg) { GUID AudioMimeTypeToMediaFoundationSubtype(const nsACString& aMimeType) { if (aMimeType.EqualsLiteral("audio/mpeg")) { return MFAudioFormat_MP3; - } else if (MP4Decoder::IsAAC(aMimeType)) { + } + if (MP4Decoder::IsAAC(aMimeType)) { return MFAudioFormat_AAC; - } else if (aMimeType.EqualsLiteral("audio/vorbis")) { + } + if (aMimeType.EqualsLiteral("audio/vorbis")) { return MFAudioFormat_Vorbis; - } else if (aMimeType.EqualsLiteral("audio/opus")) { + } + if (aMimeType.EqualsLiteral("audio/opus")) { return MFAudioFormat_Opus; } NS_WARNING("Unsupport audio mimetype"); @@ -314,17 +318,19 @@ GUID AudioMimeTypeToMediaFoundationSubtype(const nsACString& aMimeType) { GUID VideoMimeTypeToMediaFoundationSubtype(const nsACString& aMimeType) { if (MP4Decoder::IsH264(aMimeType)) { return MFVideoFormat_H264; - } else if (VPXDecoder::IsVP8(aMimeType)) { + } + if (VPXDecoder::IsVP8(aMimeType)) { return MFVideoFormat_VP80; - } else if (VPXDecoder::IsVP9(aMimeType)) { + } + if (VPXDecoder::IsVP9(aMimeType)) { return MFVideoFormat_VP90; } #ifdef MOZ_AV1 - else if (AOMDecoder::IsAV1(aMimeType)) { + if (AOMDecoder::IsAV1(aMimeType)) { return MFVideoFormat_AV1; } #endif - else if (MP4Decoder::IsHEVC(aMimeType)) { + if (MP4Decoder::IsHEVC(aMimeType)) { return MFVideoFormat_HEVC; } NS_WARNING("Unsupport video mimetype"); @@ -368,7 +374,9 @@ void AACAudioSpecificConfigToUserData(uint8_t aAACProfileLevelIndication, // the rest can be all 0x00. BYTE heeInfo[heeInfoLen] = {0}; WORD* w = (WORD*)heeInfo; - w[0] = 0x0; // Payload type raw AAC packet + // If extradata has been provided, assume raw AAC packets (0). Otherwise, + // assume ADTS (1) + w[0] = aConfigLength ? 0 : 1; w[1] = aAACProfileLevelIndication; aOutUserData.AppendElements(heeInfo, heeInfoLen); @@ -377,10 +385,10 @@ void AACAudioSpecificConfigToUserData(uint8_t aAACProfileLevelIndication, // The AudioSpecificConfig is TTTTTFFF|FCCCCGGG // (T=ObjectType, F=Frequency, C=Channel, G=GASpecificConfig) // If frequency = 0xf, then the frequency is explicitly defined on 24 bits. - int8_t frequency = + uint8_t frequency = (aAudioSpecConfig[0] & 0x7) << 1 | (aAudioSpecConfig[1] & 0x80) >> 7; - int8_t channels = (aAudioSpecConfig[1] & 0x78) >> 3; - int8_t gasc = aAudioSpecConfig[1] & 0x7; + uint8_t channels = (aAudioSpecConfig[1] & 0x78) >> 3; + uint8_t gasc = aAudioSpecConfig[1] & 0x7; if (frequency != 0xf && channels && !gasc) { // We enter this condition if the AudioSpecificConfig should theorically // be 2 bytes long but it's not. -- cgit v1.2.3