1 files changed, 364 insertions, 0 deletions
diff --git a/dom/media/mp4/MoofParser.h b/dom/media/mp4/MoofParser.h
new file mode 100644
index 0000000000..9099df7d14
--- /dev/null
+++ b/dom/media/mp4/MoofParser.h
@@ -0,0 +1,364 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOOF_PARSER_H_
+#define MOOF_PARSER_H_
+
+#include "mozilla/ResultExtensions.h"
+#include "mozilla/Variant.h"
+#include "Atom.h"
+#include "AtomType.h"
+#include "SinfParser.h"
+#include "ByteStream.h"
+#include "MP4Interval.h"
+#include "MediaResource.h"
+
+namespace mozilla {
+
+typedef int64_t Microseconds;  // signed 64-bit alias used for timestamps below
+
+class Box;         // only declared here; taken by reference in this header
+class BoxContext;  // only declared here; taken by reference in this header
+class BoxReader;   // only declared here; taken by reference in this header
+class Moof;        // defined further down in this header
+
+// Used to track the CTS end time of the last sample of a track
+// in the preceding Moof, so that we can smooth tracks' timestamps
+// across Moofs.
+struct TrackEndCts {
+  TrackEndCts(uint32_t aTrackId, Microseconds aCtsEndTime)
+      : mTrackId(aTrackId), mCtsEndTime(aCtsEndTime) {}
+  uint32_t mTrackId;         // id of the track this record belongs to
+  Microseconds mCtsEndTime;  // CTS end time of that track's last sample
+};
+
+class Mvhd : public Atom {  // header fields plus a timescale -> us converter
+ public:
+  Mvhd()  // default state: every field zero
+      : mCreationTime(0), mModificationTime(0), mTimescale(0), mDuration(0) {}
+  explicit Mvhd(Box& aBox);  // body is not in this header
+
+  Result<Microseconds, nsresult> ToMicroseconds(int64_t aTimescaleUnits) {
+    if (!mTimescale) {  // a zero timescale would divide by zero below
+      NS_WARNING("invalid mTimescale");
+      return Err(NS_ERROR_FAILURE);
+    }
+    int64_t major = aTimescaleUnits / mTimescale;      // whole seconds
+    int64_t remainder = aTimescaleUnits % mTimescale;  // leftover units (< 1s)
+    return major * 1000000ll + remainder * 1000000ll / mTimescale;
+  }  // NOTE(review): the int64 products above are not overflow-checked
+
+  uint64_t mCreationTime;
+  uint64_t mModificationTime;
+  uint32_t mTimescale;  // divisor in ToMicroseconds(); 0 is rejected there
+  uint64_t mDuration;
+
+ protected:
+  Result<Ok, nsresult> Parse(Box& aBox);  // body is not in this header
+};
+
+class Tkhd : public Mvhd {  // Mvhd fields plus a track id
+ public:
+  Tkhd() : mTrackId(0) {}    // track id starts at 0
+  explicit Tkhd(Box& aBox);  // body is not in this header
+
+  uint32_t mTrackId;
+
+ protected:
+  Result<Ok, nsresult> Parse(Box& aBox);  // body is not in this header
+};
+
+class Mdhd : public Mvhd {  // same data as Mvhd; declares no extra members
+ public:
+  Mdhd() = default;          // relies on Mvhd's zero-initialising constructor
+  explicit Mdhd(Box& aBox);  // body is not in this header
+};
+
+class Trex : public Atom {  // per-track defaults; also the base of Tfhd
+ public:
+  explicit Trex(uint32_t aTrackId)  // keeps aTrackId, zeroes all other fields
+      : mFlags(0),
+        mTrackId(aTrackId),
+        mDefaultSampleDescriptionIndex(0),
+        mDefaultSampleDuration(0),
+        mDefaultSampleSize(0),
+        mDefaultSampleFlags(0) {}
+
+  explicit Trex(Box& aBox);  // body is not in this header
+
+  uint32_t mFlags;
+  uint32_t mTrackId;  // track these defaults belong to
+  uint32_t mDefaultSampleDescriptionIndex;
+  uint32_t mDefaultSampleDuration;
+  uint32_t mDefaultSampleSize;
+  uint32_t mDefaultSampleFlags;
+
+ protected:
+  Result<Ok, nsresult> Parse(Box& aBox);  // body is not in this header
+};
+
+class Tfhd : public Trex {  // Trex fields plus a base data offset
+ public:
+  explicit Tfhd(Trex& aTrex) : Trex(aTrex), mBaseDataOffset(0) {  // copy aTrex
+    mValid = aTrex.IsValid();  // take over the validity reported by aTrex
+  }
+  Tfhd(Box& aBox, Trex& aTrex);  // body is not in this header
+
+  uint64_t mBaseDataOffset;  // 0 when built from a Trex alone
+
+ protected:
+  Result<Ok, nsresult> Parse(Box& aBox);  // body is not in this header
+};
+
+class Tfdt : public Atom {  // holds a single base media decode time
+ public:
+  Tfdt() : mBaseMediaDecodeTime(0) {}  // decode time starts at 0
+  explicit Tfdt(Box& aBox);            // body is not in this header
+
+  uint64_t mBaseMediaDecodeTime;
+
+ protected:
+  Result<Ok, nsresult> Parse(Box& aBox);  // body is not in this header
+};
+
+class Edts : public Atom {  // media start and empty offset of a track
+ public:
+  Edts() : mMediaStart(0), mEmptyOffset(0) {}  // both values start at 0
+  explicit Edts(Box& aBox);                    // body is not in this header
+  virtual bool IsValid() override {  // unconditionally true, reason below
+    // edts is optional
+    return true;
+  }
+
+  int64_t mMediaStart;
+  int64_t mEmptyOffset;
+
+ protected:
+  Result<Ok, nsresult> Parse(Box& aBox);  // body is not in this header
+};
+
+struct Sample {  // element type of Moof::mIndex
+  mozilla::MediaByteRange mByteRange;
+  mozilla::MediaByteRange mCencRange;  // presumably cenc aux data; confirm
+  Microseconds mDecodeTime;            // microseconds, per the type alias
+  MP4Interval<Microseconds> mCompositionRange;  // interval in microseconds
+  bool mSync;  // no default here; whoever fills the sample must set it
+};
+
+class Saiz final : public Atom {  // aux info type plus per-entry byte sizes
+ public:
+  Saiz(Box& aBox, AtomType aDefaultType);  // body is not in this header
+
+  AtomType mAuxInfoType;
+  uint32_t mAuxInfoTypeParameter;
+  FallibleTArray<uint8_t> mSampleInfoSize;  // each size fits in one byte
+
+ protected:
+  Result<Ok, nsresult> Parse(Box& aBox);  // body is not in this header
+};
+
+class Saio final : public Atom {  // aux info type plus a list of offsets
+ public:
+  Saio(Box& aBox, AtomType aDefaultType);  // body is not in this header
+
+  AtomType mAuxInfoType;
+  uint32_t mAuxInfoTypeParameter;
+  FallibleTArray<uint64_t> mOffsets;  // 64-bit offsets
+
+ protected:
+  Result<Ok, nsresult> Parse(Box& aBox);  // body is not in this header
+};
+
+struct SampleToGroupEntry {  // a sample count paired with a description index
+ public:
+  static const uint32_t kTrackGroupDescriptionIndexBase = 0;  // track base
+  static const uint32_t kFragmentGroupDescriptionIndexBase = 0x10000;
+
+  SampleToGroupEntry(uint32_t aSampleCount, uint32_t aGroupDescriptionIndex)
+      : mSampleCount(aSampleCount),
+        mGroupDescriptionIndex(aGroupDescriptionIndex) {}
+
+  uint32_t mSampleCount;
+  uint32_t mGroupDescriptionIndex;  // NOTE(review): likely offset by a base
+};
+
+class Sbgp final : public Atom  // SampleToGroup box.
+{
+ public:
+  explicit Sbgp(Box& aBox);  // body is not in this header
+
+  AtomType mGroupingType;
+  uint32_t mGroupingTypeParam;
+  FallibleTArray<SampleToGroupEntry> mEntries;  // entries held by this box
+
+ protected:
+  Result<Ok, nsresult> Parse(Box& aBox);  // body is not in this header
+};
+
+// Stores information from CencSampleEncryptionInformationGroupEntry (seig).
+// Cenc here refers to the common encryption standard, rather than the specific
+// cenc scheme from that standard. This structure is used for all encryption
+// schemes. I.e. it is used for both cenc and cbcs, not just cenc.
+struct CencSampleEncryptionInfoEntry final {
+ public:
+  CencSampleEncryptionInfoEntry() = default;  // members use defaults below
+
+  Result<Ok, nsresult> Init(BoxReader& aReader);  // body not in this header
+
+  bool mIsEncrypted = false;
+  uint8_t mIVSize = 0;
+  CopyableTArray<uint8_t> mKeyId;
+  uint8_t mCryptByteBlock = 0;
+  uint8_t mSkipByteBlock = 0;
+  CopyableTArray<uint8_t> mConsantIV;  // NOTE(review): sic, likely "Constant"
+};
+
+class Sgpd final : public Atom  // SampleGroupDescription box.
+{
+ public:
+  explicit Sgpd(Box& aBox);  // body is not in this header
+
+  AtomType mGroupingType;
+  FallibleTArray<CencSampleEncryptionInfoEntry> mEntries;  // seig entries
+
+ protected:
+  Result<Ok, nsresult> Parse(Box& aBox);  // body is not in this header
+};
+
+// Audio/video entries from the sample description box (stsd). We only need to
+// store if these are encrypted, so do not need a specialized class for
+// different audio and video data. Currently most of the parsing of these
+// entries is done by mp4parse-rust, but the moof parser needs to know which of
+// these are encrypted when parsing the track fragment header (tfhd).
+struct SampleDescriptionEntry {
+  bool mIsEncryptedEntry = false;  // entries count as clear until marked
+};
+
+// Used to indicate in variants if all tracks should be parsed.
+struct ParseAllTracks {};  // empty tag type; carries no data
+
+typedef Variant<ParseAllTracks, uint32_t> TrackParseMode;  // all, or one id
+
+class Moof final : public Atom {  // one movie fragment and its sample index
+ public:
+  Moof(Box& aBox, const TrackParseMode& aTrackParseMode, Trex& aTrex,
+       Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf,
+       uint64_t* aDecodeTime, bool aIsAudio,
+       nsTArray<TrackEndCts>& aTracksEndCts);  // body is not in this header
+  bool GetAuxInfo(AtomType aType, FallibleTArray<MediaByteRange>* aByteRanges);
+  void FixRounding(const Moof& aMoof);  // body is not in this header
+
+  mozilla::MediaByteRange mRange;
+  mozilla::MediaByteRange mMdatRange;
+  MP4Interval<Microseconds> mTimeRange;  // interval in microseconds
+  FallibleTArray<Sample> mIndex;         // samples found in this fragment
+
+  FallibleTArray<CencSampleEncryptionInfoEntry>
+      mFragmentSampleEncryptionInfoEntries;
+  FallibleTArray<SampleToGroupEntry> mFragmentSampleToGroupEntries;
+
+  Tfhd mTfhd;
+  FallibleTArray<Saiz> mSaizs;
+  FallibleTArray<Saio> mSaios;
+  nsTArray<nsTArray<uint8_t>> mPsshes;  // one byte array per entry
+
+ private:
+  // aDecodeTime is updated to the end of the parsed TRAF on return.
+  void ParseTraf(Box& aBox, const TrackParseMode& aTrackParseMode, Trex& aTrex,
+                 Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf,
+                 uint64_t* aDecodeTime, bool aIsAudio);
+  // aDecodeTime is updated to the end of the parsed TRUN on return.
+  Result<Ok, nsresult> ParseTrun(Box& aBox, Mvhd& aMvhd, Mdhd& aMdhd,
+                                 Edts& aEdts, uint64_t* aDecodeTime,
+                                 bool aIsAudio);
+  // Process the sample auxiliary information used by common encryption.
+  // aScheme is used to select the appropriate auxiliary information and should
+  // be set based on the encryption scheme used by the track being processed.
+  // Note, the term cenc here refers to the standard, not the specific scheme
+  // from that standard. I.e. this function is used to handle auxiliary
+  // information from the cenc and cbcs schemes.
+  bool ProcessCencAuxInfo(AtomType aScheme);
+  uint64_t mMaxRoundingError;  // NOTE(review): presumably for FixRounding()
+};
+
+DDLoggedTypeDeclName(MoofParser);
+
+class MoofParser : public DecoderDoctorLifeLogger<MoofParser> {
+ public:
+  MoofParser(ByteStream* aSource, const TrackParseMode& aTrackParseMode,
+             bool aIsAudio)
+      : mSource(aSource),
+        mOffset(0),
+        mTrex(aTrackParseMode.is<uint32_t>() ? aTrackParseMode.as<uint32_t>()
+                                             : 0),  // id 0 for ParseAllTracks
+        mIsAudio(aIsAudio),
+        mLastDecodeTime(0),
+        mTrackParseMode(aTrackParseMode) {
+    // Parsing multiple tracks (ParseAllTracks in aTrackParseMode) is a nasty
+    // work around for calculating the composition range for MSE. Ideally we'd
+    // store an array of tracks with different metadata for each. (This note
+    // used to refer to mIsMultitrackParser, which is not a member here.)
+    DDLINKCHILD("source", aSource);
+  }
+  bool RebuildFragmentedIndex(const mozilla::MediaByteRangeSet& aByteRanges);
+  // If *aCanEvict is set to true, then will remove all moofs already parsed
+  // from index then rebuild the index. *aCanEvict is set to true upon return if
+  // some moofs were removed.
+  bool RebuildFragmentedIndex(const mozilla::MediaByteRangeSet& aByteRanges,
+                              bool* aCanEvict);
+  bool RebuildFragmentedIndex(BoxContext& aContext);
+  MP4Interval<Microseconds> GetCompositionRange(
+      const mozilla::MediaByteRangeSet& aByteRanges);
+  bool ReachedEnd();
+  void ParseMoov(Box& aBox);
+  void ParseTrak(Box& aBox);
+  void ParseMdia(Box& aBox);
+  void ParseMvex(Box& aBox);
+
+  void ParseMinf(Box& aBox);
+  void ParseStbl(Box& aBox);
+  void ParseStsd(Box& aBox);
+  void ParseEncrypted(Box& aBox);
+
+  bool BlockingReadNextMoof();
+
+  already_AddRefed<mozilla::MediaByteBuffer> Metadata();
+  MediaByteRange FirstCompleteMediaSegment();
+  MediaByteRange FirstCompleteMediaHeader();
+
+  mozilla::MediaByteRange mInitRange;
+  RefPtr<ByteStream> mSource;  // holds a reference to the constructor's stream
+  uint64_t mOffset;            // starts at 0
+  Mvhd mMvhd;
+  Mdhd mMdhd;
+  Trex mTrex;  // seeded with the requested track id, or 0 (see constructor)
+  Tfdt mTfdt;
+  Edts mEdts;
+  Sinf mSinf;
+
+  FallibleTArray<CencSampleEncryptionInfoEntry>
+      mTrackSampleEncryptionInfoEntries;
+  FallibleTArray<SampleToGroupEntry> mTrackSampleToGroupEntries;
+  FallibleTArray<SampleDescriptionEntry> mSampleDescriptions;
+
+  nsTArray<Moof>& Moofs() { return mMoofs; }  // mutable access to mMoofs
+
+ private:
+  void ScanForMetadata(mozilla::MediaByteRange& aMoov);
+  nsTArray<Moof> mMoofs;
+  nsTArray<MediaByteRange> mMediaRanges;
+  nsTArray<TrackEndCts> mTracksEndCts;
+  bool mIsAudio;             // copied from the constructor argument
+  uint64_t mLastDecodeTime;  // starts at 0
+  // Either a ParseAllTracks if in multitrack mode, or an integer representing
+  // the track_id for the track being parsed. If parsing a specific track, mTrex
+  // should have an id matching mTrackParseMode.as<uint32_t>(). In this case 0
+  // is a valid track id -- this is not allowed in the spec, but such mp4s
+  // appear in the wild. In the ParseAllTracks case, mTrex can have an arbitrary
+  // id based on the tracks being parsed.
+  const TrackParseMode mTrackParseMode;
+};
+}  // namespace mozilla
+
+#endif