summaryrefslogtreecommitdiffstats
path: root/dom/media/mp4/MoofParser.h
blob: 9099df7d148cbf82c1a4486edd2d6005cf234cdb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef MOOF_PARSER_H_
#define MOOF_PARSER_H_

#include "mozilla/ResultExtensions.h"
#include "mozilla/Variant.h"
#include "Atom.h"
#include "AtomType.h"
#include "SinfParser.h"
#include "ByteStream.h"
#include "MP4Interval.h"
#include "MediaResource.h"

namespace mozilla {

// Signed 64-bit quantity used for times and durations in microseconds
// (see Mvhd::ToMicroseconds).
using Microseconds = int64_t;

// Forward declarations for types referenced further down in this header.
class Box;
class BoxContext;
class BoxReader;
class Moof;

// Remembers, for one track, the CTS end time of the last sample of that track
// in the preceding Moof, so that the track's timestamps can be smoothed
// across Moofs.
struct TrackEndCts {
  uint32_t mTrackId;
  Microseconds mCtsEndTime;

  TrackEndCts(uint32_t aTrackId, Microseconds aCtsEndTime)
      : mTrackId{aTrackId}, mCtsEndTime{aCtsEndTime} {}
};

// Atom holding creation/modification times, a timescale and a duration, plus
// a helper converting timescale units into microseconds. Also serves as the
// base class of Tkhd and Mdhd.
class Mvhd : public Atom {
 public:
  // Creates an instance with every field set to zero.
  Mvhd()
      : mCreationTime(0), mModificationTime(0), mTimescale(0), mDuration(0) {}
  explicit Mvhd(Box& aBox);

  // Converts aTimescaleUnits, expressed in units of 1/mTimescale seconds, to
  // microseconds.
  //
  // Returns Err(NS_ERROR_FAILURE) when mTimescale is zero, or when the result
  // does not fit in a signed 64-bit Microseconds value. The inputs originate
  // from media data, so the range is checked explicitly rather than letting
  // the signed arithmetic overflow (which would be undefined behaviour).
  Result<Microseconds, nsresult> ToMicroseconds(int64_t aTimescaleUnits) {
    if (!mTimescale) {
      NS_WARNING("invalid mTimescale");
      return Err(NS_ERROR_FAILURE);
    }
    const int64_t kUsPerSecond = 1000000ll;
    const int64_t timescale = mTimescale;
    // Split into whole seconds and a sub-second remainder so the
    // multiplication by 1000000 loses no precision.
    const int64_t major = aTimescaleUnits / timescale;
    const int64_t remainder = aTimescaleUnits % timescale;
    if (major > INT64_MAX / kUsPerSecond || major < INT64_MIN / kUsPerSecond) {
      NS_WARNING("timestamp out of range for Microseconds");
      return Err(NS_ERROR_FAILURE);
    }
    const int64_t wholeUs = major * kUsPerSecond;
    // |remainder| < mTimescale <= UINT32_MAX, so this product stays well
    // inside the int64_t range.
    const int64_t fractionUs = remainder * kUsPerSecond / timescale;
    // wholeUs and fractionUs never have opposite signs; guard the final sum.
    if ((fractionUs > 0 && wholeUs > INT64_MAX - fractionUs) ||
        (fractionUs < 0 && wholeUs < INT64_MIN - fractionUs)) {
      NS_WARNING("timestamp out of range for Microseconds");
      return Err(NS_ERROR_FAILURE);
    }
    return wholeUs + fractionUs;
  }

  uint64_t mCreationTime;
  uint64_t mModificationTime;
  // Number of units per second used by ToMicroseconds(); zero is rejected
  // there as invalid.
  uint32_t mTimescale;
  uint64_t mDuration;

 protected:
  // Defined outside this header. NOTE(review): presumably fills the fields
  // above from aBox -- confirm in the implementation file.
  Result<Ok, nsresult> Parse(Box& aBox);
};

// Extends Mvhd with a track id.
class Tkhd : public Mvhd {
 public:
  Tkhd() = default;
  explicit Tkhd(Box& aBox);

  // Zero until a constructor or parser assigns another value.
  uint32_t mTrackId = 0;

 protected:
  // Defined outside this header.
  Result<Ok, nsresult> Parse(Box& aBox);
};

// Mvhd subclass that adds no members of its own: it inherits Mvhd's fields
// and ToMicroseconds(). The Box constructor is defined outside this header.
class Mdhd : public Mvhd {
 public:
  Mdhd() = default;
  explicit Mdhd(Box& aBox);
};

// Atom carrying a track id together with default sample description index,
// duration, size and flags. Base class of Tfhd.
class Trex : public Atom {
 public:
  // Creates an instance for aTrackId; every other field stays at zero.
  explicit Trex(uint32_t aTrackId) : mTrackId(aTrackId) {}

  explicit Trex(Box& aBox);

  uint32_t mFlags = 0;
  uint32_t mTrackId = 0;
  uint32_t mDefaultSampleDescriptionIndex = 0;
  uint32_t mDefaultSampleDuration = 0;
  uint32_t mDefaultSampleSize = 0;
  uint32_t mDefaultSampleFlags = 0;

 protected:
  // Defined outside this header.
  Result<Ok, nsresult> Parse(Box& aBox);
};

// Trex extended with a base data offset.
class Tfhd : public Trex {
 public:
  // Starts out as a copy of aTrex with a zero base data offset, and adopts
  // whatever aTrex.IsValid() reports as its own validity.
  explicit Tfhd(Trex& aTrex) : Trex(aTrex) { mValid = aTrex.IsValid(); }
  Tfhd(Box& aBox, Trex& aTrex);

  uint64_t mBaseDataOffset = 0;

 protected:
  // Defined outside this header.
  Result<Ok, nsresult> Parse(Box& aBox);
};

// Atom holding a single base media decode time value.
class Tfdt : public Atom {
 public:
  Tfdt() = default;
  explicit Tfdt(Box& aBox);

  // Zero until a constructor or parser assigns another value.
  uint64_t mBaseMediaDecodeTime = 0;

 protected:
  // Defined outside this header.
  Result<Ok, nsresult> Parse(Box& aBox);
};

class Edts : public Atom {
 public:
  Edts() : mMediaStart(0), mEmptyOffset(0) {}
  explicit Edts(Box& aBox);
  virtual bool IsValid() override {
    // edts is optional
    return true;
  }

  int64_t mMediaStart;
  int64_t mEmptyOffset;

 protected:
  Result<Ok, nsresult> Parse(Box& aBox);
};

// Per-sample index entry, stored in Moof::mIndex.
//
// The scalar members carry default initializers so that a default-constructed
// Sample never exposes indeterminate values, even if a code path forgets to
// assign one of them.
struct Sample {
  // NOTE(review): presumably the byte range of the sample data -- confirm
  // against the code that fills the index.
  mozilla::MediaByteRange mByteRange;
  // NOTE(review): presumably the byte range of the sample's common-encryption
  // auxiliary data -- confirm against Moof::ProcessCencAuxInfo.
  mozilla::MediaByteRange mCencRange;
  Microseconds mDecodeTime = 0;
  MP4Interval<Microseconds> mCompositionRange;
  bool mSync = false;
};

// Atom built from a Box, exposing an auxiliary info type, its parameter and a
// list of per-sample info sizes.
// NOTE(review): the name suggests the ISO BMFF 'saiz' box -- confirm against
// the implementation file.
class Saiz final : public Atom {
 public:
  // NOTE(review): aDefaultType is presumably the value used for mAuxInfoType
  // when the box does not supply one -- confirm in the constructor definition.
  Saiz(Box& aBox, AtomType aDefaultType);

  AtomType mAuxInfoType;
  uint32_t mAuxInfoTypeParameter;
  FallibleTArray<uint8_t> mSampleInfoSize;

 protected:
  // Defined outside this header.
  Result<Ok, nsresult> Parse(Box& aBox);
};

// Atom built from a Box, exposing an auxiliary info type, its parameter and a
// list of 64-bit offsets.
// NOTE(review): the name suggests the ISO BMFF 'saio' box -- confirm against
// the implementation file.
class Saio final : public Atom {
 public:
  // NOTE(review): aDefaultType is presumably the value used for mAuxInfoType
  // when the box does not supply one -- confirm in the constructor definition.
  Saio(Box& aBox, AtomType aDefaultType);

  AtomType mAuxInfoType;
  uint32_t mAuxInfoTypeParameter;
  FallibleTArray<uint64_t> mOffsets;

 protected:
  // Defined outside this header.
  Result<Ok, nsresult> Parse(Box& aBox);
};

// Pairs a sample count with a group description index.
struct SampleToGroupEntry {
  // Index bases for track-level and fragment-level group descriptions.
  static const uint32_t kTrackGroupDescriptionIndexBase = 0;
  static const uint32_t kFragmentGroupDescriptionIndexBase = 0x10000;

  uint32_t mSampleCount;
  uint32_t mGroupDescriptionIndex;

  SampleToGroupEntry(uint32_t aSampleCount, uint32_t aGroupDescriptionIndex)
      : mSampleCount{aSampleCount},
        mGroupDescriptionIndex{aGroupDescriptionIndex} {}
};

// Atom exposing a grouping type, a grouping type parameter and a list of
// SampleToGroupEntry items.
class Sbgp final : public Atom  // SampleToGroup box.
{
 public:
  explicit Sbgp(Box& aBox);

  AtomType mGroupingType;
  uint32_t mGroupingTypeParam;
  FallibleTArray<SampleToGroupEntry> mEntries;

 protected:
  // Defined outside this header.
  Result<Ok, nsresult> Parse(Box& aBox);
};

// Stores information from CencSampleEncryptionInformationGroupEntry (seig).
// Cenc here refers to the common encryption standard, rather than the specific
// cenc scheme from that standard. This structure is used for all encryption
// schemes. I.e. it is used for both cenc and cbcs, not just cenc.
struct CencSampleEncryptionInfoEntry final {
 public:
  // Leaves every member at the default given in its declaration below.
  CencSampleEncryptionInfoEntry() = default;

  // Defined outside this header. NOTE(review): presumably fills the members
  // below from aReader -- confirm in the implementation file.
  Result<Ok, nsresult> Init(BoxReader& aReader);

  bool mIsEncrypted = false;
  uint8_t mIVSize = 0;
  CopyableTArray<uint8_t> mKeyId;
  uint8_t mCryptByteBlock = 0;
  uint8_t mSkipByteBlock = 0;
  // Note: the member name is spelled "Consant" (sic). It is a public member,
  // so the spelling is kept for the benefit of existing users.
  CopyableTArray<uint8_t> mConsantIV;
};

// Atom exposing a grouping type and a list of CencSampleEncryptionInfoEntry
// items.
class Sgpd final : public Atom  // SampleGroupDescription box.
{
 public:
  explicit Sgpd(Box& aBox);

  AtomType mGroupingType;
  FallibleTArray<CencSampleEncryptionInfoEntry> mEntries;

 protected:
  // Defined outside this header.
  Result<Ok, nsresult> Parse(Box& aBox);
};

// Audio/video entries from the sample description box (stsd). We only need to
// store whether these are encrypted, so we do not need a specialized class for
// different audio and video data. Currently most of the parsing of these
// entries is done by mp4parse-rust, but the moof parser needs to know which of
// these are encrypted when parsing the track fragment header (tfhd).
struct SampleDescriptionEntry {
  // False unless explicitly set.
  bool mIsEncryptedEntry = false;
};

// Empty tag type: when held by a TrackParseMode it means that all tracks
// should be parsed.
struct ParseAllTracks {};

// Either ParseAllTracks, or the uint32_t id of the single track to parse.
using TrackParseMode = Variant<ParseAllTracks, uint32_t>;

// Atom describing one moof: its byte ranges, time range, sample index and the
// encryption-related data collected for it. All member functions are defined
// outside this header.
class Moof final : public Atom {
 public:
  // Takes the same track/header context that is forwarded to ParseTraf(),
  // plus aTracksEndCts (per-track CTS end times, see TrackEndCts).
  Moof(Box& aBox, const TrackParseMode& aTrackParseMode, Trex& aTrex,
       Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf,
       uint64_t* aDecodeTime, bool aIsAudio,
       nsTArray<TrackEndCts>& aTracksEndCts);
  // NOTE(review): presumably fills *aByteRanges with the auxiliary info byte
  // ranges for aType and returns false on failure -- confirm in the
  // implementation file.
  bool GetAuxInfo(AtomType aType, FallibleTArray<MediaByteRange>* aByteRanges);
  // NOTE(review): presumably adjusts this moof's sample times relative to
  // aMoof, within mMaxRoundingError -- confirm in the implementation file.
  void FixRounding(const Moof& aMoof);

  mozilla::MediaByteRange mRange;
  mozilla::MediaByteRange mMdatRange;
  MP4Interval<Microseconds> mTimeRange;
  // One Sample entry per indexed sample.
  FallibleTArray<Sample> mIndex;

  FallibleTArray<CencSampleEncryptionInfoEntry>
      mFragmentSampleEncryptionInfoEntries;
  FallibleTArray<SampleToGroupEntry> mFragmentSampleToGroupEntries;

  Tfhd mTfhd;
  FallibleTArray<Saiz> mSaizs;
  FallibleTArray<Saio> mSaios;
  nsTArray<nsTArray<uint8_t>> mPsshes;

 private:
  // aDecodeTime is updated to the end of the parsed TRAF on return.
  void ParseTraf(Box& aBox, const TrackParseMode& aTrackParseMode, Trex& aTrex,
                 Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts, Sinf& aSinf,
                 uint64_t* aDecodeTime, bool aIsAudio);
  // aDecodeTime is updated to the end of the parsed TRUN on return.
  Result<Ok, nsresult> ParseTrun(Box& aBox, Mvhd& aMvhd, Mdhd& aMdhd,
                                 Edts& aEdts, uint64_t* aDecodeTime,
                                 bool aIsAudio);
  // Process the sample auxiliary information used by common encryption.
  // aScheme is used to select the appropriate auxiliary information and should
  // be set based on the encryption scheme used by the track being processed.
  // Note, the term cenc here refers to the standard, not the specific scheme
  // from that standard. I.e. this function is used to handle auxiliary
  // information from both the cenc and cbcs schemes.
  bool ProcessCencAuxInfo(AtomType aScheme);
  uint64_t mMaxRoundingError;
};

DDLoggedTypeDeclName(MoofParser);

// Parser that reads from a ByteStream and maintains the list of Moofs found in
// it, together with the header atoms (mvhd, mdhd, trex, tfdt, edts, sinf)
// needed to interpret them. Member functions other than the constructor and
// Moofs() are defined outside this header.
class MoofParser : public DecoderDoctorLifeLogger<MoofParser> {
 public:
  // aTrackParseMode selects either a single track id or ParseAllTracks.
  // mTrex is seeded with the requested track id, or with 0 when all tracks
  // are being parsed.
  MoofParser(ByteStream* aSource, const TrackParseMode& aTrackParseMode,
             bool aIsAudio)
      : mSource(aSource),
        mOffset(0),
        mTrex(aTrackParseMode.is<uint32_t>() ? aTrackParseMode.as<uint32_t>()
                                             : 0),
        mIsAudio(aIsAudio),
        mLastDecodeTime(0),
        mTrackParseMode(aTrackParseMode) {
    // NOTE(review): this comment used to describe setting mIsMultitrackParser,
    // but no member of that name is declared in this class; multi-track
    // parsing is expressed through mTrackParseMode (ParseAllTracks). Parsing
    // multiple tracks is a nasty work around for calculating the composition
    // range for MSE. Ideally we'd store an array of tracks with different
    // metadata for each.
    DDLINKCHILD("source", aSource);
  }
  bool RebuildFragmentedIndex(const mozilla::MediaByteRangeSet& aByteRanges);
  // If *aCanEvict is true on entry, removes all moofs already parsed from the
  // index and then rebuilds the index. On return, *aCanEvict is true if some
  // moofs were removed.
  bool RebuildFragmentedIndex(const mozilla::MediaByteRangeSet& aByteRanges,
                              bool* aCanEvict);
  bool RebuildFragmentedIndex(BoxContext& aContext);
  MP4Interval<Microseconds> GetCompositionRange(
      const mozilla::MediaByteRangeSet& aByteRanges);
  bool ReachedEnd();
  void ParseMoov(Box& aBox);
  void ParseTrak(Box& aBox);
  void ParseMdia(Box& aBox);
  void ParseMvex(Box& aBox);

  void ParseMinf(Box& aBox);
  void ParseStbl(Box& aBox);
  void ParseStsd(Box& aBox);
  void ParseEncrypted(Box& aBox);

  bool BlockingReadNextMoof();

  already_AddRefed<mozilla::MediaByteBuffer> Metadata();
  MediaByteRange FirstCompleteMediaSegment();
  MediaByteRange FirstCompleteMediaHeader();

  mozilla::MediaByteRange mInitRange;
  RefPtr<ByteStream> mSource;
  uint64_t mOffset;
  Mvhd mMvhd;
  Mdhd mMdhd;
  Trex mTrex;
  Tfdt mTfdt;
  Edts mEdts;
  Sinf mSinf;

  FallibleTArray<CencSampleEncryptionInfoEntry>
      mTrackSampleEncryptionInfoEntries;
  FallibleTArray<SampleToGroupEntry> mTrackSampleToGroupEntries;
  FallibleTArray<SampleDescriptionEntry> mSampleDescriptions;

  // Mutable access to the list of parsed Moofs.
  nsTArray<Moof>& Moofs() { return mMoofs; }

 private:
  void ScanForMetadata(mozilla::MediaByteRange& aMoov);
  nsTArray<Moof> mMoofs;
  nsTArray<MediaByteRange> mMediaRanges;
  nsTArray<TrackEndCts> mTracksEndCts;
  bool mIsAudio;
  uint64_t mLastDecodeTime;
  // Either a ParseAllTracks if in multitrack mode, or an integer representing
  // the track_id for the track being parsed. If parsing a specific track, mTrex
  // should have an id matching mTrackParseMode.as<uint32_t>(). In this case 0
  // is a valid track id -- this is not allowed in the spec, but such mp4s
  // appear in the wild. In the ParseAllTracks case, mTrex can have an arbitrary
  // id based on the tracks being parsed.
  const TrackParseMode mTrackParseMode;
};
}  // namespace mozilla

#endif