shaka-player/lib/cea/mp4_cea_parser.js

/*! @license
 * Shaka Player
 * Copyright 2016 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

goog.provide('shaka.cea.Mp4CeaParser');

goog.require('goog.asserts');
goog.require('shaka.cea.CeaUtils');
goog.require('shaka.cea.SeiProcessor');
goog.require('shaka.log');
goog.require('shaka.media.ClosedCaptionParser');
goog.require('shaka.util.DataViewReader');
goog.require('shaka.util.Error');
goog.require('shaka.util.Mp4Parser');
goog.require('shaka.util.Mp4BoxParsers');

/**
 * MPEG4 stream parser used for extracting 708 closed captions data.
 *
 * init() reads the init segment to learn the video codec, the TREX sample
 * defaults and each track's timescale.  parse() then uses the MOOF boxes of a
 * media segment to walk the NAL units stored in the MDAT boxes, and returns
 * the caption packets produced from the SEI NAL units.
 *
 * @implements {shaka.extern.ICeaParser}
 * @export
 */
shaka.cea.Mp4CeaParser = class {
  constructor() {
    /**
     * SEI data processor.
     * @private
     * @const {!shaka.cea.SeiProcessor}
     */
    this.seiProcessor_ = new shaka.cea.SeiProcessor();

    /**
     * Map of track id to corresponding timescale.
     * @private {!Map<number, number>}
     */
    this.trackIdToTimescale_ = new Map();

    /**
     * Default sample duration, as specified by the TREX box.
     * @private {number}
     */
    this.defaultSampleDuration_ = 0;

    /**
     * Default sample size, as specified by the TREX box.
     * @private {number}
     */
    this.defaultSampleSize_ = 0;

    /**
     * Bitstream format of the video, detected by init() from the codec box
     * (or from the FRMA box of an encrypted sample entry).  While this is
     * UNKNOWN, parse() returns no captions.
     * @private {shaka.cea.Mp4CeaParser.BitstreamFormat}
     */
    this.bitstreamFormat_ = shaka.cea.Mp4CeaParser.BitstreamFormat.UNKNOWN;
  }

  /**
   * Parses the init segment. Gets Default Sample Duration and Size from the
   * TREX box, and constructs a map of Track IDs to timescales. Each TRAK box
   * contains a track header (TKHD) containing track ID, and a media header box
   * (MDHD) containing the timescale for the track.  The codec box found in the
   * sample description decides which bitstream format parse() will assume.
   *
   * @throws {shaka.util.Error} INVALID_MP4_CEA when no track is found, or when
   *   the number of TKHD boxes differs from the number of MDHD boxes.
   * @override
   */
  init(initSegment) {
    const Mp4Parser = shaka.util.Mp4Parser;
    const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;
    const trackIds = [];
    const timescales = [];

    const codecBoxParser = (box) => this.setBitstreamFormat_(box.name);

    new Mp4Parser()
        .box('moov', Mp4Parser.children)
        .box('mvex', Mp4Parser.children)
        .fullBox('trex', (box) => {
          const parsedTREXBox = shaka.util.Mp4BoxParsers.parseTREX(
              box.reader);

          this.defaultSampleDuration_ = parsedTREXBox.defaultSampleDuration;
          this.defaultSampleSize_ = parsedTREXBox.defaultSampleSize;
        })
        .box('trak', Mp4Parser.children)
        .fullBox('tkhd', (box) => {
          goog.asserts.assert(
              box.version != null,
              'TKHD is a full box and should have a valid version.');
          const parsedTKHDBox = shaka.util.Mp4BoxParsers.parseTKHD(
              box.reader, box.version);
          trackIds.push(parsedTKHDBox.trackId);
        })
        .box('mdia', Mp4Parser.children)
        .fullBox('mdhd', (box) => {
          goog.asserts.assert(
              box.version != null,
              'MDHD is a full box and should have a valid version.');
          const parsedMDHDBox = shaka.util.Mp4BoxParsers.parseMDHD(
              box.reader, box.version);
          timescales.push(parsedMDHDBox.timescale);
        })
        .box('minf', Mp4Parser.children)
        .box('stbl', Mp4Parser.children)
        .fullBox('stsd', Mp4Parser.sampleDescription)

        // These are the various boxes that signal a codec.
        .box('avc1', codecBoxParser)
        .box('avc3', codecBoxParser)
        .box('dvav', codecBoxParser)
        .box('dva1', codecBoxParser)
        .box('hev1', codecBoxParser)
        .box('hvc1', codecBoxParser)
        .box('dvh1', codecBoxParser)
        .box('dvhe', codecBoxParser)
        .box('vvc1', codecBoxParser)
        .box('vvi1', codecBoxParser)
        .box('dvc1', codecBoxParser)
        .box('dvi1', codecBoxParser)

        // This signals an encrypted sample, which we can go inside of to find
        // the codec used.
        .box('encv', Mp4Parser.visualSampleEntry)
        .box('sinf', Mp4Parser.children)
        .box('frma', (box) => {
          const {codec} = shaka.util.Mp4BoxParsers.parseFRMA(box.reader);
          this.setBitstreamFormat_(codec);
        })

        .parse(initSegment, /* partialOkay= */ true, /* stopOnPartial= */ true);

    // At least one track should exist, and each track should have a
    // corresponding Id in TKHD box, and timescale in its MDHD box.  (A
    // non-empty list of ids with an empty list of timescales is caught by the
    // length comparison.)
    if (!trackIds.length || trackIds.length != timescales.length) {
      throw new shaka.util.Error(
          shaka.util.Error.Severity.CRITICAL,
          shaka.util.Error.Category.TEXT,
          shaka.util.Error.Code.INVALID_MP4_CEA);
    }

    if (this.bitstreamFormat_ == BitstreamFormat.UNKNOWN) {
      shaka.log.alwaysWarn(
          'Unable to determine bitstream format for CEA parsing!');
    }

    // Populate the map from track Id to timescale.  The two arrays are filled
    // in box order, so entries at the same index are paired together.
    trackIds.forEach((trackId, idx) => {
      this.trackIdToTimescale_.set(trackId, timescales[idx]);
    });
  }

  /**
   * Parses each video segment. In fragmented MP4s, MOOF and MDAT come in
   * pairs. The following logic gets the necessary info from MOOFs to parse
   * MDATs (base media decode time, sample sizes/offsets/durations, etc),
   * and then parses the MDAT boxes for CEA-708 packets using this information.
   * CEA-708 packets are returned to the caller; an empty array is returned
   * when init() could not determine the bitstream format.
   * @override
   */
  parse(mediaSegment) {
    const Mp4Parser = shaka.util.Mp4Parser;
    const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;

    if (this.bitstreamFormat_ == BitstreamFormat.UNKNOWN) {
      // We don't know how to extract SEI from this.
      return [];
    }

    /** @type {!Array<!shaka.extern.ICeaParser.CaptionPacket>} **/
    const captionPackets = [];

    // Start of the most recent MOOF box; TRUN data offsets are applied
    // relative to it when locating samples in the following MDAT.
    let moofOffset = 0;

    /** @type {!Array<!shaka.cea.Mp4CeaParser.ParsedTRAF>} */
    let parsedTRAFs = [];

    new Mp4Parser()
        .box('moof', (box) => {
          moofOffset = box.start;
          // traf box parsing is reset on each moof.
          parsedTRAFs = [];
          Mp4Parser.children(box);
        })
        .box('traf', (box) => {
          parsedTRAFs.push({
            baseMediaDecodeTime: null,
            defaultSampleDuration: this.defaultSampleDuration_,
            defaultSampleSize: this.defaultSampleSize_,
            parsedTRUNs: [],
            timescale: shaka.cea.CeaUtils.DEFAULT_TIMESCALE_VALUE,
          });
          Mp4Parser.children(box);
        })
        .fullBox('trun', (box) => {
          goog.asserts.assert(
              box.version != null && box.flags != null,
              'TRUN is a full box and should have a valid version & flags.');

          const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];
          if (!lastTRAF) {
            // A TRUN that is not inside a TRAF cannot be attributed to a
            // track; ignore it rather than failing the whole segment.
            return;
          }

          const parsedTRUN = shaka.util.Mp4BoxParsers.parseTRUN(
              box.reader, box.version, box.flags);
          lastTRAF.parsedTRUNs.push(parsedTRUN);
        })
        .fullBox('tfhd', (box) => {
          goog.asserts.assert(
              box.flags != null,
              'TFHD is a full box and should have valid flags.');

          const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];
          if (!lastTRAF) {
            // See the TRUN handler: a header outside of a TRAF is ignored.
            return;
          }

          const parsedTFHD = shaka.util.Mp4BoxParsers.parseTFHD(
              box.reader, box.flags);

          // If specified, defaultSampleDuration and defaultSampleSize
          // override the ones specified in the TREX box
          lastTRAF.defaultSampleDuration = parsedTFHD.defaultSampleDuration ||
              this.defaultSampleDuration_;

          lastTRAF.defaultSampleSize = parsedTFHD.defaultSampleSize ||
              this.defaultSampleSize_;

          const trackId = parsedTFHD.trackId;

          // Get the timescale from the track Id.  Unknown tracks keep the
          // default timescale assigned when the TRAF was opened.
          if (this.trackIdToTimescale_.has(trackId)) {
            lastTRAF.timescale = this.trackIdToTimescale_.get(trackId);
          }
        })
        .fullBox('tfdt', (box) => {
          goog.asserts.assert(
              box.version != null,
              'TFDT is a full box and should have a valid version.');

          const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];
          if (!lastTRAF) {
            // See the TRUN handler: a TFDT outside of a TRAF is ignored.
            return;
          }

          const parsedTFDT = shaka.util.Mp4BoxParsers.parseTFDTInaccurate(
              box.reader, box.version);

          lastTRAF.baseMediaDecodeTime = parsedTFDT.baseMediaDecodeTime;
        })
        .box('mdat', (box) => {
          // Converts a TRUN data offset (applied from the start of the MOOF)
          // into a distance from the reader's initial position.  The 8
          // accounts for the MDAT box header.
          const offset = moofOffset - box.start - 8;
          const initialPosition = box.reader.getPosition();
          for (const parsedTRAF of parsedTRAFs) {
            if (parsedTRAF.baseMediaDecodeTime === null) {
              // This field should have been populated by the Base Media Decode
              // Time in the tfdt box.
              shaka.log.alwaysWarn(
                  'Unable to find base media decode time for CEA captions!');
              continue;
            }
            // Every TRAF is parsed from the same starting point, so rewind
            // whatever the previous TRAF consumed.
            box.reader.seek(initialPosition);
            this.parseMdat_(box.reader,
                parsedTRAF.baseMediaDecodeTime,
                parsedTRAF.timescale,
                parsedTRAF.defaultSampleDuration,
                parsedTRAF.defaultSampleSize,
                offset,
                parsedTRAF.parsedTRUNs,
                captionPackets);
          }
        })
        .parse(mediaSegment, /* partialOkay= */ false,
            /* stopOnPartial= */ true);

    return captionPackets;
  }

  /**
   * Parse MDAT box.  Walks the length-prefixed NAL units of one track
   * fragment, hands every SEI NAL unit to the SEI processor and appends the
   * resulting packets, stamped with their presentation time, to
   * captionPackets.
   *
   * Reading stops quietly at the first NAL unit that cannot be read in full:
   * captions are best-effort and must not break playback.
   *
   * @param {!shaka.util.DataViewReader} reader Reader over the MDAT, at the
   *   position "offset" is relative to.
   * @param {number} time Decode time of the first sample, in timescale units.
   * @param {number} timescale Units per second for "time" and durations.
   * @param {number} defaultSampleDuration Used for samples that do not carry
   *   their own duration.
   * @param {number} defaultSampleSize Used for samples that do not carry
   *   their own size.
   * @param {number} offset Added to the first TRUN's data offset to get the
   *   number of bytes to skip before the first sample.
   * @param {!Array<shaka.util.ParsedTRUNBox>} parsedTRUNs
   * @param {!Array<!shaka.extern.ICeaParser.CaptionPacket>} captionPackets
   *   Output array; found packets are appended to it.
   * @private
   */
  parseMdat_(reader, time, timescale, defaultSampleDuration,
      defaultSampleSize, offset, parsedTRUNs, captionPackets) {
    const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;
    const CeaUtils = shaka.cea.CeaUtils;

    // A TRAF without any TRUN describes no samples, so there is nothing to
    // locate in the MDAT (and no data offset to read).
    if (!parsedTRUNs.length) {
      return;
    }

    // Combine all sample data.  This assumes that the samples described across
    // multiple trun boxes are still continuous in the mdat box.
    const sampleDatas = parsedTRUNs.map((t) => t.sampleData);
    const sampleData = [].concat(...sampleDatas);

    let sampleIndex = 0;

    // The fields in each ParsedTRUNSample contained in the sampleData
    // array are nullable. In the case of sample size and sample duration,
    // we use the defaults provided by the TREX/TFHD boxes. For sample
    // composition time offset, we default to 0.
    let sampleSize = defaultSampleSize;
    if (sampleData.length) {
      sampleSize = sampleData[0].sampleSize || defaultSampleSize;
    }

    // A TRUN without an explicit data offset contributes nothing here.
    reader.skip(offset + (parsedTRUNs[0].dataOffset || 0));

    goog.asserts.assert(this.bitstreamFormat_ != BitstreamFormat.UNKNOWN,
        'Bitstream format should have been checked before now!');

    while (reader.hasMoreData()) {
      let naluSize = 0;
      /** @type {?Uint8Array} */
      let seiNalu = null;

      try {
        naluSize = reader.readUint32();
        const naluHeader = reader.readUint8();
        let naluHeaderSize = 1;
        let isSeiMessage = false;

        switch (this.bitstreamFormat_) {
          case BitstreamFormat.H264: {
            const naluType = naluHeader & 0x1f;
            isSeiMessage = naluType == CeaUtils.H264_NALU_TYPE_SEI;
            break;
          }

          case BitstreamFormat.H265: {
            // Two-byte header; the type is in bits 1-6 of the first byte.
            naluHeaderSize = 2;
            reader.skip(1);
            const naluType = (naluHeader >> 1) & 0x3f;
            isSeiMessage =
                naluType == CeaUtils.H265_PREFIX_NALU_TYPE_SEI ||
                naluType == CeaUtils.H265_SUFFIX_NALU_TYPE_SEI;
            break;
          }

          case BitstreamFormat.H266: {
            // Two-byte header.  Unlike H.265, the first byte only holds the
            // forbidden/reserved bits and the layer id; the NAL unit type is
            // the top 5 bits of the second byte.
            naluHeaderSize = 2;
            const secondHeaderByte = reader.readUint8();
            const naluType = (secondHeaderByte >> 3) & 0x1f;
            isSeiMessage =
                naluType == CeaUtils.H266_PREFIX_NALU_TYPE_SEI ||
                naluType == CeaUtils.H266_SUFFIX_NALU_TYPE_SEI;
            break;
          }

          default:
            return;
        }

        const payloadSize = naluSize - naluHeaderSize;
        if (isSeiMessage) {
          seiNalu = reader.readBytes(
              payloadSize,
              // Don't clone.  The nalu is temporary and is not stored.
              /* clone= */ false);
        } else {
          reader.skip(payloadSize);
        }
      } catch (e) {
        // It is necessary to ignore this error because it can break the start
        // of playback even if the user does not want to see the subtitles.
        // This covers a truncated NAL unit header as well as a payload (SEI
        // or not) that runs past the end of the data.
        break;
      }

      if (seiNalu) {
        let timeOffset = 0;

        if (sampleIndex < sampleData.length) {
          timeOffset = sampleData[sampleIndex].sampleCompositionTimeOffset || 0;
        }

        const pts = (time + timeOffset) / timescale;
        for (const packet of this.seiProcessor_.process(seiNalu)) {
          captionPackets.push({
            packet,
            pts,
          });
        }
      }

      // Each NAL unit is preceded by its 4-byte length field.  Once the
      // current sample's bytes are used up, move on to the next sample.
      sampleSize -= (naluSize + 4);
      if (sampleSize == 0) {
        if (sampleIndex < sampleData.length) {
          time += sampleData[sampleIndex].sampleDuration ||
              defaultSampleDuration;
        } else {
          time += defaultSampleDuration;
        }

        sampleIndex++;

        if (sampleIndex < sampleData.length) {
          sampleSize = sampleData[sampleIndex].sampleSize || defaultSampleSize;
        } else {
          sampleSize = defaultSampleSize;
        }
      }
    }
  }

  /**
   * Records the bitstream format that corresponds to the given codec.  Codecs
   * that are not in CodecBitstreamMap_ leave the current format untouched.
   *
   * @param {string} codec A fourcc for a codec.
   * @private
   */
  setBitstreamFormat_(codec) {
    if (shaka.cea.Mp4CeaParser.CodecBitstreamMap_.has(codec)) {
      this.bitstreamFormat_ =
          shaka.cea.Mp4CeaParser.CodecBitstreamMap_.get(codec);
    }
  }
};

/**
 * Video bitstream formats the parser can tell apart.  The format selects how
 * the NAL unit header is decoded when looking for SEI messages.
 * @enum {number}
 */
shaka.cea.Mp4CeaParser.BitstreamFormat = {
  // No recognized codec has been seen; segments are not parsed in this state.
  UNKNOWN: 0,
  H264: 1,
  H265: 2,
  H266: 3,
};

/**
 * Maps a codec fourcc (the name of a sample entry box, or the original format
 * read from an FRMA box) to the bitstream format used to parse its NAL units.
 * @private {Map<string, shaka.cea.Mp4CeaParser.BitstreamFormat>}
 */
shaka.cea.Mp4CeaParser.CodecBitstreamMap_ = new Map()
    // AVC
    .set('avc1', shaka.cea.Mp4CeaParser.BitstreamFormat.H264)
    .set('avc3', shaka.cea.Mp4CeaParser.BitstreamFormat.H264)
    // Dolby Vision based on AVC
    .set('dvav', shaka.cea.Mp4CeaParser.BitstreamFormat.H264)
    .set('dva1', shaka.cea.Mp4CeaParser.BitstreamFormat.H264)
    // HEVC
    .set('hev1', shaka.cea.Mp4CeaParser.BitstreamFormat.H265)
    .set('hvc1', shaka.cea.Mp4CeaParser.BitstreamFormat.H265)
    // Dolby Vision based on HEVC
    .set('dvh1', shaka.cea.Mp4CeaParser.BitstreamFormat.H265)
    .set('dvhe', shaka.cea.Mp4CeaParser.BitstreamFormat.H265)
    // VVC
    .set('vvc1', shaka.cea.Mp4CeaParser.BitstreamFormat.H266)
    .set('vvi1', shaka.cea.Mp4CeaParser.BitstreamFormat.H266)
    // Dolby Vision based on VVC
    .set('dvc1', shaka.cea.Mp4CeaParser.BitstreamFormat.H266)
    .set('dvi1', shaka.cea.Mp4CeaParser.BitstreamFormat.H266);

/**
 * @typedef {{
 *    baseMediaDecodeTime: ?number,
 *    defaultSampleDuration: number,
 *    defaultSampleSize: number,
 *    parsedTRUNs: !Array<shaka.util.ParsedTRUNBox>,
 *    timescale: number,
 * }}
 *
 * @description
 * Information collected from one TRAF box while parsing a media segment.
 *
 * @property {?number} baseMediaDecodeTime
 *   Base media decode time read from the TFDT box, or null if no TFDT box has
 *   been parsed for this TRAF.
 * @property {number} defaultSampleDuration
 *   Default sample duration from the TFHD box if it specifies one, otherwise
 *   the default from the TREX box.
 * @property {number} defaultSampleSize
 *   Default sample size from the TFHD box if it specifies one, otherwise the
 *   default from the TREX box.
 * @property {!Array<shaka.util.ParsedTRUNBox>} parsedTRUNs
 *   The TRUN boxes parsed for this TRAF, in the order they were encountered.
 * @property {number} timescale
 *   Timescale of the track named by the TFHD box, or the CeaUtils default
 *   timescale if that track is not known from the init segment.
 */
shaka.cea.Mp4CeaParser.ParsedTRAF;

// Register a factory so that a fresh Mp4CeaParser is created for 'video/mp4'.
shaka.media.ClosedCaptionParser.registerParser(
    'video/mp4', () => new shaka.cea.Mp4CeaParser());