shaka-player/lib/util/string_utils.js

/*! @license
 * Shaka Player
 * Copyright 2016 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

goog.provide('shaka.util.StringUtils');

goog.require('goog.asserts');
goog.require('shaka.log');
goog.require('shaka.util.BufferUtils');
goog.require('shaka.util.Error');
goog.require('shaka.util.Lazy');
goog.require('shaka.util.Platform');


/**
 * @namespace shaka.util.StringUtils
 * @summary A set of string utility functions.
 * @export
 */
shaka.util.StringUtils = class {
  /**
   * Creates a string from the given buffer as UTF-8 encoding.
   *
   * A leading UTF-8 byte-order-mark is stripped.  Malformed input never
   * throws; undecodable bytes become the replacement character (U+FFFD).
   *
   * @param {?BufferSource} data
   * @return {string} The decoded string, or the empty string when |data| is
   *   null.
   * @export
   */
  static fromUTF8(data) {
    if (!data) {
      return '';
    }

    let uint8 = shaka.util.BufferUtils.toUint8(data);
    // If present, strip off the UTF-8 BOM.
    if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
      uint8 = uint8.subarray(3);
    }

    if (window.TextDecoder && !shaka.util.Platform.isPS4()) {
      // Use the TextDecoder interface to decode the text.  This has the
      // advantage compared to the previously-standard decodeUriComponent that
      // it will continue parsing even if it finds an invalid UTF8 character,
      // rather than stop and throw an error.
      const utf8decoder = new TextDecoder();
      const decoded = utf8decoder.decode(uint8);
      if (decoded.includes('\uFFFD')) {
        shaka.log.alwaysError('Decoded string contains an "unknown character' +
                              '" codepoint.  That probably means the UTF8 ' +
                              'encoding was incorrect!');
      }
      return decoded;
    } else {
      // Homebrewed UTF-8 decoder based on
      // https://en.wikipedia.org/wiki/UTF-8#Encoding
      // Unlike decodeURIComponent, won't throw on bad encoding.
      // In this way, it is similar to TextDecoder.

      let decoded = '';
      for (let i = 0; i < uint8.length; ++i) {
        // By default, the "replacement character" codepoint.  Any byte that
        // does not start a well-formed sequence falls through to this value
        // and only that single byte is consumed.
        let codePoint = 0xFFFD;

        // Top bit is 0, 1-byte encoding.
        if ((uint8[i] & 0x80) == 0) {
          codePoint = uint8[i];

        // Top 3 bits of byte 0 are 110, top 2 bits of byte 1 are 10,
        // 2-byte encoding.
        } else if (uint8.length >= i + 2 &&
            (uint8[i] & 0xe0) == 0xc0 &&
            (uint8[i + 1] & 0xc0) == 0x80) {
          codePoint = ((uint8[i] & 0x1f) << 6) |
                      ((uint8[i + 1] & 0x3f));
          i += 1;  // Consume one extra byte.

        // Top 4 bits of byte 0 are 1110, top 2 bits of byte 1 and 2 are 10,
        // 3-byte encoding.
        } else if (uint8.length >= i + 3 &&
            (uint8[i] & 0xf0) == 0xe0 &&
            (uint8[i + 1] & 0xc0) == 0x80 &&
            (uint8[i + 2] & 0xc0) == 0x80) {
          codePoint = ((uint8[i] & 0x0f) << 12) |
                      ((uint8[i + 1] & 0x3f) << 6) |
                      ((uint8[i + 2] & 0x3f));
          i += 2;  // Consume two extra bytes.

        // Top 5 bits of byte 0 are 11110, top 2 bits of byte 1, 2 and 3 are 10,
        // 4-byte encoding.  The mask must cover all five top bits (0xf8);
        // a narrower mask rejects valid lead bytes such as 0xf1 and 0xf3 and
        // accepts invalid ones such as 0xf8.
        } else if (uint8.length >= i + 4 &&
            (uint8[i] & 0xf8) == 0xf0 &&
            (uint8[i + 1] & 0xc0) == 0x80 &&
            (uint8[i + 2] & 0xc0) == 0x80 &&
            (uint8[i + 3] & 0xc0) == 0x80) {
          const candidate = ((uint8[i] & 0x07) << 18) |
                            ((uint8[i + 1] & 0x3f) << 12) |
                            ((uint8[i + 2] & 0x3f) << 6) |
                            ((uint8[i + 3] & 0x3f));
          // Unicode ends at U+10FFFF.  A larger value cannot be expressed as
          // a UTF-16 surrogate pair, so treat the lead byte as invalid and
          // let the following bytes be examined on their own.
          if (candidate <= 0x10ffff) {
            codePoint = candidate;
            i += 3;  // Consume three extra bytes.
          }
        }

        // JavaScript strings are a series of UTF-16 characters.
        if (codePoint <= 0xffff) {
          decoded += String.fromCharCode(codePoint);
        } else {
          // UTF-16 surrogate-pair encoding, based on
          // https://en.wikipedia.org/wiki/UTF-16#Description
          const baseCodePoint = codePoint - 0x10000;
          const highPart = baseCodePoint >> 10;
          const lowPart = baseCodePoint & 0x3ff;
          decoded += String.fromCharCode(0xd800 + highPart);
          decoded += String.fromCharCode(0xdc00 + lowPart);
        }
      }

      return decoded;
    }
  }


  /**
   * Creates a string from the given buffer as UTF-16 encoding.
   *
   * @param {?BufferSource} data
   * @param {boolean} littleEndian
   *     true to read little endian, false to read big.
   * @param {boolean=} noThrow true to avoid throwing in cases where we may
   *     expect invalid input.  If noThrow is true and the data has an odd
   *     length, it will be truncated.
   * @return {string} The decoded string, or the empty string when |data| is
   *   null.
   * @throws {shaka.util.Error} BAD_ENCODING when the data has an odd byte
   *   length and noThrow is not set.
   * @export
   */
  static fromUTF16(data, littleEndian, noThrow) {
    if (!data) {
      return '';
    }

    if (!noThrow && data.byteLength % 2 != 0) {
      shaka.log.error('Data has an incorrect length, must be even.');
      throw new shaka.util.Error(
          shaka.util.Error.Severity.CRITICAL, shaka.util.Error.Category.TEXT,
          shaka.util.Error.Code.BAD_ENCODING);
    }

    // Use a DataView to ensure correct endianness.
    // Flooring drops a trailing odd byte, which can only occur with noThrow.
    const length = Math.floor(data.byteLength / 2);
    const arr = new Uint16Array(length);
    const dataView = shaka.util.BufferUtils.toDataView(data);
    for (let i = 0; i < length; i++) {
      arr[i] = dataView.getUint16(i * 2, littleEndian);
    }
    return shaka.util.StringUtils.fromCharCode(arr);
  }


  /**
   * Creates a string from the given buffer, auto-detecting the encoding that is
   * being used.  If it cannot detect the encoding, it will throw an exception.
   *
   * A byte-order-mark is honored when present.  Otherwise the first four
   * bytes are inspected: alternating zero bytes suggest UTF-16, and printable
   * ASCII suggests UTF-8.
   *
   * @param {?BufferSource} data
   * @return {string}
   * @throws {shaka.util.Error} UNABLE_TO_DETECT_ENCODING when no guess can be
   *   made.
   * @export
   */
  static fromBytesAutoDetect(data) {
    const StringUtils = shaka.util.StringUtils;
    if (!data) {
      return '';
    }

    const uint8 = shaka.util.BufferUtils.toUint8(data);
    if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
      // UTF-8 BOM; fromUTF8 strips it.
      return StringUtils.fromUTF8(uint8);
    } else if (uint8[0] == 0xfe && uint8[1] == 0xff) {
      // UTF-16 big-endian BOM; skip it before decoding.
      return StringUtils.fromUTF16(
          uint8.subarray(2), /* littleEndian= */ false);
    } else if (uint8[0] == 0xff && uint8[1] == 0xfe) {
      // UTF-16 little-endian BOM; skip it before decoding.
      return StringUtils.fromUTF16(uint8.subarray(2), /* littleEndian= */ true);
    }

    // Positions past the end of the data count as ASCII, so inputs shorter
    // than four bytes can still be classified.
    const isAscii = (i) => {
      // arr[i] >= horizontal tab && arr[i] <= '~';
      return uint8.byteLength <= i || (uint8[i] >= 0x09 && uint8[i] <= 0x7e);
    };

    shaka.log.debug(
        'Unable to find byte-order-mark, making an educated guess.');
    if (uint8[0] == 0 && uint8[2] == 0) {
      // Zero high bytes first: looks like big-endian UTF-16.
      return StringUtils.fromUTF16(data, /* littleEndian= */ false);
    } else if (uint8[1] == 0 && uint8[3] == 0) {
      // Zero high bytes second: looks like little-endian UTF-16.
      return StringUtils.fromUTF16(data, /* littleEndian= */ true);
    } else if (isAscii(0) && isAscii(1) && isAscii(2) && isAscii(3)) {
      return StringUtils.fromUTF8(data);
    }

    throw new shaka.util.Error(
        shaka.util.Error.Severity.CRITICAL,
        shaka.util.Error.Category.TEXT,
        shaka.util.Error.Code.UNABLE_TO_DETECT_ENCODING);
  }


  /**
   * Creates a ArrayBuffer from the given string, converting to UTF-8 encoding.
   *
   * @param {string} str
   * @return {!ArrayBuffer}
   * @export
   */
  static toUTF8(str) {
    if (window.TextEncoder && !shaka.util.Platform.isPS4()) {
      const utf8Encoder = new TextEncoder();
      return shaka.util.BufferUtils.toArrayBuffer(utf8Encoder.encode(str));
    } else {
      // http://stackoverflow.com/a/13691499
      // Converts the given string to a URI encoded string.  Unreserved ASCII
      // characters are left alone; everything else is converted to a series
      // of URI escape sequences according to UTF-8.
      // Example: 'g#€' -> 'g%23%E2%82%AC'
      const encoded = encodeURIComponent(str);
      // Convert each escape sequence individually into a character.  Each
      // escape sequence is interpreted as a code-point, so if an escape
      // sequence happens to be part of a multi-byte sequence, each byte will
      // be converted to a single character.
      // Example: 'g%23%E2%82%AC' -> '\x67\x23\xe2\x82\xac'
      const utf8 = unescape(encoded);

      // Every character is now in the range 0-255, so each char code is
      // exactly one byte of the UTF-8 output.
      const result = new Uint8Array(utf8.length);
      for (let i = 0; i < utf8.length; i++) {
        const item = utf8[i];
        result[i] = item.charCodeAt(0);
      }
      return shaka.util.BufferUtils.toArrayBuffer(result);
    }
  }


  /**
   * Creates a ArrayBuffer from the given string, converting to UTF-16 encoding.
   *
   * Each UTF-16 code unit of the string is written as two bytes; no
   * byte-order-mark is added.
   *
   * @param {string} str
   * @param {boolean} littleEndian
   * @return {!ArrayBuffer}
   * @export
   */
  static toUTF16(str, littleEndian) {
    const result = new ArrayBuffer(str.length * 2);
    const view = new DataView(result);
    for (let i = 0; i < str.length; ++i) {
      const value = str.charCodeAt(i);
      view.setUint16(/* position= */ i * 2, value, littleEndian);
    }
    return result;
  }


  /**
   * Creates a new string from the given array of char codes.
   *
   * Using String.fromCharCode.apply is risky because you can trigger stack
   * errors on very large arrays.  This breaks up the array into several pieces
   * to avoid this.
   *
   * @param {!TypedArray} array
   * @return {string}
   */
  static fromCharCode(array) {
    return shaka.util.StringUtils.fromCharCodeImpl_.value()(array);
  }

  /**
   * Resets the fromCharCode method's implementation.
   * For debug use.
   * @export
   */
  static resetFromCharCode() {
    shaka.util.StringUtils.fromCharCodeImpl_.reset();
  }

  /**
   * This method converts the HTML entities &amp;, &lt;, &gt;, &quot;, &apos;,
   * &nbsp;, &lrm; and &rlm; in string to their corresponding characters.
   * Numeric character references, decimal (e.g. &#39;) or hexadecimal
   * (e.g. &#x27;), are converted as well.
   *
   * @param {!string} input
   * @return {string} The unescaped string; the empty string for falsy input.
   */
  static htmlUnescape(input) {
    // Used to map HTML entities to characters.
    const htmlUnescapes = {
      '&amp;': '&',
      '&lt;': '<',
      '&gt;': '>',
      '&quot;': '"',
      '&apos;': '\'',
      '&nbsp;': '\u{a0}',
      '&lrm;': '\u{200e}',
      '&rlm;': '\u{200f}',
    };

    // Used to match HTML entities and HTML characters.
    const reEscapedHtml =
        /&(?:amp|lt|gt|quot|apos|nbsp|lrm|rlm|#[xX]?[0-9a-fA-F]+);/g;
    // A non-global copy, so that test() keeps no lastIndex state.
    const reHasEscapedHtml = RegExp(reEscapedHtml.source);
    // This check is an optimization, since replace always makes a copy
    if (input && reHasEscapedHtml.test(input)) {
      return input.replace(reEscapedHtml, (entity) => {
        if (entity[1] == '#') {
          // Translate this into an HTML character.
          let code = 0;
          if (entity[2] == 'x' || entity[2] == 'X') {
            // It's hex.
            code = parseInt(entity.substring(3), 16);
          } else {
            // It's decimal.
            code = parseInt(entity.substring(2), 10);
          }
          // Ignore it if it's an invalid code point.  A NaN result from
          // parseInt fails both comparisons and is dropped here too.
          if (code >= 0 && code <= 0x10FFFF) {
            return String.fromCodePoint(code);
          } else {
            return '';
          }
        }
        // Every named entity the regex can match has an entry in the
        // dictionary above; the single-quote fallback is purely defensive.
        return htmlUnescapes[entity] || '\'';
      });
    }
    return input || '';
  }
};


/**
 * Lazily-built implementation behind fromCharCode().  On first use it probes
 * for the largest chunk size that String.fromCharCode.apply accepts, then
 * returns a converter that processes its input in chunks of that size.
 *
 * @private {!shaka.util.Lazy.<function(!TypedArray):string>}
 */
shaka.util.StringUtils.fromCharCodeImpl_ = new shaka.util.Lazy(() => {
  /**
   * Checks whether String.fromCharCode.apply can take |size| arguments.
   *
   * @param {number} size
   * @return {boolean}
   */
  const supportsChunkSize = (size) => {
    try {
      // The compiler will complain about suspicious value if this isn't
      // stored in a variable and used.
      const buffer = new Uint8Array(size);

      // This can't use the spread operator, or it blows up on Xbox One.
      // So we use apply() instead, which is normally not allowed.
      // See issue #2186 for more details.
      const foo = String.fromCharCode.apply(null, buffer);
      goog.asserts.assert(foo, 'Should get value');
      return foo.length > 0; // Actually use "foo", so it's not compiled out.
    } catch (error) {
      // Any failure means this chunk size is not usable.
      return false;
    }
  };

  // Different browsers support different chunk sizes; find out the largest
  // this browser supports so we can use larger chunks on supported browsers
  // but still support lower-end devices that require small chunks.
  // 64k is supported on all major desktop browsers.
  // Stop once the size drops below 1: halving further only yields fractional
  // sizes, which create zero-length probe arrays and can never succeed.
  for (let size = 64 * 1024; size >= 1; size /= 2) {
    if (supportsChunkSize(size)) {
      return (buffer) => {
        let ret = '';
        for (let i = 0; i < buffer.length; i += size) {
          const subArray = buffer.subarray(i, i + size);

          // This can't use the spread operator, or it blows up on Xbox One.
          // So we use apply() instead, which is normally not allowed.
          // See issue #2186 for more details.
          ret += String.fromCharCode.apply(null, subArray);  // Issue #2186
        }
        return ret;
      };
    }
  }
  goog.asserts.assert(false, 'Unable to create a fromCharCode method');
  return null;
});