shaka-player/lib/util/string_utils.js

/**
 * @license
 * Copyright 2016 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

goog.provide('shaka.util.StringUtils');

goog.require('shaka.log');
goog.require('shaka.util.Error');


/**
 * @namespace shaka.util.StringUtils
 * @summary A set of string utility functions.
 * @exportDoc
 */


/**
 * Creates a string from the given buffer as UTF-8 encoding.
 *
 * @param {?BufferSource} data
 * @return {string}
 * @throws {shaka.util.Error}
 * @export
 */
shaka.util.StringUtils.fromUTF8 = function(data) {
  if (!data) {
    return '';
  }

  let uint8 = new Uint8Array(data);
  // If present, strip off the UTF-8 BOM.
  if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
    uint8 = uint8.subarray(3);
  }

  // http://stackoverflow.com/a/13691499
  const utf8 = shaka.util.StringUtils.fromCharCode(uint8);
  // This converts each character in the string to an escape sequence.  If the
  // character is in the ASCII range, it is not converted; otherwise it is
  // converted to a URI escape sequence.
  // Example: '\x67\x35\xe3\x82\xac' -> 'g#%E3%82%AC'
  const escaped = escape(utf8);
  // Decode the escaped sequence.  This will interpret UTF-8 sequences into the
  // correct character.
  // Example: 'g#%E3%82%AC' -> 'g#€'
  try {
    return decodeURIComponent(escaped);
  } catch (e) {
    throw new shaka.util.Error(
        shaka.util.Error.Severity.CRITICAL, shaka.util.Error.Category.TEXT,
        shaka.util.Error.Code.BAD_ENCODING);
  }
};


/**
 * Creates a string from the given buffer as UTF-16 encoding.
 *
 * @param {?BufferSource} data
 * @param {boolean} littleEndian true to read little endian, false to read big.
 * @param {boolean=} noThrow true to avoid throwing in cases where we may
 *     expect invalid input.  If noThrow is true and the data has an odd length,
 *     it will be truncated.
 * @return {string}
 * @throws {shaka.util.Error}
 * @export
 */
shaka.util.StringUtils.fromUTF16 = function(data, littleEndian, noThrow) {
  if (!data) {
    return '';
  }

  if (!noThrow && data.byteLength % 2 != 0) {
    shaka.log.error('Data has an incorrect length, must be even.');
    throw new shaka.util.Error(
        shaka.util.Error.Severity.CRITICAL, shaka.util.Error.Category.TEXT,
        shaka.util.Error.Code.BAD_ENCODING);
  }

  /** @type {ArrayBuffer} */
  let buffer;
  if (data instanceof ArrayBuffer) {
    buffer = data;
  } else {
    // Have to create a new buffer because the argument may be a smaller
    // view on a larger ArrayBuffer.  We cannot use an ArrayBufferView in
    // a DataView.
    const temp = new Uint8Array(data.byteLength);
    temp.set(new Uint8Array(data));
    buffer = temp.buffer;
  }

  // Use a DataView to ensure correct endianness.
  const length = Math.floor(data.byteLength / 2);
  const arr = new Uint16Array(length);
  const dataView = new DataView(buffer);
  for (let i = 0; i < length; i++) {
    arr[i] = dataView.getUint16(i * 2, littleEndian);
  }
  return shaka.util.StringUtils.fromCharCode(arr);
};


/**
 * Creates a string from the given buffer, auto-detecting the encoding that is
 * being used.  If it cannot detect the encoding, it will throw an exception.
 *
 * @param {?BufferSource} data
 * @return {string}
 * @throws {shaka.util.Error}
 * @export
 */
shaka.util.StringUtils.fromBytesAutoDetect = function(data) {
  const StringUtils = shaka.util.StringUtils;

  const uint8 = new Uint8Array(data);
  if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
    return StringUtils.fromUTF8(uint8);
  } else if (uint8[0] == 0xfe && uint8[1] == 0xff) {
    return StringUtils.fromUTF16(uint8.subarray(2), false /* littleEndian */);
  } else if (uint8[0] == 0xff && uint8[1] == 0xfe) {
    return StringUtils.fromUTF16(uint8.subarray(2), true /* littleEndian */);
  }

  const isAscii = (function(arr, i) {
    // arr[i] >= ' ' && arr[i] <= '~';
    return arr.byteLength <= i || (arr[i] >= 0x20 && arr[i] <= 0x7e);
  }.bind(null, uint8));

  shaka.log.debug('Unable to find byte-order-mark, making an educated guess.');
  if (uint8[0] == 0 && uint8[2] == 0) {
    return StringUtils.fromUTF16(data, false /* littleEndian */);
  } else if (uint8[1] == 0 && uint8[3] == 0) {
    return StringUtils.fromUTF16(data, true /* littleEndian */);
  } else if (isAscii(0) && isAscii(1) && isAscii(2) && isAscii(3)) {
    return StringUtils.fromUTF8(data);
  }

  throw new shaka.util.Error(
      shaka.util.Error.Severity.CRITICAL,
      shaka.util.Error.Category.TEXT,
      shaka.util.Error.Code.UNABLE_TO_DETECT_ENCODING);
};


/**
 * Creates a ArrayBuffer from the given string, converting to UTF-8 encoding.
 *
 * @param {string} str
 * @return {!ArrayBuffer}
 * @export
 */
shaka.util.StringUtils.toUTF8 = function(str) {
  // http://stackoverflow.com/a/13691499
  // Converts the given string to a URI encoded string.  If a character falls
  // in the ASCII range, it is not converted; otherwise it will be converted to
  // a series of URI escape sequences according to UTF-8.
  // Example: 'g#€' -> 'g#%E3%82%AC'
  const encoded = encodeURIComponent(str);
  // Convert each escape sequence individually into a character.  Each escape
  // sequence is interpreted as a code-point, so if an escape sequence happens
  // to be part of a multi-byte sequence, each byte will be converted to a
  // single character.
  // Example: 'g#%E3%82%AC' -> '\x67\x35\xe3\x82\xac'
  const utf8 = unescape(encoded);

  const result = new Uint8Array(utf8.length);
  for (let i = 0; i < utf8.length; ++i) {
    result[i] = utf8.charCodeAt(i);
  }
  return result.buffer;
};


/**
 * Creates a ArrayBuffer from the given string, converting to UTF-16 encoding.
 *
 * @param {string} str
 * @param {boolean} littleEndian
 * @return {!ArrayBuffer}
 * @export
 */
shaka.util.StringUtils.toUTF16 = function(str, littleEndian) {
  const result = new Uint8Array(str.length * 2);
  const view = new DataView(result.buffer);
  for (let i = 0; i < str.length; ++i) {
    const value = str.charCodeAt(i);
    view.setUint16(/* position= */ i * 2, value, littleEndian);
  }
  return result.buffer;
};


/**
 * Creates a new string from the given array of char codes.
 *
 * Using String.fromCharCode.apply is risky because you can trigger stack errors
 * on very large arrays.  This breaks up the array into several pieces to avoid
 * this.
 *
 * @param {!TypedArray} array
 * @return {string}
 */
shaka.util.StringUtils.fromCharCode = function(array) {
  const max = 16000;
  let ret = '';
  for (let i = 0; i < array.length; i += max) {
    const subArray = array.subarray(i, i + max);
    ret += String.fromCharCode.apply(null, subArray);
  }

  return ret;
};