Fix encoding issues with Chinese subs

In many places we tried to guess the encoding of a piece of text. This guess fails for Chinese UTF-8 text, and probably for text in many other languages. However, DASH manifests, TTML files, WebVTT files, and VTTC box payloads are all specified to be in UTF-8. Rather than guess and possibly fail, treat all text in these contexts as UTF-8.

Change-Id: I00c652a9f1dd20855e94abfac84275e41dd9e266
2026-06-16 16:16:40 +03:00 · 2016-08-25 14:09:23 -07:00
parent a81d6c5ce2
commit e4e200388c
7 changed files with 25 additions and 9 deletions
@@ -36,8 +36,15 @@ goog.require('shaka.util.Error');
 */
 shaka.util.StringUtils.fromUTF8 = function(data) {
  if (!data) return '';
+
+  var uint8 = new Uint8Array(data);
+  // If present, strip off the UTF-8 BOM.
+  if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
+    uint8 = uint8.subarray(3);
+  }
+
  // http://stackoverflow.com/a/13691499
-  var utf8 = shaka.util.StringUtils.fromCharCode_(new Uint8Array(data));
+  var utf8 = shaka.util.StringUtils.fromCharCode_(uint8);
  // This converts each character in the string to an escape sequence.  If the
  // character is in the ASCII range, it is not converted; otherwise it is
  // converted to a URI escape sequence.
@@ -109,7 +116,7 @@ shaka.util.StringUtils.fromBytesAutoDetect = function(data) {

  var uint8 = new Uint8Array(data);
  if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf)
-    return StringUtils.fromUTF8(uint8.subarray(3));
+    return StringUtils.fromUTF8(uint8);
  else if (uint8[0] == 0xfe && uint8[1] == 0xff)
    return StringUtils.fromUTF16(uint8.subarray(2), false /* littleEndian */);
  else if (uint8[0] == 0xff && uint8[1] == 0xfe)