Fix encoding issues with Chinese subtitles

In many places, we tried to guess the encoding of a piece of text.
This guess fails for Chinese UTF-8 text, and probably for text in many
other languages.

However, DASH manifests, TTML files, WebVTT files, and VTTC box
payloads are all specified to be in UTF-8.  Rather than guess and
possibly fail, treat all text in these contexts as UTF-8.

Change-Id: I00c652a9f1dd20855e94abfac84275e41dd9e266
This commit is contained in:
Joey Parrish
2016-08-25 14:09:23 -07:00
parent a81d6c5ce2
commit e4e200388c
7 changed files with 25 additions and 9 deletions
+9 -2
View File
@@ -36,8 +36,15 @@ goog.require('shaka.util.Error');
*/
shaka.util.StringUtils.fromUTF8 = function(data) {
if (!data) return '';
var uint8 = new Uint8Array(data);
// If present, strip off the UTF-8 BOM.
if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
uint8 = uint8.subarray(3);
}
// http://stackoverflow.com/a/13691499
var utf8 = shaka.util.StringUtils.fromCharCode_(new Uint8Array(data));
var utf8 = shaka.util.StringUtils.fromCharCode_(uint8);
// This converts each character in the string to an escape sequence. If the
// character is in the ASCII range, it is not converted; otherwise it is
// converted to a URI escape sequence.
@@ -109,7 +116,7 @@ shaka.util.StringUtils.fromBytesAutoDetect = function(data) {
var uint8 = new Uint8Array(data);
if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf)
return StringUtils.fromUTF8(uint8.subarray(3));
return StringUtils.fromUTF8(uint8);
else if (uint8[0] == 0xfe && uint8[1] == 0xff)
return StringUtils.fromUTF16(uint8.subarray(2), false /* littleEndian */);
else if (uint8[0] == 0xff && uint8[1] == 0xfe)