diff --git a/lib/dash/dash_parser.js b/lib/dash/dash_parser.js index 431419160..b298d828c 100644 --- a/lib/dash/dash_parser.js +++ b/lib/dash/dash_parser.js @@ -423,7 +423,7 @@ shaka.dash.DashParser.prototype.parseManifest_ = var Functional = shaka.util.Functional; var XmlUtils = shaka.util.XmlUtils; - var string = shaka.util.StringUtils.fromBytesAutoDetect(data); + var string = shaka.util.StringUtils.fromUTF8(data); var parser = new DOMParser(); var xml = null; var mpd = null; @@ -1174,7 +1174,7 @@ shaka.dash.DashParser.prototype.requestForTiming_ = text = response.headers['date']; } else { - text = shaka.util.StringUtils.fromBytesAutoDetect(response.data); + text = shaka.util.StringUtils.fromUTF8(response.data); } var date = Date.parse(text); diff --git a/lib/media/mp4_vtt_parser.js b/lib/media/mp4_vtt_parser.js index 11155edb7..6f70bc6b1 100644 --- a/lib/media/mp4_vtt_parser.js +++ b/lib/media/mp4_vtt_parser.js @@ -121,7 +121,7 @@ shaka.media.Mp4VttParser.parseCue_ = function( var startPosition = reader.getPosition(); var size = reader.readUint32(); var type = reader.readUint32(); - var content = shaka.util.StringUtils.fromBytesAutoDetect( + var content = shaka.util.StringUtils.fromUTF8( reader.readBytes(size - 8).buffer); if (size == 1) { size = reader.readUint64(); diff --git a/lib/media/ttml_text_parser.js b/lib/media/ttml_text_parser.js index c9a261ea9..3dd62cfc2 100644 --- a/lib/media/ttml_text_parser.js +++ b/lib/media/ttml_text_parser.js @@ -33,7 +33,7 @@ goog.require('shaka.util.StringUtils'); * @throws {shaka.util.Error} */ shaka.media.TtmlTextParser = function(data, segmentStartTime, segmentEndTime) { - var str = shaka.util.StringUtils.fromBytesAutoDetect(data); + var str = shaka.util.StringUtils.fromUTF8(data); var ret = []; var parser = new DOMParser(); var xml = null; diff --git a/lib/media/vtt_text_parser.js b/lib/media/vtt_text_parser.js index 197c8f8b7..f03a4f607 100644 --- a/lib/media/vtt_text_parser.js +++ b/lib/media/vtt_text_parser.js @@ 
-34,7 +34,7 @@ goog.require('shaka.util.TextParser'); */ shaka.media.VttTextParser = function(data, segmentStartTime, segmentEndTime) { // Get the input as a string. Normalize newlines to \n. - var str = shaka.util.StringUtils.fromBytesAutoDetect(data); + var str = shaka.util.StringUtils.fromUTF8(data); str = str.replace(/\r\n|\r(?=[^\n]|$)/gm, '\n'); var blocks = str.split(/\n{2,}/m); diff --git a/lib/util/data_view_reader.js b/lib/util/data_view_reader.js index 1ff68408f..8b0b4b2d6 100644 --- a/lib/util/data_view_reader.js +++ b/lib/util/data_view_reader.js @@ -197,7 +197,8 @@ shaka.util.DataViewReader.prototype.skip = function(bytes) { /** - * Keeps reading until it reaches a byte that equals to zero. + * Keeps reading until it reaches a byte that equals to zero. The text is + * assumed to be UTF-8. * @return {string} * @throws {shaka.util.Error} when reading past the end of the data view. */ @@ -216,7 +217,7 @@ shaka.util.DataViewReader.prototype.readTerminatedString = function() { var ret = this.dataView_.buffer.slice(start, this.position_); // skip string termination this.position_ += 1; - return shaka.util.StringUtils.fromBytesAutoDetect(ret); + return shaka.util.StringUtils.fromUTF8(ret); }; diff --git a/lib/util/string_utils.js b/lib/util/string_utils.js index 64f09bee6..5c1d26e79 100644 --- a/lib/util/string_utils.js +++ b/lib/util/string_utils.js @@ -36,8 +36,15 @@ goog.require('shaka.util.Error'); */ shaka.util.StringUtils.fromUTF8 = function(data) { if (!data) return ''; + + var uint8 = new Uint8Array(data); + // If present, strip off the UTF-8 BOM. + if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) { + uint8 = uint8.subarray(3); + } + // http://stackoverflow.com/a/13691499 - var utf8 = shaka.util.StringUtils.fromCharCode_(new Uint8Array(data)); + var utf8 = shaka.util.StringUtils.fromCharCode_(uint8); // This converts each character in the string to an escape sequence. 
If the // character is in the ASCII range, it is not converted; otherwise it is // converted to a URI escape sequence. @@ -109,7 +116,7 @@ shaka.util.StringUtils.fromBytesAutoDetect = function(data) { var uint8 = new Uint8Array(data); if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) - return StringUtils.fromUTF8(uint8.subarray(3)); + return StringUtils.fromUTF8(uint8); else if (uint8[0] == 0xfe && uint8[1] == 0xff) return StringUtils.fromUTF16(uint8.subarray(2), false /* littleEndian */); else if (uint8[0] == 0xff && uint8[1] == 0xfe) diff --git a/test/util/string_utils_unit.js b/test/util/string_utils_unit.js index 3ddc6e87c..2834a6fb0 100644 --- a/test/util/string_utils_unit.js +++ b/test/util/string_utils_unit.js @@ -30,6 +30,14 @@ describe('StringUtils', function() { expect(StringUtils.fromUTF8(buffer)).toBe('F\u20ac \ud800\udf48'); }); + it('strips the BOM in fromUTF8', function() { + // This is the UTF-8 BOM (0xef 0xbb 0xbf) followed by the 4 ASCII + // characters of 'text'; the BOM must not appear in the decoded output. + var arr = [0xef, 0xbb, 0xbf, 0x74, 0x65, 0x78, 0x74]; + var buffer = new Uint8Array(arr).buffer; + expect(StringUtils.fromUTF8(buffer)).toBe('text'); + }); + it('parses fromUTF16 big-endian', function() { // This is big-endian pairs of 16-bit numbers. This translates into 3 // Unicode characters where the last is split into a surrogate pair.