diff --git a/lib/string_decoder.js b/lib/string_decoder.js index 6e730c2fbd..879e590647 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -21,22 +21,32 @@ var StringDecoder = exports.StringDecoder = function(encoding) { this.encoding = (encoding || 'utf8').toLowerCase().replace(/[-_]/, ''); - if (this.encoding === 'utf8') { - this.charBuffer = new Buffer(6); - this.charReceived = 0; - this.charLength = 0; + switch (this.encoding) { + case 'utf8': + // CESU-8 represents each of Surrogate Pair by 3-bytes + this.surrogateSize = 3; + break; + case 'ucs2': + case 'utf16le': + // UTF-16 represents each of Surrogate Pair by 2-bytes + this.surrogateSize = 2; + this.detectIncompleteChar = utf16DetectIncompleteChar; + break; + default: + this.write = passThroughWrite; + return; } + + this.charBuffer = new Buffer(6); + this.charReceived = 0; + this.charLength = 0; }; StringDecoder.prototype.write = function(buffer) { - // If not utf8... - if (this.encoding !== 'utf8') { - return buffer.toString(this.encoding); - } - var charStr = ''; var offset = 0; + // if our last write ended with an incomplete multibyte character while (this.charLength) { // determine how many remaining bytes this buffer has to offer for this char @@ -55,16 +65,14 @@ StringDecoder.prototype.write = function(buffer) { } // get the character that was split - charStr = this.charBuffer.slice(0, this.charLength).toString(); + charStr = this.charBuffer.slice(0, this.charLength).toString(this.encoding); // lead surrogate (D800-DBFF) is also the incomplete character - if (this.charLength === 3) { - var charCode = charStr.charCodeAt(0); - if (charCode >= 0xD800 && charCode <= 0xDBFF) { - charStr = ''; - this.charLength += 3; // size of trail surrogate (DC00-DFFF) - continue; - } + var charCode = charStr.charCodeAt(charStr.length - 1); + if (charCode >= 0xD800 && charCode <= 0xDBFF) { + this.charLength += this.surrogateSize; + charStr = ''; + continue; } this.charReceived = this.charLength = 0; @@ 
-76,7 +84,35 @@ StringDecoder.prototype.write = function(buffer) { break; } + var lenIncomplete = this.detectIncompleteChar(buffer); + + var end = buffer.length; + if (this.charLength) { + // buffer the incomplete character bytes we got + buffer.copy(this.charBuffer, 0, buffer.length - lenIncomplete, end); + this.charReceived = lenIncomplete; + end -= lenIncomplete; + } + + charStr += buffer.toString(this.encoding, 0, end); + + var end = charStr.length - 1; + var charCode = charStr.charCodeAt(end); + // lead surrogate (D800-DBFF) is also the incomplete character + if (charCode >= 0xD800 && charCode <= 0xDBFF) { + var size = this.surrogateSize; + this.charLength += size; + this.charReceived += size; + this.charBuffer.copy(this.charBuffer, size, 0, size); + this.charBuffer.write(charStr.charAt(charStr.length - 1), this.encoding); + return charStr.substring(0, end); + } + + // or just emit the charStr + return charStr; +}; +StringDecoder.prototype.detectIncompleteChar = function(buffer) { // determine how many bytes we have to check at the end of this buffer var i = (buffer.length >= 3) ? 
3 : buffer.length; @@ -106,28 +142,15 @@ StringDecoder.prototype.write = function(buffer) { } } - var end = buffer.length; - if (this.charLength) { - // buffer the incomplete character bytes we got - buffer.copy(this.charBuffer, 0, buffer.length - i, buffer.length); - this.charReceived = i; - end -= i; - } - - charStr += buffer.toString('utf8', 0, end); + return i; +}; - // lead surrogate (D800-DBFF) is also the incomplete character - end = charStr.length - 1; - var charCode = charStr.charCodeAt(end); - if (charCode >= 0xD800 && charCode <= 0xDBFF) { - // CESU-8 represents each of Surrogate Pair by 3-bytes - this.charLength += 3 - this.charReceived += 3 - this.charBuffer.copy(this.charBuffer, 3, 0, 3); - this.charBuffer.write(charStr.charAt(end)); - return charStr.substring(0, end); - } +function passThroughWrite(buffer) { + return buffer.toString(this.encoding); +} - // or just emit the charStr - return charStr; -}; +function utf16DetectIncompleteChar(buffer) { + var incomplete = this.charReceived = buffer.length % 2; + this.charLength = incomplete ? 
2 : 0; + return incomplete; +} diff --git a/test/simple/test-string-decoder.js b/test/simple/test-string-decoder.js index 3f9dfff487..0da66f6760 100644 --- a/test/simple/test-string-decoder.js +++ b/test/simple/test-string-decoder.js @@ -89,6 +89,42 @@ s += decoder.write(buffer.slice(0, 6)); assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16) +// UCS-2 +decoder = new StringDecoder('ucs2'); +buffer = new Buffer('ab', 'ucs2'); +assert.equal(decoder.write(buffer), 'ab'); // 2 complete chars +buffer = new Buffer('abc', 'ucs2'); +assert.equal(decoder.write(buffer.slice(0, 3)), 'a'); // 'a' and first of 'b' +assert.equal(decoder.write(buffer.slice(3, 6)), 'bc'); // second of 'b' and 'c' + + +// UTF-16LE +buffer = new Buffer('3DD84DDC', 'hex'); // THUMBS UP SIGN (in UTF-16LE) +var s = ''; +s += decoder.write(buffer.slice(0, 1)); +s += decoder.write(buffer.slice(1, 2)); // complete lead surrogate +assert.equal(s, ''); +s += decoder.write(buffer.slice(2, 3)); +s += decoder.write(buffer.slice(3, 4)); // complete trail surrogate +assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16) + +var s = ''; +s += decoder.write(buffer.slice(0, 2)); // complete lead surrogate +assert.equal(s, ''); +s += decoder.write(buffer.slice(2, 4)); // complete trail surrogate +assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16) + +var s = ''; +s += decoder.write(buffer.slice(0, 3)); // lead surrogate and first byte of trail surrogate +assert.equal(s, ''); +s += decoder.write(buffer.slice(3, 4)); // complete trail surrogate +assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16) + +var s = ''; +s += decoder.write(buffer.slice(0, 4)); +assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16) + + // A mixed ascii and non-ascii string // Test stolen from deps/v8/test/cctest/test-strings.cc // U+02E4 -> CB A4