From 962ac37e1f07f0ce36266a3156dced149b6387d1 Mon Sep 17 00:00:00 2001 From: Brian White Date: Wed, 15 Jun 2016 12:42:52 -0400 Subject: [PATCH] string_decoder: fix bad utf8 character handling This commit fixes an issue when extra utf8 continuation bytes appear at the end of a chunk of data, causing miscalculations to be made when checking how many bytes are needed to decode a complete character. Fixes: https://github.com/nodejs/node/issues/7308 PR-URL: https://github.com/nodejs/node/pull/7310 Reviewed-By: Colin Ihrig Reviewed-By: James M Snell Reviewed-By: Fedor Indutny --- lib/string_decoder.js | 75 ++++++++++++++++++++++------ test/parallel/test-string-decoder.js | 2 +- 2 files changed, 62 insertions(+), 15 deletions(-) diff --git a/lib/string_decoder.js b/lib/string_decoder.js index aaadfd8934..2e9b57d681 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -45,8 +45,10 @@ function StringDecoder(encoding) { case 'utf16le': this.text = utf16Text; this.end = utf16End; - // fall through + nb = 4; + break; case 'utf8': + this.fillLast = utf8FillLast; nb = 4; break; case 'base64': @@ -88,7 +90,7 @@ StringDecoder.prototype.end = utf8End; // Returns only complete characters in a Buffer StringDecoder.prototype.text = utf8Text; -// Attempts to complete a partial character using bytes from a Buffer +// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer StringDecoder.prototype.fillLast = function(buf) { if (this.lastNeed <= buf.length) { buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed); @@ -112,38 +114,83 @@ function utf8CheckByte(byte) { return -1; } -// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8 -// character, returning the total number of bytes needed to complete the partial -// character (if applicable). +// Checks at most 3 bytes at the end of a Buffer in order to detect an +// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4) +// needed to complete the UTF-8 character (if applicable) are returned. function utf8CheckIncomplete(self, buf, i) { var j = buf.length - 1; if (j < i) return 0; - var nb = utf8CheckByte(buf[j--]); + var nb = utf8CheckByte(buf[j]); if (nb >= 0) { if (nb > 0) - self.lastNeed = nb + 1 - (buf.length - j); + self.lastNeed = nb - 1; return nb; } - if (j < i) + if (--j < i) return 0; - nb = utf8CheckByte(buf[j--]); + nb = utf8CheckByte(buf[j]); if (nb >= 0) { if (nb > 0) - self.lastNeed = nb + 1 - (buf.length - j); + self.lastNeed = nb - 2; return nb; } - if (j < i) + if (--j < i) return 0; - nb = utf8CheckByte(buf[j--]); + nb = utf8CheckByte(buf[j]); if (nb >= 0) { - if (nb > 0) - self.lastNeed = nb + 1 - (buf.length - j); + if (nb > 0) { + if (nb === 2) + nb = 0; + else + self.lastNeed = nb - 3; + } return nb; } return 0; } +// Validates as many continuation bytes for a multi-byte UTF-8 character as +// needed or are available. If we see a non-continuation byte where we expect +// one, we "replace" the validated continuation bytes we've seen so far with +// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding +// behavior. The continuation byte check is included three times in the case +// where all of the continuation bytes for a character exist in the same buffer. +// It is also done this way as a slight performance increase instead of using a +// loop. +function utf8CheckExtraBytes(self, buf, p) { + if ((buf[0] & 0xC0) !== 0x80) { + self.lastNeed = 0; + return '\ufffd'.repeat(p); + } + if (self.lastNeed > 1 && buf.length > 1) { + if ((buf[1] & 0xC0) !== 0x80) { + self.lastNeed = 1; + return '\ufffd'.repeat(p + 1); + } + if (self.lastNeed > 2 && buf.length > 2) { + if ((buf[2] & 0xC0) !== 0x80) { + self.lastNeed = 2; + return '\ufffd'.repeat(p + 2); + } + } + } +} + +// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer. +function utf8FillLast(buf) { + const p = this.lastTotal - this.lastNeed; + var r = utf8CheckExtraBytes(this, buf, p); + if (r !== undefined) + return r; + if (this.lastNeed <= buf.length) { + buf.copy(this.lastChar, p, 0, this.lastNeed); + return this.lastChar.toString(this.encoding, 0, this.lastTotal); + } + buf.copy(this.lastChar, p, 0, buf.length); + this.lastNeed -= buf.length; +} + // Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a // partial character, the character's bytes are buffered until the required // number of bytes are available. diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js index 14933c46fc..7f1a47abcc 100644 --- a/test/parallel/test-string-decoder.js +++ b/test/parallel/test-string-decoder.js @@ -55,7 +55,7 @@ assert.strictEqual(decoder.write(Buffer.from('\ufffd\ufffd\ufffd')), assert.strictEqual(decoder.end(), ''); decoder = new StringDecoder('utf8'); -assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd'); +assert.strictEqual(decoder.write(Buffer.from('EFBFBDE2', 'hex')), '\ufffd'); assert.strictEqual(decoder.end(), '\ufffd');