Browse Source

string_decoder: fix bad utf8 character handling

This commit fixes an issue when extra utf8 continuation bytes appear
at the end of a chunk of data, causing miscalculations to be made
when checking how many bytes are needed to decode a complete
character.

Fixes: https://github.com/nodejs/node/issues/7308
PR-URL: https://github.com/nodejs/node/pull/7310
Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Fedor Indutny <fedor.indutny@gmail.com>
v7.x
Brian White 8 years ago
parent
commit
79ef3b6b5d
No known key found for this signature in database GPG Key ID: 606D7358F94DA209
  1. 75
      lib/string_decoder.js
  2. 2
      test/parallel/test-string-decoder.js

75
lib/string_decoder.js

@ -25,8 +25,10 @@ function StringDecoder(encoding) {
case 'utf16le':
this.text = utf16Text;
this.end = utf16End;
// fall through
nb = 4;
break;
case 'utf8':
this.fillLast = utf8FillLast;
nb = 4;
break;
case 'base64':
@ -68,7 +70,7 @@ StringDecoder.prototype.end = utf8End;
// Returns only complete characters in a Buffer
StringDecoder.prototype.text = utf8Text;
// Attempts to complete a partial character using bytes from a Buffer
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
StringDecoder.prototype.fillLast = function(buf) {
if (this.lastNeed <= buf.length) {
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
@ -92,38 +94,83 @@ function utf8CheckByte(byte) {
return -1;
}
// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
// character, returning the total number of bytes needed to complete the partial
// character (if applicable).
// Checks at most 3 bytes at the end of a Buffer in order to detect an
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
// needed to complete the UTF-8 character (if applicable) are returned.
function utf8CheckIncomplete(self, buf, i) {
var j = buf.length - 1;
if (j < i)
return 0;
var nb = utf8CheckByte(buf[j--]);
var nb = utf8CheckByte(buf[j]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
self.lastNeed = nb - 1;
return nb;
}
if (j < i)
if (--j < i)
return 0;
nb = utf8CheckByte(buf[j--]);
nb = utf8CheckByte(buf[j]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
self.lastNeed = nb - 2;
return nb;
}
if (j < i)
if (--j < i)
return 0;
nb = utf8CheckByte(buf[j--]);
nb = utf8CheckByte(buf[j]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
if (nb > 0) {
if (nb === 2)
nb = 0;
else
self.lastNeed = nb - 3;
}
return nb;
}
return 0;
}
// Validates as many continuation bytes for a multi-byte UTF-8 character as
// needed or are available. If we see a non-continuation byte where we expect
// one, we "replace" the validated continuation bytes we've seen so far with
// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding
// behavior. The continuation byte check is included three times in the case
// where all of the continuation bytes for a character exist in the same buffer.
// It is also done this way as a slight performance increase instead of using a
// loop.
function utf8CheckExtraBytes(self, buf, p) {
if ((buf[0] & 0xC0) !== 0x80) {
self.lastNeed = 0;
return '\ufffd'.repeat(p);
}
if (self.lastNeed > 1 && buf.length > 1) {
if ((buf[1] & 0xC0) !== 0x80) {
self.lastNeed = 1;
return '\ufffd'.repeat(p + 1);
}
if (self.lastNeed > 2 && buf.length > 2) {
if ((buf[2] & 0xC0) !== 0x80) {
self.lastNeed = 2;
return '\ufffd'.repeat(p + 2);
}
}
}
}
// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
function utf8FillLast(buf) {
const p = this.lastTotal - this.lastNeed;
var r = utf8CheckExtraBytes(this, buf, p);
if (r !== undefined)
return r;
if (this.lastNeed <= buf.length) {
buf.copy(this.lastChar, p, 0, this.lastNeed);
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
}
buf.copy(this.lastChar, p, 0, buf.length);
this.lastNeed -= buf.length;
}
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
// partial character, the character's bytes are buffered until the required
// number of bytes are available.

2
test/parallel/test-string-decoder.js

@ -55,7 +55,7 @@ assert.strictEqual(decoder.write(Buffer.from('\ufffd\ufffd\ufffd')),
assert.strictEqual(decoder.end(), '');
decoder = new StringDecoder('utf8');
assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
assert.strictEqual(decoder.write(Buffer.from('EFBFBDE2', 'hex')), '\ufffd');
assert.strictEqual(decoder.end(), '\ufffd');

Loading…
Cancel
Save