Browse Source

string_bytes: Guarantee valid utf-8 output

Previously, V8's WriteUtf8 function would produce invalid UTF-8 output
when encountering unmatched surrogate code units [1]. The new
REPLACE_INVALID_UTF8 option fixes that by replacing invalid code points
with the Unicode replacement character (U+FFFD). The option is enabled
by default unless the NODE_INVALID_UTF8 environment variable is set.

[1]: JS strings are defined as arrays of 16-bit unsigned integers. There
is no Unicode enforcement, so one can easily end up with invalid Unicode
code unit sequences inside a string.
v0.10.29-release
Felix Geisendörfer 11 years ago
committed by Timothy J Fontaine
parent
commit
0da4c67165
  1. 7
      src/node.cc
  2. 2
      src/string_bytes.cc
  3. 2
      src/string_bytes.h
  4. 12
      test/simple/test-buffer.js

7
src/node.cc

@ -176,6 +176,8 @@ static uv_async_t dispatch_debug_messages_async;
// Declared in node_internals.h // Declared in node_internals.h
Isolate* node_isolate = NULL; Isolate* node_isolate = NULL;
int WRITE_UTF8_FLAGS = v8::String::HINT_MANY_WRITES_EXPECTED |
v8::String::NO_NULL_TERMINATION;
static void Spin(uv_idle_t* handle, int status) { static void Spin(uv_idle_t* handle, int status) {
assert((uv_idle_t*) handle == &tick_spinner); assert((uv_idle_t*) handle == &tick_spinner);
@ -3042,6 +3044,11 @@ static char **copy_argv(int argc, char **argv) {
} }
int Start(int argc, char *argv[]) { int Start(int argc, char *argv[]) {
const char* replaceInvalid = getenv("NODE_INVALID_UTF8");
if (replaceInvalid == NULL)
WRITE_UTF8_FLAGS |= String::REPLACE_INVALID_UTF8;
// Hack aroung with the argv pointer. Used for process.title = "blah". // Hack aroung with the argv pointer. Used for process.title = "blah".
argv = uv_setup_args(argc, argv); argv = uv_setup_args(argc, argv);

2
src/string_bytes.cc

@ -199,7 +199,7 @@ size_t StringBytes::Write(char* buf,
break; break;
case UTF8: case UTF8:
len = str->WriteUtf8(buf, buflen, chars_written, flags); len = str->WriteUtf8(buf, buflen, chars_written, WRITE_UTF8_FLAGS);
break; break;
case UCS2: case UCS2:

2
src/string_bytes.h

@ -29,6 +29,8 @@
namespace node { namespace node {
extern int WRITE_UTF8_FLAGS;
using v8::Handle; using v8::Handle;
using v8::Local; using v8::Local;
using v8::String; using v8::String;

12
test/simple/test-buffer.js

@ -791,6 +791,18 @@ assert.equal(buf[3], 0xFF);
assert.equal(buf[3], 0xFF); assert.equal(buf[3], 0xFF);
}); });
// test unmatched surrogates not producing invalid utf8 output
// ef bf bd = utf-8 representation of unicode replacement character
// see https://codereview.chromium.org/121173009/
buf = new Buffer('ab\ud800cd', 'utf8');
assert.equal(buf[0], 0x61);
assert.equal(buf[1], 0x62);
assert.equal(buf[2], 0xef);
assert.equal(buf[3], 0xbf);
assert.equal(buf[4], 0xbd);
assert.equal(buf[5], 0x63);
assert.equal(buf[6], 0x64);
// test for buffer overrun // test for buffer overrun
buf = new Buffer([0, 0, 0, 0, 0]); // length: 5 buf = new Buffer([0, 0, 0, 0, 0]); // length: 5
var sub = buf.slice(0, 4); // length: 4 var sub = buf.slice(0, 4); // length: 4

Loading…
Cancel
Save