From caba9c70c34c1f517ae2fe20a7425cc20c0800fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20Geisend=C3=B6rfer?=
Date: Mon, 17 May 2010 22:25:51 -0400
Subject: [PATCH] Implemented Utf8Decoder module

Allows to safely decode a utf8 stream into strings without breaking on
multibyte characters.
---
 lib/utf8decoder.js               | 106 ++++++++++++++++++++++++++++++
 src/node.cc                      |   1 +
 test/simple/test-utf8-decoder.js |  82 +++++++++++++++++++++++
 3 files changed, 189 insertions(+)
 create mode 100644 lib/utf8decoder.js
 create mode 100644 test/simple/test-utf8-decoder.js

diff --git a/lib/utf8decoder.js b/lib/utf8decoder.js
new file mode 100644
--- /dev/null
+++ b/lib/utf8decoder.js
@@ -0,0 +1,106 @@
+var Buffer = require('buffer').Buffer;
+
+// Incrementally decodes a stream of utf8 bytes into strings. The bytes of a
+// multibyte character that is split across several write() calls are held
+// back until the character is complete.
+//
+// The decoder does not define onString() itself: assign an onString(str)
+// callback to the instance before calling write().
+var Utf8Decoder = exports.Utf8Decoder = function() {
+  // bytes of a character whose end has not been received yet
+  this.charBuffer = new Buffer(4);
+  // how many bytes of that character have been buffered so far
+  this.charReceived = 0;
+  // total byte length of that character, 0 while no character is pending
+  this.charLength = 0;
+};
+
+// Decodes `buffer` and hands the resulting string to this.onString().
+// An incomplete multibyte character at the end of `buffer` is not emitted;
+// its bytes are kept and completed by the following write() calls.
+// Returns nothing. onString() is not called when `buffer` only contains
+// bytes of a character that is still incomplete afterwards.
+Utf8Decoder.prototype.write = function(buffer) {
+  var charStr = '';
+  var i, c;
+
+  // if our last write ended with an incomplete multibyte character
+  if (this.charLength) {
+    // determine how many remaining bytes this buffer has to offer for this char
+    i = (buffer.length >= this.charLength - this.charReceived)
+      ? this.charLength - this.charReceived
+      : buffer.length;
+
+    // add the new bytes to the char buffer
+    buffer.copy(this.charBuffer, this.charReceived, 0, i);
+    this.charReceived += i;
+
+    if (this.charReceived < this.charLength) {
+      // still not enough bytes in this buffer? wait for more ...
+      return;
+    }
+
+    // get the character that was split
+    charStr = this.charBuffer.slice(0, this.charLength).toString();
+    this.charReceived = this.charLength = 0;
+
+    if (i === buffer.length) {
+      // if there are no more bytes in this buffer, just emit our char
+      this.onString(charStr);
+      return;
+    }
+
+    // otherwise cut off the character's end from the beginning of this buffer
+    buffer = buffer.slice(i, buffer.length);
+  }
+
+  // determine how many bytes we have to check at the end of this buffer
+  i = (buffer.length >= 3)
+    ? 3
+    : buffer.length;
+
+  // figure out if one of the last i bytes of our buffer announces an
+  // incomplete char; when the loop breaks, i is the number of trailing
+  // bytes that belong to it, otherwise i ends up as 0
+  for (; i > 0; i--) {
+    c = buffer[buffer.length - i];
+
+    // See http://en.wikipedia.org/wiki/UTF-8#Description
+
+    // 110XXXXX starts a 2 byte char, incomplete if it is the last byte
+    if (i === 1 && (c >> 5) === 0x06) {
+      this.charLength = 2;
+      break;
+    }
+
+    // 1110XXXX starts a 3 byte char, incomplete within the last 2 bytes
+    if (i <= 2 && (c >> 4) === 0x0E) {
+      this.charLength = 3;
+      break;
+    }
+
+    // 11110XXX starts a 4 byte char, incomplete within the last 3 bytes
+    if (i <= 3 && (c >> 3) === 0x1E) {
+      this.charLength = 4;
+      break;
+    }
+  }
+
+  if (!this.charLength) {
+    // no incomplete char at the end of this buffer, emit the whole thing
+    this.onString(charStr + buffer.toString());
+    return;
+  }
+
+  // buffer the incomplete character bytes we got
+  buffer.copy(this.charBuffer, 0, buffer.length - i, buffer.length);
+  this.charReceived = i;
+
+  if (buffer.length - i > 0) {
+    // buffer had more bytes before the incomplete char, emit them
+    this.onString(charStr + buffer.slice(0, buffer.length - i).toString());
+  } else if (charStr) {
+    // or just emit the charStr if any
+    this.onString(charStr);
+  }
+};
diff --git a/src/node.cc b/src/node.cc
index d22057930f..4cff11bd85 100644
--- a/src/node.cc
+++ b/src/node.cc
@@ -1820,6 +1820,7 @@ static Handle Binding(const Arguments& args) {
       exports->Set(String::New("utils"), String::New(native_utils));
       exports->Set(String::New("path"), String::New(native_path));
       exports->Set(String::New("module"), String::New(native_module));
+      exports->Set(String::New("utf8decoder"), String::New(native_utf8decoder));
       binding_cache->Set(module, exports);
     }
 
diff --git a/test/simple/test-utf8-decoder.js b/test/simple/test-utf8-decoder.js
new file mode 100644
--- /dev/null
+++ b/test/simple/test-utf8-decoder.js
@@ -0,0 +1,82 @@
+// assert, print and puts are not defined in this file; presumably they are
+// globals provided by require('../common') - TODO confirm
+require('../common');
+var Utf8Decoder = require('utf8decoder').Utf8Decoder,
+    Buffer = require('buffer').Buffer,
+    decoder = new Utf8Decoder(),
+    buffer,
+    onStringCalled = 0,
+    expected,
+    charLengths;
+
+// every character below has to arrive as exactly one onString() call,
+// no matter how many writes its bytes were spread over
+decoder.onString = function(str) {
+  onStringCalled++;
+  assert.deepEqual(str, buffer.toString());
+};
+
+// 1 byte
+buffer = new Buffer('$');
+decoder.write(buffer);
+assert.equal(onStringCalled, 1);
+
+// 2 bytes
+buffer = new Buffer('¢');
+decoder.write(buffer.slice(0, 1));
+decoder.write(buffer.slice(1, 2));
+assert.equal(onStringCalled, 2);
+
+// 3 bytes
+buffer = new Buffer('€');
+decoder.write(buffer.slice(0, 1));
+decoder.write(buffer.slice(1, 2));
+decoder.write(buffer.slice(2, 3));
+assert.equal(onStringCalled, 3);
+
+// 4 bytes
+buffer = new Buffer([0xF0, 0xA4, 0xAD, 0xA2]);
+decoder.write(buffer.slice(0, 1));
+decoder.write(buffer.slice(1, 2));
+decoder.write(buffer.slice(2, 3));
+decoder.write(buffer.slice(3, 4));
+assert.equal(onStringCalled, 4);
+
+// A mixed ascii and non-ascii string
+// Test stolen from deps/v8/test/cctest/test-strings.cc
+// U+02E4 -> CB A4
+// U+0064 -> 64
+// U+12E4 -> E1 8B A4
+// U+0030 -> 30
+// U+3045 -> E3 81 85
+expected = "\u02e4\u0064\u12e4\u0030\u3045";
+buffer = new Buffer([0xCB, 0xA4, 0x64, 0xE1, 0x8B, 0xA4, 0x30, 0xE3, 0x81, 0x85]);
+// charLengths[n]: number of characters that are complete after the first
+// n bytes of buffer
+charLengths = [0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 5, 5];
+
+// Split the buffer into 3 segments
+// |----|------|-------|
+// 0    i      j       buffer.length
+// Scan through every possible 3 segment combination
+// and make sure that the string is always parsed.
+print('scanning ');
+for (var j = 2; j < buffer.length; j++) {
+  for (var i = 1; i < j; i++) {
+    decoder = new Utf8Decoder();
+    var sum = "";
+    decoder.onString = function (s) { sum += s; };
+
+    decoder.write(buffer.slice(0, i));
+
+    // just check that we've received the right amount
+    // after the first write
+    assert.equal(charLengths[i], sum.length);
+
+    decoder.write(buffer.slice(i, j));
+    decoder.write(buffer.slice(j, buffer.length));
+    assert.equal(expected, sum);
+    print(".");
+  }
+}
+puts(" crayon!");