From 8a945814dd61ddb547a4690788ea47cd7757f165 Mon Sep 17 00:00:00 2001 From: Brian White Date: Thu, 19 Mar 2015 17:31:34 -0400 Subject: [PATCH] string_decoder: optimize write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By limiting property getting/setting to only where they are absolutely necessary, we can achieve greater performance especially with small utf8 inputs and any size base64 inputs. PR-URL: https://github.com/iojs/io.js/pull/1209 Reviewed-By: Rod Vagg Reviewed-By: Nicu Micleușanu Reviewed-By: Chris Dickinson --- benchmark/misc/string-decoder.js | 60 ++++++++++++++++++ lib/string_decoder.js | 105 ++++++++++++++++++++----------- 2 files changed, 128 insertions(+), 37 deletions(-) create mode 100644 benchmark/misc/string-decoder.js diff --git a/benchmark/misc/string-decoder.js b/benchmark/misc/string-decoder.js new file mode 100644 index 0000000000..14563cee0d --- /dev/null +++ b/benchmark/misc/string-decoder.js @@ -0,0 +1,60 @@ +var common = require('../common.js'); +var StringDecoder = require('string_decoder').StringDecoder; + +var bench = common.createBenchmark(main, { + encoding: ['ascii', 'utf8', 'base64-utf8', 'base64-ascii'], + inlen: [32, 128, 1024], + chunk: [16, 64, 256, 1024], + n: [25e4] +}); + +var UTF_ALPHA = 'Blåbærsyltetøy'; +var ASC_ALPHA = 'Blueberry jam'; + +function main(conf) { + var encoding = conf.encoding; + var inLen = conf.inlen | 0; + var chunkLen = conf.chunk | 0; + var n = conf.n | 0; + + var alpha; + var chunks = []; + var str = ''; + var isBase64 = (encoding === 'base64-ascii' || encoding === 'base64-utf8'); + + if (encoding === 'ascii' || encoding === 'base64-ascii') + alpha = ASC_ALPHA; + else if (encoding === 'utf8' || encoding === 'base64-utf8') + alpha = UTF_ALPHA; + else + throw new Error('Bad encoding'); + + var sd = new StringDecoder(isBase64 ? 
'base64' : encoding); + + for (var i = 0; i < inLen; ++i) { + if (i > 0 && (i % chunkLen) === 0 && !isBase64) { + chunks.push(new Buffer(str, encoding)); + str = ''; + } + str += alpha[i % alpha.length]; + } + if (str.length > 0 && !isBase64) + chunks.push(new Buffer(str, encoding)); + if (isBase64) { + str = new Buffer(str, 'utf8').toString('base64'); + while (str.length > 0) { + var len = Math.min(chunkLen, str.length); + chunks.push(new Buffer(str.substring(0, len), 'utf8')); + str = str.substring(len); + } + } + + var nChunks = chunks.length; + + bench.start(); + for (var i = 0; i < n; ++i) { + for (var j = 0; j < nChunks; ++j) + sd.write(chunks[j]); + } + bench.end(n); +} diff --git a/lib/string_decoder.js b/lib/string_decoder.js index ad85ee1331..61a3bb20d5 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -1,7 +1,9 @@ 'use strict'; +const isEncoding = Buffer.isEncoding; + function assertEncoding(encoding) { - if (encoding && !Buffer.isEncoding(encoding)) { + if (encoding && !isEncoding(encoding)) { throw new Error('Unknown encoding: ' + encoding); } } @@ -59,65 +61,83 @@ const StringDecoder = exports.StringDecoder = function(encoding) { // replacement character. See https://codereview.chromium.org/121173009/ . StringDecoder.prototype.write = function(buffer) { var charStr = ''; + var buflen = buffer.length; + var charBuffer = this.charBuffer; + var charLength = this.charLength; + var charReceived = this.charReceived; + var surrogateSize = this.surrogateSize; + var encoding = this.encoding; // if our last write ended with an incomplete multibyte character - while (this.charLength) { + while (charLength) { // determine how many remaining bytes this buffer has to offer for this char - var available = (buffer.length >= this.charLength - this.charReceived) ? - this.charLength - this.charReceived : - buffer.length; + var diff = charLength - charReceived; + var available = (buflen >= diff) ? 
diff : buflen; // add the new bytes to the char buffer - buffer.copy(this.charBuffer, this.charReceived, 0, available); - this.charReceived += available; + buffer.copy(charBuffer, charReceived, 0, available); + charReceived += available; - if (this.charReceived < this.charLength) { + if (charReceived < charLength) { // still not enough chars in this buffer? wait for more ... + + this.charLength = charLength; + this.charReceived = charReceived; + return ''; } // remove bytes belonging to the current character from the buffer - buffer = buffer.slice(available, buffer.length); + buffer = buffer.slice(available, buflen); + buflen = buffer.length; // get the character that was split - charStr = this.charBuffer.slice(0, this.charLength).toString(this.encoding); + charStr = charBuffer.toString(encoding, 0, charLength); // CESU-8: lead surrogate (D800-DBFF) is also the incomplete character var charCode = charStr.charCodeAt(charStr.length - 1); if (charCode >= 0xD800 && charCode <= 0xDBFF) { - this.charLength += this.surrogateSize; + charLength += surrogateSize; charStr = ''; continue; } - this.charReceived = this.charLength = 0; + charReceived = charLength = 0; // if there are no more bytes in this buffer, just emit our char - if (buffer.length === 0) { + if (buflen === 0) { + this.charLength = charLength; + this.charReceived = charReceived; + return charStr; } - break; } // determine and set charLength / charReceived - this.detectIncompleteChar(buffer); + if (this.detectIncompleteChar(buffer)) + charLength = this.charLength; + charReceived = this.charReceived; - var end = buffer.length; - if (this.charLength) { + var end = buflen; + if (charLength) { // buffer the incomplete character bytes we got - buffer.copy(this.charBuffer, 0, buffer.length - this.charReceived, end); - end -= this.charReceived; + buffer.copy(charBuffer, 0, buflen - charReceived, end); + end -= charReceived; } - charStr += buffer.toString(this.encoding, 0, end); + this.charLength = charLength; + 
charStr += buffer.toString(encoding, 0, end); var end = charStr.length - 1; var charCode = charStr.charCodeAt(end); // CESU-8: lead surrogate (D800-DBFF) is also the incomplete character if (charCode >= 0xD800 && charCode <= 0xDBFF) { - var size = this.surrogateSize; - this.charLength += size; - this.charReceived += size; - this.charBuffer.copy(this.charBuffer, size, 0, size); - buffer.copy(this.charBuffer, 0, 0, size); + charLength += surrogateSize; + charReceived += surrogateSize; + charBuffer.copy(charBuffer, surrogateSize, 0, surrogateSize); + buffer.copy(charBuffer, 0, 0, surrogateSize); + + this.charLength = charLength; + this.charReceived = charReceived; + return charStr.substring(0, end); } @@ -130,35 +150,43 @@ StringDecoder.prototype.write = function(buffer) { // length that character, and sets this.charReceived to the number of bytes // that are available for this character. StringDecoder.prototype.detectIncompleteChar = function(buffer) { + var buflen = buffer.length; // determine how many bytes we have to check at the end of this buffer - var i = (buffer.length >= 3) ? 3 : buffer.length; + var i = (buflen >= 3) ? 3 : buflen; + var newlen = false; // Figure out if one of the last i bytes of our buffer announces an // incomplete char. 
for (; i > 0; i--) { - var c = buffer[buffer.length - i]; + var c = buffer[buflen - i]; // See http://en.wikipedia.org/wiki/UTF-8#Description // 110XXXXX - if (i == 1 && c >> 5 == 0x06) { + if (i === 1 && c >> 5 === 0x06) { this.charLength = 2; + newlen = true; break; } // 1110XXXX - if (i <= 2 && c >> 4 == 0x0E) { + if (i <= 2 && c >> 4 === 0x0E) { this.charLength = 3; + newlen = true; break; } // 11110XXX - if (i <= 3 && c >> 3 == 0x1E) { + if (i <= 3 && c >> 3 === 0x1E) { this.charLength = 4; + newlen = true; break; } } + this.charReceived = i; + + return newlen; }; StringDecoder.prototype.end = function(buffer) { @@ -166,11 +194,12 @@ StringDecoder.prototype.end = function(buffer) { if (buffer && buffer.length) res = this.write(buffer); - if (this.charReceived) { - var cr = this.charReceived; + var charReceived = this.charReceived; + if (charReceived) { + var cr = charReceived; var buf = this.charBuffer; var enc = this.encoding; - res += buf.slice(0, cr).toString(enc); + res += buf.toString(enc, 0, cr); } return res; @@ -181,11 +210,13 @@ function passThroughWrite(buffer) { } function utf16DetectIncompleteChar(buffer) { - this.charReceived = buffer.length % 2; - this.charLength = this.charReceived ? 2 : 0; + var charReceived = this.charReceived = buffer.length % 2; + this.charLength = charReceived ? 2 : 0; + return true; } function base64DetectIncompleteChar(buffer) { - this.charReceived = buffer.length % 3; - this.charLength = this.charReceived ? 3 : 0; + var charReceived = this.charReceived = buffer.length % 3; + this.charLength = charReceived ? 3 : 0; + return true; }