Browse Source

buffer: speed up ascii character scanning

Speed up ASCII character scanning and conversion by 25% to 30% by scanning and
converting whole words instead of individual bytes.
v0.10.0-release
Ben Noordhuis 12 years ago
committed by isaacs
parent
commit
e325ace53c
  1. 92
      src/node_buffer.cc
  2. 6
      src/node_internals.h
  3. 18
      test/simple/test-buffer-ascii.js

92
src/node_buffer.cc

@ -28,6 +28,7 @@
#include <assert.h>
#include <string.h> // memcpy
#include <limits.h>
#define MIN(a,b) ((a) < (b) ? (a) : (b))
@ -247,7 +248,7 @@ Handle<Value> Buffer::BinarySlice(const Arguments &args) {
}
static bool contains_non_ascii(const char* buf, size_t len) {
static bool contains_non_ascii_slow(const char* buf, size_t len) {
for (size_t i = 0; i < len; ++i) {
if (buf[i] & 0x80) return true;
}
@ -255,13 +256,100 @@ static bool contains_non_ascii(const char* buf, size_t len) {
}
static void force_ascii(const char* src, char* dst, size_t len) {
static bool contains_non_ascii(const char* src, size_t len) {
if (len < 16) {
return contains_non_ascii_slow(src, len);
}
const unsigned bytes_per_word = BITS_PER_LONG / CHAR_BIT;
const unsigned align_mask = bytes_per_word - 1;
const unsigned unaligned = reinterpret_cast<uintptr_t>(src) & align_mask;
if (unaligned > 0) {
const unsigned n = bytes_per_word - unaligned;
if (contains_non_ascii_slow(src, n)) return true;
src += n;
len -= n;
}
#if BITS_PER_LONG == 64
typedef uint64_t word;
const uint64_t mask = 0x8080808080808080ll;
#else
typedef uint32_t word;
const uint32_t mask = 0x80808080l;
#endif
const word* srcw = reinterpret_cast<const word*>(src);
for (size_t i = 0, n = len / bytes_per_word; i < n; ++i) {
if (srcw[i] & mask) return true;
}
const unsigned remainder = len & align_mask;
if (remainder > 0) {
const size_t offset = len - remainder;
if (contains_non_ascii_slow(src + offset, remainder)) return true;
}
return false;
}
static void force_ascii_slow(const char* src, char* dst, size_t len) {
for (size_t i = 0; i < len; ++i) {
dst[i] = src[i] & 0x7f;
}
}
static void force_ascii(const char* src, char* dst, size_t len) {
if (len < 16) {
force_ascii_slow(src, dst, len);
return;
}
const unsigned bytes_per_word = BITS_PER_LONG / CHAR_BIT;
const unsigned align_mask = bytes_per_word - 1;
const unsigned src_unalign = reinterpret_cast<uintptr_t>(src) & align_mask;
const unsigned dst_unalign = reinterpret_cast<uintptr_t>(dst) & align_mask;
if (src_unalign > 0) {
if (src_unalign == dst_unalign) {
const unsigned unalign = bytes_per_word - src_unalign;
force_ascii_slow(src, dst, unalign);
src += unalign;
dst += unalign;
len -= src_unalign;
} else {
force_ascii_slow(src, dst, len);
return;
}
}
#if BITS_PER_LONG == 64
typedef uint64_t word;
const uint64_t mask = ~0x8080808080808080ll;
#else
typedef uint32_t word;
const uint32_t mask = ~0x80808080l;
#endif
const word* srcw = reinterpret_cast<const word*>(src);
word* dstw = reinterpret_cast<word*>(dst);
for (size_t i = 0, n = len / bytes_per_word; i < n; ++i) {
dstw[i] = srcw[i] & mask;
}
const unsigned remainder = len & align_mask;
if (remainder > 0) {
const size_t offset = len - remainder;
force_ascii_slow(src + offset, dst + offset, remainder);
}
}
Handle<Value> Buffer::AsciiSlice(const Arguments &args) {
HandleScope scope;
Buffer *parent = ObjectWrap::Unwrap<Buffer>(args.This());

6
src/node_internals.h

@ -48,6 +48,12 @@ inline static int snprintf(char* buf, unsigned int len, const char* fmt, ...) {
}
#endif
#if defined(__x86_64__)
# define BITS_PER_LONG 64
#else
# define BITS_PER_LONG 32
#endif
#ifndef offset_of
// g++ in strict mode complains loudly about the system offsetof() macro
// because it uses NULL as the base address.

18
test/simple/test-buffer-ascii.js

@ -25,3 +25,21 @@ var assert = require('assert');
// ASCII conversion in node.js simply masks off the high bits,
// it doesn't do transliteration.
assert.equal(Buffer('hérité').toString('ascii'), 'hC)ritC)');
// 71 characters, 78 bytes. The ’ character is a triple-byte sequence.
var input = 'C’est, graphiquement, la réunion d’un accent aigu ' +
'et d’un accent grave.';
var expected = 'Cb\u0000\u0019est, graphiquement, la rC)union ' +
'db\u0000\u0019un accent aigu et db\u0000\u0019un ' +
'accent grave.';
var buf = Buffer(input);
for (var i = 0; i < expected.length; ++i) {
assert.equal(buf.slice(i).toString('ascii'), expected.slice(i));
// Skip remainder of multi-byte sequence.
if (input.charCodeAt(i) > 65535) ++i;
if (input.charCodeAt(i) > 127) ++i;
}

Loading…
Cancel
Save