Browse Source

buffer: speed up swap16/32, add swap64

* Speed up buffer.swap16 and swap32 by using compiler builtins, for up to
  a ~6x gain. Lower the transition point between the JS and C++
  implementations accordingly. The amount of improvement depends not only
  on buffer size but also on memory alignment.
* Fix tests: the C++ implementation tests were run on zero-filled buffers,
  so they always passed.
* Add similar buffer.swap64 method.
* Make buffer-swap benchmark mirror JS impl.

doc/api/buffer.markdown has an entry of "added: REPLACEME" that should
be changed to the correct release number before the release is tagged.

Because node is currently using a very old version of cpplint, it
doesn't know that std::swap() has moved from <algorithm> to <utility> in
C++11. So until cpplint is updated, simply NOLINT the line.
Technically it should be NOLINT(build/include_what_you_use), but that
puts the line over 80 characters causing another lint error.

Reviewed-By: Trevor Norris <>
Reviewed-By: James M Snell <>
Zach Bjornson 9 years ago
committed by Trevor Norris
  1. 75
  2. 32
  3. 105
  4. 116
  5. 130


@ -1,61 +1,90 @@
'use strict';
const common = require('../common.js');
const v8 = require('v8');
const bench = common.createBenchmark(main, {
method: ['swap16', 'swap32', 'htons', 'htonl'],
len: [4, 64, 512, 768, 1024, 1536, 2056, 4096, 8192],
n: [1e6]
aligned: ['true', 'false'],
method: ['swap16', 'swap32', 'swap64'/*, 'htons', 'htonl', 'htonll'*/],
len: [8, 64, 128, 256, 512, 768, 1024, 1536, 2056, 4096, 8192],
n: [5e7]
// The htons and htonl methods below are used to benchmark the
// performance difference between doing the byteswap in pure
// javascript regardless of Buffer size as opposed to dropping
// down to the native layer for larger Buffer sizes.
// down to the native layer for larger Buffer sizes. Commented
// out by default because they are slow for big buffers. If
// re-evaluating the crossover point, uncomment those methods
// and comment out their implementations in lib/buffer.js so
// C++ version will always be used.
function swap(b, n, m) {
const i = b[n];
b[n] = b[m];
b[m] = i;
Buffer.prototype.htons = function htons() {
if (this.length % 2 !== 0)
throw new RangeError();
for (var i = 0, n = 0; i < this.length; i += 2) {
n = this[i];
this[i] = this[i + 1];
this[i + 1] = n;
for (var i = 0; i < this.length; i += 2) {
swap(this, i, i + 1);
return this;
Buffer.prototype.htonl = function htonl() {
if (this.length % 2 !== 0)
if (this.length % 4 !== 0)
throw new RangeError();
for (var i = 0; i < this.length; i += 4) {
swap(this, i, i + 3);
swap(this, i + 1, i + 2);
return this;
Buffer.prototype.htonll = function htonl() {
if (this.length % 8 !== 0)
throw new RangeError();
for (var i = 0, n = 0; i < this.length; i += 4) {
n = this[i];
this[i] = this[i + 3];
this[i + 3] = n;
n = this[i + 1];
this[i + 1] = this[i + 2];
this[i + 2] = n;
for (var i = 0; i < this.length; i += 8) {
swap(this, i, i + 7);
swap(this, i + 1, i + 6);
swap(this, i + 2, i + 5);
swap(this, i + 3, i + 4);
return this;
function createBuffer(len) {
function createBuffer(len, aligned) {
len += aligned ? 0 : 1;
const buf = Buffer.allocUnsafe(len);
for (var i = 1; i <= len; i++)
buf[i - 1] = i;
return buf;
return aligned ? buf : buf.slice(1);
function bufferSwap(n, buf, method) {
for (var i = 1; i <= n; i++)
function genMethod(method) {
const fnString =
'return function ' + method + '(n, buf) {' +
' for (var i = 0; i <= n; i++)' +
' buf.' + method + '();' +
return (new Function(fnString))();
function main(conf) {
const method = conf.method;
const len = conf.len | 0;
const n = conf.n | 0;
const buf = createBuffer(len);
const aligned = conf.aligned || 'true';
const buf = createBuffer(len, aligned === 'true');
const bufferSwap = genMethod(method);
bufferSwap(n, buf, method);
bufferSwap(n, buf);


@ -1470,10 +1470,10 @@ calls can be chained.
const buf = Buffer.from([0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8]);
// Prints Buffer(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8)
// Prints <Buffer 01 02 03 04 05 06 07 08>
// Prints Buffer(0x2, 0x1, 0x4, 0x3, 0x6, 0x5, 0x8, 0x7)
// Prints <Buffer 02 01 04 03 06 05 08 07>
### buf.swap32()
@ -1491,12 +1491,36 @@ calls can be chained.
const buf = Buffer.from([0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8]);
// Prints Buffer(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8)
// Prints <Buffer 01 02 03 04 05 06 07 08>
// Prints Buffer(0x4, 0x3, 0x2, 0x1, 0x8, 0x7, 0x6, 0x5)
// Prints <Buffer 04 03 02 01 08 07 06 05>
### buf.swap64()
<!-- YAML
added: REPLACEME
-->
* Return: {Buffer}
Interprets the `Buffer` as an array of 64-bit numbers and swaps
the byte-order *in-place*. Throws a `RangeError` if the `Buffer` length is
not a multiple of 64 bits. The method returns a reference to the Buffer, so
calls can be chained.
const buf = Buffer.from([0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8]);
// Prints <Buffer 01 02 03 04 05 06 07 08>
// Prints <Buffer 08 07 06 05 04 03 02 01>
Note that JavaScript cannot encode 64-bit integers. This method is intended
for working with 64-bit floats.
### buf.toString([encoding[, start[, end]]])
* `encoding` {String} Default: `'utf8'`


@ -25,47 +25,6 @@ var poolSize, poolOffset, allocPool;
binding.setupBufferJS(Buffer.prototype, bindingObj);
const swap16n = binding.swap16;
const swap32n = binding.swap32;
function swap(b, n, m) {
const i = b[n];
b[n] = b[m];
b[m] = i;
Buffer.prototype.swap16 = function swap16() {
// For Buffer.length < 512, it's generally faster to
// do the swap in javascript. For larger buffers,
// dropping down to the native code is faster.
const len = this.length;
if (len % 2 !== 0)
throw new RangeError('Buffer size must be a multiple of 16-bits');
if (len < 512) {
for (var i = 0; i < len; i += 2)
swap(this, i, i + 1);
return this;
return swap16n.apply(this);
Buffer.prototype.swap32 = function swap32() {
// For Buffer.length < 1024, it's generally faster to
// do the swap in javascript. For larger buffers,
// dropping down to the native code is faster.
const len = this.length;
if (len % 4 !== 0)
throw new RangeError('Buffer size must be a multiple of 32-bits');
if (len < 1024) {
for (var i = 0; i < len; i += 4) {
swap(this, i, i + 3);
swap(this, i + 1, i + 2);
return this;
return swap32n.apply(this);
// |binding.zeroFill| can be undefined when running inside an isolate where we
// do not own the ArrayBuffer allocator. Zero fill is always on in that case.
const zeroFill = bindingObj.zeroFill || [0];
@ -1303,3 +1262,67 @@ Buffer.prototype.writeDoubleBE = function writeDoubleBE(val, offset, noAssert) {
binding.writeDoubleBE(this, val, offset, true);
return offset + 8;
const swap16n = binding.swap16;
const swap32n = binding.swap32;
const swap64n = binding.swap64;
function swap(b, n, m) {
const i = b[n];
b[n] = b[m];
b[m] = i;
Buffer.prototype.swap16 = function swap16() {
// For Buffer.length < 128, it's generally faster to
// do the swap in javascript. For larger buffers,
// dropping down to the native code is faster.
const len = this.length;
if (len % 2 !== 0)
throw new RangeError('Buffer size must be a multiple of 16-bits');
if (len < 128) {
for (var i = 0; i < len; i += 2)
swap(this, i, i + 1);
return this;
return swap16n(this);
Buffer.prototype.swap32 = function swap32() {
// For Buffer.length < 192, it's generally faster to
// do the swap in javascript. For larger buffers,
// dropping down to the native code is faster.
const len = this.length;
if (len % 4 !== 0)
throw new RangeError('Buffer size must be a multiple of 32-bits');
if (len < 192) {
for (var i = 0; i < len; i += 4) {
swap(this, i, i + 3);
swap(this, i + 1, i + 2);
return this;
return swap32n(this);
Buffer.prototype.swap64 = function swap64() {
// For Buffer.length < 192, it's generally faster to
// do the swap in javascript. For larger buffers,
// dropping down to the native code is faster.
const len = this.length;
if (len % 8 !== 0)
throw new RangeError('Buffer size must be a multiple of 64-bits');
if (len < 192) {
for (var i = 0; i < len; i += 8) {
swap(this, i, i + 7);
swap(this, i + 1, i + 6);
swap(this, i + 2, i + 5);
swap(this, i + 3, i + 4);
return this;
return swap64n(this);


@ -12,6 +12,7 @@
#include <string.h>
#include <limits.h>
#include <utility>
#define BUFFER_ID 0xB0E4
@ -51,12 +52,37 @@
#define BUFFER_MALLOC(length) \
zero_fill_all_buffers ? calloc(length, 1) : malloc(length)
#define SWAP_BYTES(arr, a, b) \
do { \
const uint8_t lo = arr[a]; \
arr[a] = arr[b]; \
arr[b] = lo; \
} while (0)
#if defined(__GNUC__) || defined(__clang__)
#define BSWAP_INTRINSIC_2(x) __builtin_bswap16(x)
#define BSWAP_INTRINSIC_4(x) __builtin_bswap32(x)
#define BSWAP_INTRINSIC_8(x) __builtin_bswap64(x)
#elif defined(__linux__)
#include <byteswap.h>
#define BSWAP_INTRINSIC_2(x) bswap_16(x)
#define BSWAP_INTRINSIC_4(x) bswap_32(x)
#define BSWAP_INTRINSIC_8(x) bswap_64(x)
#elif defined(_MSC_VER)
#include <intrin.h>
#define BSWAP_INTRINSIC_2(x) _byteswap_ushort(x);
#define BSWAP_INTRINSIC_4(x) _byteswap_ulong(x);
#define BSWAP_INTRINSIC_8(x) _byteswap_uint64(x);
#define BSWAP_INTRINSIC_2(x) ((x) << 8) | ((x) >> 8)
#define BSWAP_INTRINSIC_4(x) \
(((x) & 0xFF) << 24) | \
(((x) & 0xFF00) << 8) | \
(((x) >> 8) & 0xFF00) | \
(((x) >> 24) & 0xFF)
#define BSWAP_INTRINSIC_8(x) \
(((x) & 0xFF00000000000000ull) >> 56) | \
(((x) & 0x00FF000000000000ull) >> 40) | \
(((x) & 0x0000FF0000000000ull) >> 24) | \
(((x) & 0x000000FF00000000ull) >> 8) | \
(((x) & 0x00000000FF000000ull) << 8) | \
(((x) & 0x0000000000FF0000ull) << 24) | \
(((x) & 0x000000000000FF00ull) << 40) | \
(((x) & 0x00000000000000FFull) << 56)
namespace node {
@ -1150,29 +1176,88 @@ void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
: -1);
void Swap16(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
SPREAD_ARG(args.This(), ts_obj);
SPREAD_ARG(args[0], ts_obj);
CHECK_EQ(ts_obj_length % 2, 0);
int align = reinterpret_cast<uintptr_t>(ts_obj_data) % sizeof(uint16_t);
if (align == 0) {
uint16_t* data16 = reinterpret_cast<uint16_t*>(ts_obj_data);
size_t len16 = ts_obj_length / 2;
for (size_t i = 0; i < len16; i++) {
data16[i] = BSWAP_INTRINSIC_2(data16[i]);
} else {
for (size_t i = 0; i < ts_obj_length; i += 2) {
SWAP_BYTES(ts_obj_data, i, i + 1);
std::swap(ts_obj_data[i], ts_obj_data[i + 1]);
void Swap32(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
SPREAD_ARG(args.This(), ts_obj);
SPREAD_ARG(args[0], ts_obj);
CHECK_EQ(ts_obj_length % 4, 0);
int align = reinterpret_cast<uintptr_t>(ts_obj_data) % sizeof(uint32_t);
if (align == 0) {
uint32_t* data32 = reinterpret_cast<uint32_t*>(ts_obj_data);
size_t len32 = ts_obj_length / 4;
for (size_t i = 0; i < len32; i++) {
data32[i] = BSWAP_INTRINSIC_4(data32[i]);
} else {
for (size_t i = 0; i < ts_obj_length; i += 4) {
SWAP_BYTES(ts_obj_data, i, i + 3);
SWAP_BYTES(ts_obj_data, i + 1, i + 2);
std::swap(ts_obj_data[i], ts_obj_data[i + 3]);
std::swap(ts_obj_data[i + 1], ts_obj_data[i + 2]);
void Swap64(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
SPREAD_ARG(args[0], ts_obj);
CHECK_EQ(ts_obj_length % 8, 0);
int align = reinterpret_cast<uintptr_t>(ts_obj_data) % sizeof(uint64_t);
if (align == 0) {
uint64_t* data64 = reinterpret_cast<uint64_t*>(ts_obj_data);
size_t len32 = ts_obj_length / 8;
for (size_t i = 0; i < len32; i++) {
data64[i] = BSWAP_INTRINSIC_8(data64[i]);
} else {
for (size_t i = 0; i < ts_obj_length; i += 8) {
std::swap(ts_obj_data[i], ts_obj_data[i + 7]);
std::swap(ts_obj_data[i + 1], ts_obj_data[i + 6]);
std::swap(ts_obj_data[i + 2], ts_obj_data[i + 5]);
// NOLINT added because the current cpplint is old and doesn't know that
// std::swap() now lives in <utility> instead of <algorithm>.
std::swap(ts_obj_data[i + 3], ts_obj_data[i + 4]); // NOLINT
// pass Buffer object to load prototype methods
void SetupBufferJS(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
@ -1238,6 +1323,7 @@ void Initialize(Local<Object> target,
env->SetMethod(target, "swap16", Swap16);
env->SetMethod(target, "swap32", Swap32);
env->SetMethod(target, "swap64", Swap64);
FIXED_ONE_BYTE_STRING(env->isolate(), "kMaxLength"),


@ -3,31 +3,28 @@
const assert = require('assert');
const buf = Buffer.from([0x1, 0x2, 0x3, 0x4]);
// Test buffers small enough to use the JS implementation
const buf = Buffer.from([0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10]);
assert.strictEqual(buf, buf.swap16());
assert.deepStrictEqual(buf, Buffer.from([0x2, 0x1, 0x4, 0x3]));
assert.deepStrictEqual(buf, Buffer.from([0x02, 0x01, 0x04, 0x03, 0x06, 0x05,
0x08, 0x07, 0x0a, 0x09, 0x0c, 0x0b,
0x0e, 0x0d, 0x10, 0x0f]));
buf.swap16(); // restore
assert.strictEqual(buf, buf.swap32());
assert.deepStrictEqual(buf, Buffer.from([0x3, 0x4, 0x1, 0x2]));
const buf_array = [];
for (var i = 1; i < 33; i++)
const buf2 = Buffer.from(buf_array);
Buffer.from([0x04, 0x03, 0x02, 0x01, 0x08, 0x07, 0x06, 0x05, 0x0c,
0x0b, 0x0a, 0x09, 0x10, 0x0f, 0x0e, 0x0d, 0x14, 0x13,
0x12, 0x11, 0x18, 0x17, 0x16, 0x15, 0x1c, 0x1b, 0x1a,
0x19, 0x20, 0x1f, 0x1e, 0x1d]));
Buffer.from([0x03, 0x04, 0x01, 0x02, 0x07, 0x08, 0x05, 0x06, 0x0b,
0x0c, 0x09, 0x0a, 0x0f, 0x10, 0x0d, 0x0e, 0x13, 0x14,
0x11, 0x12, 0x17, 0x18, 0x15, 0x16, 0x1b, 0x1c, 0x19,
0x1a, 0x1f, 0x20, 0x1d, 0x1e]));
assert.deepStrictEqual(buf, Buffer.from([0x04, 0x03, 0x02, 0x01, 0x08, 0x07,
0x06, 0x05, 0x0c, 0x0b, 0x0a, 0x09,
0x10, 0x0f, 0x0e, 0x0d]));
buf.swap32(); // restore
assert.strictEqual(buf, buf.swap64());
assert.deepStrictEqual(buf, Buffer.from([0x08, 0x07, 0x06, 0x05, 0x04, 0x03,
0x02, 0x01, 0x10, 0x0f, 0x0e, 0x0d,
0x0c, 0x0b, 0x0a, 0x09]));
// Operates in-place
const buf3 = Buffer.from([0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7]);
buf3.slice(1, 5).swap32();
assert.deepStrictEqual(buf3, Buffer.from([0x1, 0x5, 0x4, 0x3, 0x2, 0x6, 0x7]));
@ -35,26 +32,107 @@ assert.deepStrictEqual(buf3, Buffer.from([0x1, 0x5, 0x4, 0x3, 0x2, 0x6, 0x7]));
buf3.slice(1, 5).swap16();
assert.deepStrictEqual(buf3, Buffer.from([0x1, 0x4, 0x5, 0x2, 0x3, 0x6, 0x7]));
const buf3_64 = Buffer.from([0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10]);
buf3_64.slice(2, 18).swap64();
assert.deepStrictEqual(buf3_64, Buffer.from([0x01, 0x02, 0x0a, 0x09, 0x08, 0x07,
0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b,
0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
0x0f, 0x10]));
// Force use of native code (Buffer size above threshold limit for js impl)
const buf4 = Buffer.allocUnsafe(1024).fill([0x1, 0x2, 0x3, 0x4]);
const buf5 = Buffer.allocUnsafe(1024).fill([0x2, 0x1, 0x4, 0x3]);
const buf6 = Buffer.allocUnsafe(1024)
.fill([0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8]);
const buf7 = Buffer.allocUnsafe(1024)
.fill([0x4, 0x3, 0x2, 0x1, 0x8, 0x7, 0x6, 0x5]);
var buf4A = new Uint32Array(256).fill(0x04030201);
var buf4 = Buffer.from(buf4A.buffer, buf4A.byteOffset);
var buf5A = new Uint32Array(256).fill(0x03040102);
var buf5 = Buffer.from(buf5A.buffer, buf5A.byteOffset);
assert.deepStrictEqual(buf4, buf5);
var buf6A = new Uint32Array(256).fill(0x04030201);
var buf6 = Buffer.from(buf6A.buffer);
var bu7A = new Uint32Array(256).fill(0x01020304);
var buf7 = Buffer.from(bu7A.buffer, bu7A.byteOffset);
assert.deepStrictEqual(buf6, buf7);
var buf8A = new Uint8Array(256 * 8);
var buf9A = new Uint8Array(256 * 8);
for (let i = 0; i < buf8A.length; i++) {
buf8A[i] = i % 8;
buf9A[buf9A.length - i - 1] = i % 8;
var buf8 = Buffer.from(buf8A.buffer, buf8A.byteOffset);
var buf9 = Buffer.from(buf9A.buffer, buf9A.byteOffset);
assert.deepStrictEqual(buf8, buf9);
// Test native code with buffers that are not memory-aligned
var buf10A = new Uint8Array(256 * 8);
var buf11A = new Uint8Array(256 * 8 - 2);
for (let i = 0; i < buf10A.length; i++) {
buf10A[i] = i % 2;
for (let i = 1; i < buf11A.length; i++) {
buf11A[buf11A.length - i] = (i + 1) % 2;
var buf10 = Buffer.from(buf10A.buffer, buf10A.byteOffset);
// 0|1 0|1 0|1...
var buf11 = Buffer.from(buf11A.buffer, buf11A.byteOffset);
// 0|0 1|0 1|0...
buf10.slice(1, buf10.length - 1).swap16();
assert.deepStrictEqual(buf10.slice(0, buf11.length), buf11);
var buf12A = new Uint8Array(256 * 8);
var buf13A = new Uint8Array(256 * 8 - 4);
for (let i = 0; i < buf12A.length; i++) {
buf12A[i] = i % 4;
for (let i = 1; i < buf13A.length; i++) {
buf13A[buf13A.length - i] = (i + 1) % 4;
var buf12 = Buffer.from(buf12A.buffer, buf12A.byteOffset);
// 0|1 2 3 0|1 2 3...
var buf13 = Buffer.from(buf13A.buffer, buf13A.byteOffset);
// 0|0 3 2 1|0 3 2...
buf12.slice(1, buf12.length - 3).swap32();
assert.deepStrictEqual(buf12.slice(0, buf13.length), buf13);
var buf14A = new Uint8Array(256 * 8);
var buf15A = new Uint8Array(256 * 8 - 8);
for (let i = 0; i < buf14A.length; i++) {
buf14A[i] = i % 8;
for (let i = 1; i < buf15A.length; i++) {
buf15A[buf15A.length - i] = (i + 1) % 8;
var buf14 = Buffer.from(buf14A.buffer, buf14A.byteOffset);
// 0|1 2 3 4 5 6 7 0|1 2 3 4...
var buf15 = Buffer.from(buf15A.buffer, buf15A.byteOffset);
// 0|0 7 6 5 4 3 2 1|0 7 6 5...
buf14.slice(1, buf14.length - 7).swap64();
assert.deepStrictEqual(buf14.slice(0, buf15.length), buf15);
// Length assertions
const re16 = /Buffer size must be a multiple of 16-bits/;
const re32 = /Buffer size must be a multiple of 32-bits/;
const re64 = /Buffer size must be a multiple of 64-bits/;
assert.throws(() => Buffer.from(buf3).swap16(), re16);
assert.throws(() => Buffer.alloc(1025).swap16(), re16);
assert.throws(() => Buffer.from(buf3).swap32(), re32);
assert.throws(() => buf3.slice(1, 3).swap32(), re32);
assert.throws(() => Buffer.alloc(1025).swap32(), re32);
assert.throws(() => buf3.slice(1, 3).swap64(), re64);
assert.throws(() => Buffer.alloc(1025).swap64(), re64);
