src: replace naive search in Buffer::IndexOf

Adds the string search implementation from v8, which uses naive search if the pattern length is < 8 or up to a specific badness, and then uses Boyer-Moore-Horspool. The added benchmark shows the expected improvements. Added an option to use ucs2 encoding with Buffer::IndexOf. Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Trevor Norris <trev.norris@gmail.com> PR-URL: https://github.com/nodejs/node/pull/2539
9 years ago · a18dd7b788
8 changed files with 4935 additions and 60 deletions
--- a/benchmark/buffers/buffer-indexof.js
+++ b/benchmark/buffers/buffer-indexof.js
@ -0,0 +1,38 @@
+var common = require('../common.js');
+var fs = require('fs');
+
+var bench = common.createBenchmark(main, {  // benchmark option lists consumed by main()
+  search: ['@', 'SQ', '10x', '--l', 'Alice', 'Gryphon', 'Panther',  // needles of varying length
+           'Ou est ma chatte?', 'found it very', 'among mad people',
+           'neighbouring pool', 'Soo--oop', 'aaaaaaaaaaaaaaaaa',
+           'venture to go near the house till she had brought herself down to',
+           '</i> to the Caterpillar'],
+  encoding: ['undefined', 'utf8', 'ucs2', 'binary'],  // the string 'undefined' is turned into undefined by main()
+  type: ['buffer', 'string'],  // whether the needle is passed as a Buffer or as a string
+  iter: [1]  // main() multiplies this by 100000
+});
+
+function main(conf) {  // times repeated Buffer#indexOf calls for one benchmark configuration
+  var iter = (conf.iter) * 100000;  // number of indexOf calls that are timed
+  var aliceBuffer = fs.readFileSync(__dirname + '/../fixtures/alice.html');  // haystack
+  var search = conf.search;
+  var encoding = conf.encoding;
+
+  if (encoding === 'undefined') {  // option values are strings; map the placeholder to a real undefined
+    encoding = undefined;
+  }
+
+  if (encoding === 'ucs2') {  // re-encode the haystack so it actually holds ucs2 data
+    aliceBuffer = new Buffer(aliceBuffer.toString(), encoding);
+  }
+
+  if (conf.type === 'buffer') {  // search with a Buffer needle built in the chosen encoding
+    search = new Buffer(new Buffer(search).toString(), encoding);
+  }
+
+  bench.start();
+  for (var i = 0; i < iter; i++) {
+    aliceBuffer.indexOf(search, 0, encoding);  // result is discarded; only the call is timed
+  }
+  bench.end(iter);
+}
--- a/benchmark/fixtures/alice.html
+++ b/benchmark/fixtures/alice.html
--- a/lib/buffer.js
+++ b/lib/buffer.js
@ -410,20 +410,53 @@ Buffer.prototype.compare = function compare(b) {
  return binding.compare(this, b);
 };

+function slowIndexOf(buffer, val, byteOffset, encoding) {  // indexOf for a string needle with an explicit encoding
+  var loweredCase = false;  // set once the encoding name has been lower-cased for a retry
+  for (;;) {  // at most two passes: as given, then lower-cased
+    switch (encoding) {
+      case 'utf8':
+      case 'utf-8':
+      case 'ucs2':
+      case 'ucs-2':
+      case 'utf16le':
+      case 'utf-16le':
+      case 'binary':
+        return binding.indexOfString(buffer, val, byteOffset, encoding);  // the binding searches the string directly
+
+      case 'base64':
+      case 'ascii':
+      case 'hex':
+        return binding.indexOfBuffer(  // decode the needle into a Buffer first, then search for its bytes
+            buffer, Buffer(val, encoding), byteOffset, encoding);

-Buffer.prototype.indexOf = function indexOf(val, byteOffset) {
+      default:
+        if (loweredCase) {  // still unrecognised after lower-casing
+          throw new TypeError('Unknown encoding: ' + encoding);
+        }
+
+        encoding = ('' + encoding).toLowerCase();  // coerce to string and retry case-insensitively
+        loweredCase = true;
+    }
+  }
+}
+
+Buffer.prototype.indexOf = function indexOf(val, byteOffset, encoding) {  // dispatches on the type of val
  if (byteOffset > 0x7fffffff)
    byteOffset = 0x7fffffff;
  else if (byteOffset < -0x80000000)
    byteOffset = -0x80000000;
  byteOffset >>= 0;

-  if (typeof val === 'string')
-    return binding.indexOfString(this, val, byteOffset);
-  if (val instanceof Buffer)
-    return binding.indexOfBuffer(this, val, byteOffset);
-  if (typeof val === 'number')
+  if (typeof val === 'string') {
+    if (encoding === undefined) {  // no encoding given: call the binding without the name lookup in slowIndexOf
+      return binding.indexOfString(this, val, byteOffset, encoding);
+    }
+    return slowIndexOf(this, val, byteOffset, encoding);  // validates the encoding name, may throw TypeError
+  } else if (val instanceof Buffer) {
+    return binding.indexOfBuffer(this, val, byteOffset, encoding);  // encoding is passed through unchecked here
+  } else if (typeof val === 'number') {
    return binding.indexOfNumber(this, val, byteOffset);
+  }

  throw new TypeError('val must be string, number or Buffer');
 };
--- a/node.gyp
+++ b/node.gyp
@ -169,6 +169,7 @@
        'src/util.h',
        'src/util-inl.h',
        'src/util.cc',
+        'src/string_search.cc',
        'deps/http_parser/http_parser.h',
        'deps/v8/include/v8.h',
        'deps/v8/include/v8-debug.h',
--- a/src/node_buffer.cc
+++ b/src/node_buffer.cc
@ -4,6 +4,7 @@
 #include "env.h"
 #include "env-inl.h"
 #include "string_bytes.h"
+#include "string_search.h"
 #include "util.h"
 #include "util-inl.h"
 #include "v8-profiler.h"
@ -792,87 +793,156 @@ void Compare(const FunctionCallbackInfo<Value> &args) {
 }


-int32_t IndexOf(const char* haystack,
-                size_t h_length,
-                const char* needle,
-                size_t n_length) {
-  CHECK_GE(h_length, n_length);
-  // TODO(trevnorris): Implement Boyer-Moore string search algorithm.
-  for (size_t i = 0; i < h_length - n_length + 1; i++) {
-    if (haystack[i] == needle[0]) {
-      if (memcmp(haystack + i, needle, n_length) == 0)
-        return i;
-    }
-  }
-  return -1;
-}
-
-
 void IndexOfString(const FunctionCallbackInfo<Value>& args) {
+  // Binding for Buffer#indexOf with a string needle.
+  //   args[0]: haystack buffer, args[1]: needle string,
+  //   args[2]: byte offset (negative counts from the end),
+  //   args[3]: encoding (UTF8 when it cannot be parsed).
+  // Sets the return value to the byte index of the first match at or after
+  // the offset, or -1 when there is none.
  ASSERT(args[1]->IsString());
  ASSERT(args[2]->IsNumber());

+  enum encoding enc = ParseEncoding(args.GetIsolate(),
+                                    args[3],
+                                    UTF8);
+
  THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
  SPREAD_ARG(args[0], ts_obj);

-  node::Utf8Value str(args.GetIsolate(), args[1]);
-  int32_t offset_i32 = args[2]->Int32Value();
-  uint32_t offset;
+  Local<String> needle = args[1].As<String>();
+  const char* haystack = ts_obj_data;
+  const size_t haystack_length = ts_obj_length;
+  // Number of bytes the needle occupies in the haystack for this encoding.
+  // The UTF-8 length is only right for UTF8: a UCS2 needle takes two bytes
+  // per code unit (so the UTF-8 length can under-count and let a needle
+  // longer than the haystack through to the search), and a BINARY needle
+  // takes exactly one byte per code unit (so the UTF-8 length over-counts
+  // for characters above 0x7f and would search uninitialized bytes).
+  const size_t needle_length =
+      enc == UCS2 ? static_cast<size_t>(needle->Length()) * 2 :
+      enc == BINARY ? static_cast<size_t>(needle->Length()) :
+      static_cast<size_t>(needle->Utf8Length());
+
+
+  if (needle_length == 0 || haystack_length == 0) {
+    return args.GetReturnValue().Set(-1);
+  }
+
+  int64_t offset_i64 = args[2]->IntegerValue();
+  size_t offset = 0;

-  if (offset_i32 < 0) {
-    if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
+  if (offset_i64 < 0) {
+    // Negative offsets are relative to the end and clamp at the start.
+    if (offset_i64 + static_cast<int64_t>(haystack_length) < 0) {
      offset = 0;
-    else
-      offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
    } else {
-    offset = static_cast<uint32_t>(offset_i32);
+      offset = static_cast<size_t>(haystack_length + offset_i64);
+    }
+  } else {
+    offset = static_cast<size_t>(offset_i64);
+  }
+
+  // The needle has to fit between offset and the end of the haystack;
+  // SearchString relies on this and does not re-check it.
+  if (haystack_length < offset || needle_length + offset > haystack_length) {
+    return args.GetReturnValue().Set(-1);
+  }
+
+  // haystack_length doubles as the "not found" sentinel.
+  size_t result = haystack_length;
+
+  if (enc == UCS2) {
+    String::Value needle_value(needle);
+    if (*needle_value == nullptr)
+      return args.GetReturnValue().Set(-1);
+
+    if (haystack_length < 2 || needle_value.length() < 1) {
+      return args.GetReturnValue().Set(-1);
    }

-  if (str.length() == 0 ||
-      ts_obj_length == 0 ||
-      (offset != 0 && str.length() + offset <= str.length()) ||
-      str.length() + offset > ts_obj_length)
+    // Search in 16-bit code units, then convert the hit back to bytes.
+    const size_t haystack_chars = haystack_length / 2;
+    result = SearchString(reinterpret_cast<const uint16_t*>(haystack),
+                          haystack_chars,
+                          reinterpret_cast<const uint16_t*>(*needle_value),
+                          needle_value.length(),
+                          offset / 2);
+    // "Not found" comes back as haystack_chars. Map it to the byte sentinel
+    // explicitly: doubling it yields haystack_length - 1 for an odd-sized
+    // haystack, which would be reported as a match.
+    result = (result == haystack_chars) ? haystack_length : result * 2;
+  } else if (enc == UTF8) {
+    String::Utf8Value needle_value(needle);
+    if (*needle_value == nullptr)
+      return args.GetReturnValue().Set(-1);
+
+    result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
+                          haystack_length,
+                          reinterpret_cast<const uint8_t*>(*needle_value),
+                          needle_length,
+                          offset);
+  } else if (enc == BINARY) {
+    // One byte per code unit; needle_length is needle->Length() here, so the
+    // buffer is filled completely by WriteOneByte.
+    uint8_t* needle_data = static_cast<uint8_t*>(malloc(needle_length));
+    if (needle_data == nullptr) {
      return args.GetReturnValue().Set(-1);
+    }
+    needle->WriteOneByte(
+        needle_data, 0, needle_length, String::NO_NULL_TERMINATION);

-  int32_t r =
-      IndexOf(ts_obj_data + offset, ts_obj_length - offset, *str, str.length());
-  args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
+    result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
+                          haystack_length,
+                          needle_data,
+                          needle_length,
+                          offset);
+    free(needle_data);
  }

+  args.GetReturnValue().Set(
+      result == haystack_length ? -1 : static_cast<int>(result));
+}

 void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
+  // Binding for Buffer#indexOf with a Buffer needle.
+  //   args[0]: haystack buffer, args[1]: needle buffer,
+  //   args[2]: byte offset (negative counts from the end),
+  //   args[3]: encoding (UTF8 when it cannot be parsed); only UCS2 changes
+  //            behavior, by comparing 16-bit units instead of bytes.
+  // Sets the return value to the byte index of the first match at or after
+  // the offset, or -1 when there is none.
  ASSERT(args[1]->IsObject());
  ASSERT(args[2]->IsNumber());

+  enum encoding enc = ParseEncoding(args.GetIsolate(),
+                                    args[3],
+                                    UTF8);
+
  THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
  SPREAD_ARG(args[0], ts_obj);
  SPREAD_ARG(args[1], buf);
-  const int32_t offset_i32 = args[2]->Int32Value();
-  uint32_t offset;

  if (buf_length > 0)
    CHECK_NE(buf_data, nullptr);

-  if (offset_i32 < 0) {
-    if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
+  const char* haystack = ts_obj_data;
+  const size_t haystack_length = ts_obj_length;
+  const char* needle = buf_data;
+  const size_t needle_length = buf_length;
+
+  if (needle_length == 0 || haystack_length == 0) {
+    return args.GetReturnValue().Set(-1);
+  }
+
+  int64_t offset_i64 = args[2]->IntegerValue();
+  size_t offset = 0;
+
+  // Negative offsets are relative to the end and clamp at the start.
+  if (offset_i64 < 0) {
+    if (offset_i64 + static_cast<int64_t>(haystack_length) < 0)
      offset = 0;
    else
-      offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
+      offset = static_cast<size_t>(haystack_length + offset_i64);
  } else {
-    offset = static_cast<uint32_t>(offset_i32);
+    offset = static_cast<size_t>(offset_i64);
  }

-  if (buf_length == 0 ||
-      ts_obj_length == 0 ||
-      (offset != 0 && buf_length + offset <= buf_length) ||
-      buf_length + offset > ts_obj_length)
+  // The needle has to fit between offset and the end of the haystack;
+  // SearchString relies on this and does not re-check it.
+  if (haystack_length < offset || needle_length + offset > haystack_length) {
    return args.GetReturnValue().Set(-1);
+  }
+
+  // haystack_length doubles as the "not found" sentinel.
+  size_t result = haystack_length;

-  int32_t r =
-    IndexOf(ts_obj_data + offset, ts_obj_length - offset, buf_data, buf_length);
-  args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
+  if (enc == UCS2) {
+    if (haystack_length < 2 || needle_length < 2) {
+      return args.GetReturnValue().Set(-1);
+    }
+    // Search in 16-bit code units, then convert the hit back to bytes.
+    const size_t haystack_chars = haystack_length / 2;
+    result = SearchString(
+        reinterpret_cast<const uint16_t*>(haystack),
+        haystack_chars,
+        reinterpret_cast<const uint16_t*>(needle),
+        needle_length / 2,
+        offset / 2);
+    // "Not found" comes back as haystack_chars. Map it to the byte sentinel
+    // explicitly: doubling it yields haystack_length - 1 for an odd-sized
+    // haystack, which would be reported as a match.
+    result = (result == haystack_chars) ? haystack_length : result * 2;
+  } else {
+    result = SearchString(
+        reinterpret_cast<const uint8_t*>(haystack),
+        haystack_length,
+        reinterpret_cast<const uint8_t*>(needle),
+        needle_length,
+        offset);
  }

+  args.GetReturnValue().Set(
+      result == haystack_length ? -1 : static_cast<int>(result));
+}

 void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
  ASSERT(args[1]->IsNumber());
@ -882,16 +952,16 @@ void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
  SPREAD_ARG(args[0], ts_obj);

  uint32_t needle = args[1]->Uint32Value();
-  int32_t offset_i32 = args[2]->Int32Value();
-  uint32_t offset;
+  int64_t offset_i64 = args[2]->IntegerValue();
+  size_t offset;

-  if (offset_i32 < 0) {
-    if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
+  if (offset_i64 < 0) {
+    if (offset_i64 + static_cast<int64_t>(ts_obj_length) < 0)
      offset = 0;
    else
-      offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
+      offset = static_cast<size_t>(ts_obj_length + offset_i64);
  } else {
-    offset = static_cast<uint32_t>(offset_i32);
+    offset = static_cast<size_t>(offset_i64);
  }

  if (ts_obj_length == 0 || offset + 1 > ts_obj_length)
@ -899,8 +969,8 @@ void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {

  void* ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
  char* ptr_char = static_cast<char*>(ptr);
-  args.GetReturnValue().Set(
-      ptr ? static_cast<int32_t>(ptr_char - ts_obj_data) : -1);
+  args.GetReturnValue().Set(ptr ? static_cast<int>(ptr_char - ts_obj_data)
+                                : -1);
 }


--- a/src/string_search.cc
+++ b/src/string_search.cc
@ -0,0 +1,10 @@
+#include "string_search.h"
+
+namespace node {
+namespace stringsearch {
+
+int StringSearchBase::kBadCharShiftTable[kUC16AlphabetSize];  // storage for the static member declared in string_search.h
+int StringSearchBase::kGoodSuffixShiftTable[kBMMaxShift + 1];  // one slot per covered pattern position plus one
+int StringSearchBase::kSuffixTable[kBMMaxShift + 1];  // NOTE(review): all three are shared by every StringSearch instance — confirm searches never run concurrently
+}
+}  // namespace node::stringsearch
--- a/src/string_search.h
+++ b/src/string_search.h
@ -0,0 +1,671 @@
+// Copyright 2011 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SRC_STRING_SEARCH_H_
+#define SRC_STRING_SEARCH_H_
+
+#include "node.h"
+#include <string.h>
+
+namespace node {
+namespace stringsearch {
+
+
+// Returns the larger of a and b (compared with operator<); returns a on ties.
+template <typename T>
+T Max(T a, T b) {
+  return a < b ? b : a;
+}
+
+
+static const uint32_t kMaxOneByteCharCodeU = 0xff;
+
+
+static inline size_t NonOneByteStart(const uint16_t* chars, size_t length) {
+  const uint16_t* limit = chars + length;  // one past the last code unit
+  const uint16_t* start = chars;
+  while (chars < limit) {
+    if (*chars > kMaxOneByteCharCodeU)
+      return static_cast<size_t>(chars - start);  // index of the first unit above 0xff
+    ++chars;
+  }
+  return static_cast<size_t>(chars - start);  // no such unit: equals length
+}
+
+
+static inline bool IsOneByte(const uint16_t* chars, size_t length) {
+  return NonOneByteStart(chars, length) >= length;  // true when no code unit exceeds 0xff
+}
+
+
+template <typename T>
+class Vector {  // non-owning (pointer, length) view; it never allocates or frees start_
+ public:
+  Vector(T* data, size_t length) : start_(data), length_(length) {
+    ASSERT(length > 0 && data != nullptr);  // empty or null views are rejected wherever ASSERT is active
+  }
+
+  // Returns the length of the vector.
+  size_t length() const { return length_; }
+
+  T* start() const { return start_; }  // pointer to the first element
+
+  // Access individual vector elements - checks bounds in debug mode.
+  T& operator[](size_t index) const {
+    ASSERT(0 <= index && index < length_);  // index is unsigned, so only the upper bound can fail
+    return start_[index];
+  }
+
+  const T& at(size_t index) const { return operator[](index); }  // read-only element access
+
+  bool operator==(const Vector<T>& other) const {  // equal when lengths match and all elements compare equal
+    if (length_ != other.length_)
+      return false;
+    if (start_ == other.start_)  // same storage and same length: no need to compare elements
+      return true;
+    for (size_t i = 0; i < length_; ++i) {
+      if (start_[i] != other.start_[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+ private:
+  T* start_;
+  size_t length_;
+};
+
+
+//---------------------------------------------------------------------
+// String Search object.
+//---------------------------------------------------------------------
+
+// Class holding constants and methods that apply to all string search variants,
+// independently of subject and pattern char size.
+class StringSearchBase {
+ protected:
+  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
+  // limit, we can fix the size of tables. For a needle longer than this limit,
+  // search will not be optimal, since we only build tables for a suffix
+  // of the string, but it is a safe approximation.
+  static const int kBMMaxShift = 250;
+
+  // Reduce alphabet to this size.
+  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
+  // proportional to the input alphabet. We reduce the alphabet size by
+  // equating input characters modulo a smaller alphabet size. This gives
+  // a potentially less efficient searching, but is a safe approximation.
+  // For needles using only characters in the same Unicode 256-code point page,
+  // there is no search speed degradation.
+  static const int kLatin1AlphabetSize = 256;
+  static const int kUC16AlphabetSize = 256;
+
+  // Minimum pattern length for the Boyer-Moore(-Horspool) strategies.
+  // For patterns below this length, the skip length of Boyer-Moore is too short
+  // to compensate for the algorithmic overhead compared to simple brute force.
+  static const int kBMMinPatternLength = 8;
+
+  // Store for the BoyerMoore(Horspool) bad char shift table (alphabet-sized).
+  static int kBadCharShiftTable[kUC16AlphabetSize];
+  // Store for the BoyerMoore good suffix shift table.
+  static int kGoodSuffixShiftTable[kBMMaxShift + 1];
+  // Table used temporarily while building the BoyerMoore good suffix
+  // shift table.
+  static int kSuffixTable[kBMMaxShift + 1];
+
+  static inline bool IsOneByteString(Vector<const uint8_t> string) {  // 8-bit data is one-byte by definition
+    return true;
+  }
+
+  static inline bool IsOneByteString(Vector<const uint16_t> string) {  // scans for any unit above 0xff
+    return IsOneByte(string.start(), string.length());
+  }
+};
+
+template <typename PatternChar, typename SubjectChar>
+class StringSearch : private StringSearchBase {  // search object for one pattern; the strategy can be upgraded while searching
+ public:
+  explicit StringSearch(Vector<const PatternChar> pattern)
+      : pattern_(pattern), start_(0) {
+    if (pattern.length() >= kBMMaxShift) {  // tables only cover the last kBMMaxShift characters
+      start_ = pattern.length() - kBMMaxShift;
+    }
+
+    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
+      if (!IsOneByteString(pattern_)) {  // a unit above 0xff cannot occur in a one-byte subject
+        strategy_ = &FailSearch;
+        return;
+      }
+    }
+    size_t pattern_length = pattern_.length();
+    CHECK_GT(pattern_length, 0);
+    if (pattern_length < kBMMinPatternLength) {  // short patterns never use the table-driven strategies
+      if (pattern_length == 1) {
+        strategy_ = &SingleCharSearch;
+        return;
+      }
+      strategy_ = &LinearSearch;
+      return;
+    }
+    strategy_ = &InitialSearch;  // starts linear, may switch to Boyer-Moore-Horspool
+  }
+
+  size_t Search(Vector<const SubjectChar> subject, size_t index) {  // runs the currently selected strategy
+    return strategy_(this, subject, index);
+  }
+
+  static inline int AlphabetSize() {  // number of buckets in the bad-char table for this pattern type
+    if (sizeof(PatternChar) == 1) {
+      // Latin1 needle.
+      return kLatin1AlphabetSize;
+    } else {
+      // UC16 needle.
+      return kUC16AlphabetSize;
+    }
+
+    static_assert(sizeof(PatternChar) == sizeof(uint8_t) ||
+                      sizeof(PatternChar) == sizeof(uint16_t),
+                  "sizeof(PatternChar) == sizeof(uint16_t) || sizeof(uint8_t)");
+  }
+
+ private:
+  typedef size_t (*SearchFunction)(  // NOLINT - it's not a cast!
+      StringSearch<PatternChar, SubjectChar>*,
+      Vector<const SubjectChar>,
+      size_t);
+
+  static size_t FailSearch(StringSearch<PatternChar, SubjectChar>*,
+                           Vector<const SubjectChar> subject,
+                           size_t) {
+    return subject.length();  // subject.length() is the "not found" result
+  }
+
+  static size_t SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
+                                 Vector<const SubjectChar> subject,
+                                 size_t start_index);
+
+  static size_t LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
+                             Vector<const SubjectChar> subject,
+                             size_t start_index);
+
+  static size_t InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
+                              Vector<const SubjectChar> subject,
+                              size_t start_index);
+
+  static size_t BoyerMooreHorspoolSearch(
+      StringSearch<PatternChar, SubjectChar>* search,
+      Vector<const SubjectChar> subject,
+      size_t start_index);
+
+  static size_t BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
+                                 Vector<const SubjectChar> subject,
+                                 size_t start_index);
+
+  void PopulateBoyerMooreHorspoolTable();
+
+  void PopulateBoyerMooreTable();
+
+  static inline bool exceedsOneByte(uint8_t c) { return false; }
+
+  static inline bool exceedsOneByte(uint16_t c) {
+    return c > kMaxOneByteCharCodeU;
+  }
+
+  static inline int CharOccurrence(int* bad_char_occurrence,
+                                   SubjectChar char_code) {  // table lookup for a subject character
+    if (sizeof(SubjectChar) == 1) {
+      return bad_char_occurrence[static_cast<int>(char_code)];
+    }
+    if (sizeof(PatternChar) == 1) {
+      if (exceedsOneByte(char_code)) {
+        return -1;  // cannot occur in a one-byte pattern
+      }
+      return bad_char_occurrence[static_cast<unsigned int>(char_code)];
+    }
+    // Both pattern and subject are UC16. Reduce character to equivalence class.
+    int equiv_class = char_code % kUC16AlphabetSize;
+    return bad_char_occurrence[equiv_class];
+  }
+
+  // Store for the BoyerMoore(Horspool) bad char shift table.
+  // Return a table covering the last kBMMaxShift+1 positions of
+  // pattern.
+  int* bad_char_table() { return kBadCharShiftTable; }
+
+  // Store for the BoyerMoore good suffix shift table.
+  int* good_suffix_shift_table() {
+    // Return biased pointer that maps the range [start_..pattern_.length()]
+    // to the kGoodSuffixShiftTable array.
+    return kGoodSuffixShiftTable - start_;
+  }
+
+  // Table used temporarily while building the BoyerMoore good suffix
+  // shift table.
+  int* suffix_table() {
+    // Return biased pointer that maps the range [start_..pattern_.length()]
+    // to the kSuffixTable array.
+    return kSuffixTable - start_;
+  }
+
+  // The pattern to search for.
+  Vector<const PatternChar> pattern_;
+  // Pointer to implementation of the search.
+  SearchFunction strategy_;
+  // Cache value of Max(0, pattern_length() - kBMMaxShift)
+  size_t start_;
+};
+
+
+template <typename T, typename U>
+inline T AlignDown(T value, U alignment) {  // clears the address bits selected by (alignment - 1)
+  return reinterpret_cast<T>(
+      (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));  // NOTE(review): only rounds down correctly for power-of-two alignments
+}
+
+
+inline uint8_t GetHighestValueByte(uint16_t character) {  // the larger of the character's low and high byte
+  return Max(static_cast<uint8_t>(character & 0xFF),
+             static_cast<uint8_t>(character >> 8));
+}
+
+
+inline uint8_t GetHighestValueByte(uint8_t character) { return character; }  // a single byte is its own highest byte
+
+
+template <typename PatternChar, typename SubjectChar>
+inline size_t FindFirstCharacter(Vector<const PatternChar> pattern,
+                              Vector<const SubjectChar> subject, size_t index) {
+  const PatternChar pattern_first_char = pattern[0];
+  const size_t max_n = (subject.length() - pattern.length() + 1);  // start positions where the whole pattern still fits; NOTE(review): wraps if the pattern is longer than the subject
+
+  const uint8_t search_byte = GetHighestValueByte(pattern_first_char);  // byte handed to memchr as a pre-filter
+  const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
+  size_t pos = index;
+  do {
+    const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
+        memchr(subject.start() + pos, search_byte,
+               (max_n - pos) * sizeof(SubjectChar)));
+    if (char_pos == nullptr)
+      return subject.length();  // byte absent from the remaining range: not found
+    char_pos = AlignDown(char_pos, sizeof(SubjectChar));  // memchr can hit any byte of a character; NOTE(review): assumes subject.start() is aligned to sizeof(SubjectChar)
+    pos = static_cast<size_t>(char_pos - subject.start());
+    if (subject[pos] == search_char)  // confirm the whole character, not just one byte of it
+      return pos;
+  } while (++pos < max_n);
+
+  return subject.length();  // not found
+}
+
+
+template <>
+inline size_t FindFirstCharacter(Vector<const uint8_t> pattern,
+                                 Vector<const uint8_t> subject,
+                                 size_t index) {  // byte/byte case: one memchr call answers the question
+  const uint8_t pattern_first_char = pattern[0];
+  const size_t max_n = (subject.length() - pattern.length() + 1);  // start positions where the whole pattern still fits
+
+  const uint8_t* char_pos = reinterpret_cast<const uint8_t*>(
+      memchr(subject.start() + index, pattern_first_char, max_n - index));  // NOTE(review): wraps if index > max_n — callers appear to guarantee index < max_n
+  if (char_pos == nullptr)
+    return subject.length();  // not found
+  return static_cast<size_t>(char_pos - subject.start());
+}
+
+//---------------------------------------------------------------------
+// Single Character Pattern Search Strategy
+//---------------------------------------------------------------------
+
+// Strategy for patterns of exactly one character. Returns the index of the
+// first occurrence of that character at or after |index|, or
+// subject.length() when there is none.
+template <typename PatternChar, typename SubjectChar>
+size_t StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
+    StringSearch<PatternChar, SubjectChar>* search,
+    Vector<const SubjectChar> subject,
+    size_t index) {
+  CHECK_EQ(1, search->pattern_.length());
+  PatternChar pattern_first_char = search->pattern_[0];
+
+  if (sizeof(PatternChar) > sizeof(SubjectChar)) {
+    // A two-byte pattern character can never occur in a one-byte subject.
+    // Report that with the same "not found" value every other strategy
+    // uses; returning -1 from a size_t function would yield SIZE_MAX.
+    if (exceedsOneByte(pattern_first_char)) {
+      return subject.length();
+    }
+  }
+  return FindFirstCharacter(search->pattern_, subject, index);
+}
+
+//---------------------------------------------------------------------
+// Linear Search Strategy
+//---------------------------------------------------------------------
+
+template <typename PatternChar, typename SubjectChar>
+inline bool CharCompare(const PatternChar* pattern,
+                        const SubjectChar* subject,
+                        size_t length) {  // true when the first length elements of both arrays compare equal
+  ASSERT_GT(length, 0);  // the do/while below always inspects element 0
+  size_t pos = 0;
+  do {
+    if (pattern[pos] != subject[pos]) {
+      return false;
+    }
+    pos++;
+  } while (pos < length);
+  return true;
+}
+
+// Simple linear search for short patterns. Never bails out.
+template <typename PatternChar, typename SubjectChar>
+size_t StringSearch<PatternChar, SubjectChar>::LinearSearch(
+    StringSearch<PatternChar, SubjectChar>* search,
+    Vector<const SubjectChar> subject,
+    size_t index) {
+  Vector<const PatternChar> pattern = search->pattern_;
+  CHECK_GT(pattern.length(), 1);
+  const size_t pattern_length = pattern.length();
+  size_t i = index;
+  const size_t n = subject.length() - pattern_length;  // last start position where the pattern still fits
+  while (i <= n) {
+    i = FindFirstCharacter(pattern, subject, i);  // jump to the next candidate for the first character
+    if (i == subject.length())
+      return subject.length();  // not found
+    ASSERT_LE(i, n);
+    i++;  // step past the matched first character; also the next start if the rest mismatches
+
+    // Loop extracted to separate function to allow using return to do
+    // a deeper break.
+    if (CharCompare(pattern.start() + 1, subject.start() + i,
+                    pattern_length - 1)) {
+      return i - 1;  // undo the increment: the match starts at the first character
+    }
+  }
+  return subject.length();
+}
+
+//---------------------------------------------------------------------
+// Boyer-Moore string search
+//---------------------------------------------------------------------
+
+template <typename PatternChar, typename SubjectChar>
+size_t StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
+    StringSearch<PatternChar, SubjectChar>* search,
+    Vector<const SubjectChar> subject,
+    size_t start_index) {
+  Vector<const PatternChar> pattern = search->pattern_;
+  const size_t subject_length = subject.length();
+  const size_t pattern_length = pattern.length();
+  // Only preprocess at most kBMMaxShift last characters of pattern.
+  size_t start = search->start_;
+
+  int* bad_char_occurence = search->bad_char_table();
+  int* good_suffix_shift = search->good_suffix_shift_table();
+
+  PatternChar last_char = pattern[pattern_length - 1];
+  size_t index = start_index;
+  // Continue search from index.
+  while (index <= subject_length - pattern_length) {
+    size_t j = pattern_length - 1;
+    int c;
+    while (last_char != (c = subject[index + j])) {  // slide until the last pattern character lines up
+      int shift = j - CharOccurrence(bad_char_occurence, c);
+      index += shift;
+      if (index > subject_length - pattern_length) {
+        return subject.length();  // not found
+      }
+    }
+    while (j >= 0 && pattern[j] == (c = subject[index + j])) {  // j is unsigned: the loop ends via the j == 0 return or a mismatch
+      if (j == 0) {
+        return index;
+      }
+      j--;
+    }
+    if (j < start) {
+      // we have matched more than our tables allow us to be smart about.
+      // Fall back on BMH shift.
+      index += pattern_length - 1 -
+               CharOccurrence(bad_char_occurence,
+                              static_cast<SubjectChar>(last_char));
+    } else {
+      int gs_shift = good_suffix_shift[j + 1];
+      int bc_occ = CharOccurrence(bad_char_occurence, c);
+      int shift = j - bc_occ;
+      if (gs_shift > shift) {  // take the larger of the good-suffix and bad-char shifts
+        shift = gs_shift;
+      }
+      index += shift;
+    }
+  }
+
+  return subject.length();  // not found
+}
+
+template <typename PatternChar, typename SubjectChar>
+void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {  // fills the shared good-suffix shift table for pattern_
+  const size_t pattern_length = pattern_.length();
+  const PatternChar* pattern = pattern_.start();
+  // Only look at the last kBMMaxShift characters of pattern (from start_
+  // to pattern_length).
+  const size_t start = start_;
+  const size_t length = pattern_length - start;
+
+  // Biased tables so that we can use pattern indices as table indices,
+  // even if we only cover the part of the pattern from offset start.
+  int* shift_table = good_suffix_shift_table();
+  int* suffix_table = this->suffix_table();
+
+  // Initialize table.
+  for (size_t i = start; i < pattern_length; i++) {
+    shift_table[i] = length;  // length doubles as the "not yet computed" marker tested below
+  }
+  shift_table[pattern_length] = 1;
+  suffix_table[pattern_length] = pattern_length + 1;
+
+  if (pattern_length <= start) {
+    return;
+  }
+
+  // Find suffixes.
+  PatternChar last_char = pattern[pattern_length - 1];
+  size_t suffix = pattern_length + 1;
+  {
+    size_t i = pattern_length;
+    while (i > start) {  // walk the covered part of the pattern from right to left
+      PatternChar c = pattern[i - 1];
+      while (suffix <= pattern_length && c != pattern[suffix - 1]) {
+        if (static_cast<size_t>(shift_table[suffix]) == length) {
+          shift_table[suffix] = suffix - i;
+        }
+        suffix = suffix_table[suffix];
+      }
+      suffix_table[--i] = --suffix;
+      if (suffix == pattern_length) {
+        // No suffix to extend, so we check against last_char only.
+        while ((i > start) && (pattern[i - 1] != last_char)) {
+          if (static_cast<size_t>(shift_table[pattern_length]) == length) {
+            shift_table[pattern_length] = pattern_length - i;
+          }
+          suffix_table[--i] = pattern_length;
+        }
+        if (i > start) {
+          suffix_table[--i] = --suffix;
+        }
+      }
+    }
+  }
+  // Build shift table using suffixes.
+  if (suffix < pattern_length) {
+    for (size_t i = start; i <= pattern_length; i++) {
+      if (static_cast<size_t>(shift_table[i]) == length) {  // entry still holds its initial value
+        shift_table[i] = suffix - start;
+      }
+      if (i == suffix) {
+        suffix = suffix_table[suffix];
+      }
+    }
+  }
+}
+
+//---------------------------------------------------------------------
+// Boyer-Moore-Horspool string search.
+//---------------------------------------------------------------------
+
+// Boyer-Moore-Horspool strategy: uses only the bad-char table. Returns the
+// index of the first match at or after |start_index|, or subject.length()
+// when there is none. Keeps a running "badness" score and hands over to
+// BoyerMooreSearch (after building its table) once the score turns positive.
+template <typename PatternChar, typename SubjectChar>
+size_t StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
+    StringSearch<PatternChar, SubjectChar>* search,
+    Vector<const SubjectChar> subject,
+    size_t start_index) {
+  Vector<const PatternChar> pattern = search->pattern_;
+  const size_t subject_length = subject.length();
+  const size_t pattern_length = pattern.length();
+  int* char_occurrences = search->bad_char_table();
+  // How bad we are doing without a good-suffix table.
+  // Negate in the signed domain: negating the size_t first and widening
+  // afterwards gives a large positive value where size_t is 32 bits.
+  int64_t badness = -static_cast<int64_t>(pattern_length);
+
+  PatternChar last_char = pattern[pattern_length - 1];
+  int last_char_shift =
+      pattern_length - 1 -
+      CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
+
+  // Perform search
+  size_t index = start_index;  // No matches found prior to this index.
+  while (index <= subject_length - pattern_length) {
+    size_t j = pattern_length - 1;
+    int subject_char;
+    while (last_char != (subject_char = subject[index + j])) {
+      int bc_occ = CharOccurrence(char_occurrences, subject_char);
+      int shift = j - bc_occ;
+      index += shift;
+      badness += 1 - shift;  // at most zero, so badness cannot increase.
+      if (index > subject_length - pattern_length) {
+        return subject_length;
+      }
+    }
+    j--;
+    // j is unsigned, so there is no "j >= 0" test: the loop ends either at
+    // the first mismatch or through the j == 0 return.
+    while (pattern[j] == (subject[index + j])) {
+      if (j == 0) {
+        return index;
+      }
+      j--;
+    }
+    index += last_char_shift;
+    // Badness increases by the number of characters we have
+    // checked, and decreases by the number of characters we
+    // can skip by shifting. It's a measure of how we are doing
+    // compared to reading each character exactly once.
+    // The difference can be negative, so form it as int64_t rather than in
+    // size_t arithmetic.
+    badness += static_cast<int64_t>(pattern_length - j) - last_char_shift;
+    if (badness > 0) {
+      search->PopulateBoyerMooreTable();
+      search->strategy_ = &BoyerMooreSearch;
+      return BoyerMooreSearch(search, subject, index);
+    }
+  }
+  return subject.length();
+}
+
+template <typename PatternChar, typename SubjectChar>
+void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
+  const size_t pattern_length = pattern_.length();
+
+  int* bad_char_occurrence = bad_char_table();
+
+  // Only preprocess at most kBMMaxShift last characters of pattern.
+  const size_t start = start_;
+  // Run forwards to populate bad_char_table, so that *last* instance
+  // of character equivalence class is the one registered.
+  // Notice: Doesn't include the last character.
+  const size_t table_size = AlphabetSize();
+  if (start == 0) {
+    // All patterns less than kBMMaxShift in length.
+    memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));  // all-ones bytes make every int entry -1
+  } else {
+    for (size_t i = 0; i < table_size; i++) {
+      bad_char_occurrence[i] = start - 1;  // default: just before the covered part of the pattern
+    }
+  }
+  for (size_t i = start; i < pattern_length - 1; i++) {
+    PatternChar c = pattern_[i];
+    int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();  // fold two-byte characters into the reduced alphabet
+    bad_char_occurrence[bucket] = i;  // later positions overwrite earlier ones
+  }
+}
+
+//---------------------------------------------------------------------
+// Linear string search with bailout to BMH.
+//---------------------------------------------------------------------
+
+// Simple linear search for short patterns, which bails out if the string
+// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
+// Returns the index of the first match at or after |index|, or
+// subject.length() when there is none.
+template <typename PatternChar, typename SubjectChar>
+size_t StringSearch<PatternChar, SubjectChar>::InitialSearch(
+    StringSearch<PatternChar, SubjectChar>* search,
+    Vector<const SubjectChar> subject,
+    size_t index) {
+  Vector<const PatternChar> pattern = search->pattern_;
+  const size_t pattern_length = pattern.length();
+  // Badness is a count of how much work we have done.  When we have
+  // done enough work we decide it's probably worth switching to a better
+  // algorithm.
+  // Compute the (negative) start value in the signed domain: evaluating
+  // "-10 - size_t" in unsigned arithmetic and widening afterwards gives a
+  // large positive value where size_t is 32 bits, which would switch
+  // algorithms on the very first iteration.
+  int64_t badness = -10 - static_cast<int64_t>(pattern_length << 2);
+
+  // We know our pattern is at least 2 characters, we cache the first so
+  // the common case of the first character not matching is faster.
+  for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) {
+    badness++;
+    if (badness <= 0) {
+      i = FindFirstCharacter(pattern, subject, i);
+      if (i == subject.length())
+        return subject.length();
+      ASSERT_LE(i, n);
+      // Compare the rest of the pattern; j ends up as the number of
+      // characters that matched.
+      size_t j = 1;
+      do {
+        if (pattern[j] != subject[i + j]) {
+          break;
+        }
+        j++;
+      } while (j < pattern_length);
+      if (j == pattern_length) {
+        return i;
+      }
+      badness += j;
+    } else {
+      // Too much work done linearly: build the bad-char table and let
+      // Boyer-Moore-Horspool continue from the current position.
+      search->PopulateBoyerMooreHorspoolTable();
+      search->strategy_ = &BoyerMooreHorspoolSearch;
+      return BoyerMooreHorspoolSearch(search, subject, i);
+    }
+  }
+  return subject.length();
+}
+
+// Perform a single stand-alone search.
+// If searching multiple times for the same pattern, a search
+// object should be constructed once and the Search function then called
+// for each search.
+template <typename SubjectChar, typename PatternChar>
+size_t SearchString(Vector<const SubjectChar> subject,
+                    Vector<const PatternChar> pattern,
+                    size_t start_index) {
+  StringSearch<PatternChar, SubjectChar> search(pattern);  // the constructor picks the initial strategy
+  return search.Search(subject, start_index);  // subject.length() means "not found"
+}
+}
+}  // namespace node::stringsearch
+
+namespace node {
+using node::stringsearch::Vector;
+
+template <typename SubjectChar, typename PatternChar>
+size_t SearchString(const SubjectChar* haystack,
+                    size_t haystack_length,
+                    const PatternChar* needle,
+                    size_t needle_length,
+                    size_t start_index) {  // pointer/length convenience wrapper; lengths are in characters, not bytes
+  return node::stringsearch::SearchString(
+      Vector<const SubjectChar>(haystack, haystack_length),  // Vector asserts non-null data and non-zero length
+      Vector<const PatternChar>(needle, needle_length),
+      start_index);
+}
+}  // namespace node
+
+#endif  // SRC_STRING_SEARCH_H_
--- a/test/parallel/test-buffer-indexof.js
+++ b/test/parallel/test-buffer-indexof.js
@ -65,6 +65,193 @@ assert.equal(b.indexOf(0x61, -Infinity), 0);
 assert.equal(b.indexOf(0x61, Infinity), -1);
 assert.equal(b.indexOf(0x0), -1);

+// test offsets
+assert.equal(b.indexOf('d', 2), 3);
+assert.equal(b.indexOf('f', 5), 5);
+assert.equal(b.indexOf('f', -1), 5);
+assert.equal(b.indexOf('f', 6), -1);
+
+assert.equal(b.indexOf(Buffer('d'), 2), 3);
+assert.equal(b.indexOf(Buffer('f'), 5), 5);
+assert.equal(b.indexOf(Buffer('f'), -1), 5);
+assert.equal(b.indexOf(Buffer('f'), 6), -1);
+
+assert.equal(Buffer('ff').indexOf(Buffer('f'), 1, 'ucs2'), -1);
+
+// test hex encoding
+assert.equal(
+    Buffer(b.toString('hex'), 'hex')
+    .indexOf('64', 0, 'hex'), 3);
+assert.equal(
+    Buffer(b.toString('hex'), 'hex')
+    .indexOf(Buffer('64', 'hex'), 0, 'hex'), 3);
+
+// test base64 encoding
+assert.equal(
+    Buffer(b.toString('base64'), 'base64')
+    .indexOf('ZA==', 0, 'base64'), 3);
+assert.equal(
+    Buffer(b.toString('base64'), 'base64')
+    .indexOf(Buffer('ZA==', 'base64'), 0, 'base64'), 3);
+
+// test ascii encoding
+assert.equal(
+    Buffer(b.toString('ascii'), 'ascii')
+    .indexOf('d', 0, 'ascii'), 3);
+assert.equal(
+    Buffer(b.toString('ascii'), 'ascii')
+    .indexOf(Buffer('d', 'ascii'), 0, 'ascii'), 3);
+
+// test binary encoding
+assert.equal(
+    Buffer(b.toString('binary'), 'binary')
+    .indexOf('d', 0, 'binary'), 3);
+assert.equal(
+    Buffer(b.toString('binary'), 'binary')
+    .indexOf(Buffer('d', 'binary'), 0, 'binary'), 3);
+
+
+// test ucs2 encoding
+var twoByteString = new Buffer('\u039a\u0391\u03a3\u03a3\u0395', 'ucs2');
+
+assert.equal(8, twoByteString.indexOf('\u0395', 4, 'ucs2'));
+assert.equal(6, twoByteString.indexOf('\u03a3', -4, 'ucs2'));
+assert.equal(4, twoByteString.indexOf('\u03a3', -6, 'ucs2'));
+assert.equal(4, twoByteString.indexOf(
+  new Buffer('\u03a3', 'ucs2'), -6, 'ucs2'));
+assert.equal(-1, twoByteString.indexOf('\u03a3', -2, 'ucs2'));
+
+var mixedByteStringUcs2 =
+    new Buffer('\u039a\u0391abc\u03a3\u03a3\u0395', 'ucs2');
+assert.equal(6, mixedByteStringUcs2.indexOf('bc', 0, 'ucs2'));
+assert.equal(10, mixedByteStringUcs2.indexOf('\u03a3', 0, 'ucs2'));
+assert.equal(-1, mixedByteStringUcs2.indexOf('\u0396', 0, 'ucs2'));
+
+assert.equal(
+    6, mixedByteStringUcs2.indexOf(new Buffer('bc', 'ucs2'), 0, 'ucs2'));
+assert.equal(
+    10, mixedByteStringUcs2.indexOf(new Buffer('\u03a3', 'ucs2'), 0, 'ucs2'));
+assert.equal(
+    -1, mixedByteStringUcs2.indexOf(new Buffer('\u0396', 'ucs2'), 0, 'ucs2'));
+
+var twoByteString = new Buffer('\u039a\u0391\u03a3\u03a3\u0395', 'ucs2');
+
+// Test single char pattern
+assert.equal(0, twoByteString.indexOf('\u039a', 0, 'ucs2'));
+assert.equal(2, twoByteString.indexOf('\u0391', 0, 'ucs2'), 'Alpha');
+assert.equal(4, twoByteString.indexOf('\u03a3', 0, 'ucs2'), 'First Sigma');
+assert.equal(6, twoByteString.indexOf('\u03a3', 6, 'ucs2'), 'Second Sigma');
+assert.equal(8, twoByteString.indexOf('\u0395', 0, 'ucs2'), 'Epsilon');
+assert.equal(-1, twoByteString.indexOf('\u0392', 0, 'ucs2'), 'Not beta');
+
+// Test multi-char pattern; each message names the two letters searched for.
+assert.equal(
+    0, twoByteString.indexOf('\u039a\u0391', 0, 'ucs2'), 'Kappa Alpha');
+assert.equal(
+    2, twoByteString.indexOf('\u0391\u03a3', 0, 'ucs2'), 'Alpha Sigma');
+assert.equal(
+    4, twoByteString.indexOf('\u03a3\u03a3', 0, 'ucs2'), 'Sigma Sigma');
+assert.equal(
+    6, twoByteString.indexOf('\u03a3\u0395', 0, 'ucs2'), 'Sigma Epsilon');
+
+var mixedByteStringUtf8 = new Buffer('\u039a\u0391abc\u03a3\u03a3\u0395');
+assert.equal(5, mixedByteStringUtf8.indexOf('bc'));
+assert.equal(5, mixedByteStringUtf8.indexOf('bc', 5));
+assert.equal(5, mixedByteStringUtf8.indexOf('bc', -8));
+assert.equal(7, mixedByteStringUtf8.indexOf('\u03a3'));
+assert.equal(-1, mixedByteStringUtf8.indexOf('\u0396'));
+
+
+// Test complex string indexOf algorithms. These only trigger for long strings.
+// Build a long string that isn't a simple repeat of a shorter string.
+var longString = 'A';
+for (var i = 66; i < 76; i++) {  // from 'B' to 'K'
+  longString =  longString + String.fromCharCode(i) + longString;
+}
+
+var longBufferString = new Buffer(longString);
+
+// pattern of 15 chars, repeated every 16 chars in longString
+var pattern = 'ABACABADABACABA';
+for (var i = 0; i < longBufferString.length - pattern.length; i += 7) {
+  var index = longBufferString.indexOf(pattern, i);  // want i rounded up to 16
+  assert.equal((i + 15) & ~0xf, index, 'Long ABACABA...-string at index ' + i);
+}
+assert.equal(510, longBufferString.indexOf('AJABACA'), 'Long AJABACA, First J');
+assert.equal(
+    1534, longBufferString.indexOf('AJABACA', 511), 'Long AJABACA, Second J');
+
+pattern = 'JABACABADABACABA';
+assert.equal(
+    511, longBufferString.indexOf(pattern), 'Long JABACABA..., First J');
+assert.equal(
+    1535, longBufferString.indexOf(pattern, 512), 'Long JABACABA..., Second J');
+
+// Search for a non-ASCII string in a pure ASCII string.
+var asciiString = new Buffer(
+    'arglebargleglopglyfarglebargleglopglyfarglebargleglopglyf');
+assert.equal(-1, asciiString.indexOf('\x2061'));
+assert.equal(3, asciiString.indexOf('leb', 0));
+
+// Search in string containing many non-ASCII chars.
+var allCodePoints = [];
+for (var i = 0; i < 65536; i++) allCodePoints[i] = i;
+var allCharsString = String.fromCharCode.apply(String, allCodePoints);
+var allCharsBufferUtf8 = new Buffer(allCharsString);
+var allCharsBufferUcs2 = new Buffer(allCharsString, 'ucs2');
+
+// Search for string long enough to trigger complex search with ASCII pattern
+// and UC16 subject.
+assert.equal(-1, allCharsBufferUtf8.indexOf('notfound'));
+assert.equal(-1, allCharsBufferUcs2.indexOf('notfound'));
+
+// Find substrings in Utf8.
+var lengths = [1, 3, 15];  // Single char, simple and complex.
+var indices = [0x5, 0x60, 0x400, 0x680, 0x7ee, 0xFF02, 0x16610, 0x2f77b];
+for (var lengthIndex = 0; lengthIndex < lengths.length; lengthIndex++) {
+  for (var i = 0; i < indices.length; i++) {
+    var index = indices[i];
+    var length = lengths[lengthIndex];
+
+    if (index + length > 0x7F) {
+      length = 2 * length;
+    }
+
+    if (index + length > 0x7FF) {
+      length = 3 * length;
+    }
+
+    if (index + length > 0xFFFF) {
+      length = 4 * length;
+    }
+
+    var patternBufferUtf8 = allCharsBufferUtf8.slice(index, index + length);
+    assert.equal(index, allCharsBufferUtf8.indexOf(patternBufferUtf8));
+
+    var patternStringUtf8 = patternBufferUtf8.toString();
+    assert.equal(index, allCharsBufferUtf8.indexOf(patternStringUtf8));
+  }
+}
+
+// Find substrings in Ucs2.
+var lengths = [2, 4, 16];  // In bytes: single char, simple and complex.
+var indices = [0x5, 0x65, 0x105, 0x205, 0x285, 0x2005, 0x2085, 0xfff0];
+for (var lengthIndex = 0; lengthIndex < lengths.length; lengthIndex++) {
+  for (var i = 0; i < indices.length; i++) {
+    var index = indices[i] * 2;  // char index -> byte offset (2 bytes each)
+    var length = lengths[lengthIndex];
+
+    var patternBufferUcs2 =
+        allCharsBufferUcs2.slice(index, index + length);
+    assert.equal(
+        index, allCharsBufferUcs2.indexOf(patternBufferUcs2, 0, 'ucs2'));
+
+    var patternStringUcs2 = patternBufferUcs2.toString('ucs2');
+    assert.equal(
+        index, allCharsBufferUcs2.indexOf(patternStringUcs2, 0, 'ucs2'));
+  }
+}
+
 assert.throws(function() {
  b.indexOf(function() { });
 });