mirror of https://github.com/lukechilds/node.git
Browse Source
Adds the string search implementation from V8, which uses a naive search if the pattern length is < 8 or until a specific badness is reached, and then uses Boyer-Moore-Horspool. The added benchmark shows the expected improvements. Added an option to use ucs2 encoding with Buffer::IndexOf. Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Trevor Norris <trev.norris@gmail.com> PR-URL: https://github.com/nodejs/node/pull/2539 v5.x
Karl Skomski
9 years ago
committed by
James M Snell
8 changed files with 4935 additions and 60 deletions
@ -0,0 +1,38 @@ |
|||||
|
var common = require('../common.js'); |
||||
|
var fs = require('fs'); |
||||
|
|
||||
|
// Needles searched for in the fixture text, one benchmark run per entry.
var searchTerms = [
  '@',
  'SQ',
  '10x',
  '--l',
  'Alice',
  'Gryphon',
  'Panther',
  'Ou est ma chatte?',
  'found it very',
  'among mad people',
  'neighbouring pool',
  'Soo--oop',
  'aaaaaaaaaaaaaaaaa',
  'venture to go near the house till she had brought herself down to',
  '</i> to the Caterpillar'
];

// Benchmark matrix handed to main(): every search term is combined with
// every encoding and with both needle types. The string 'undefined' stands
// in for "no encoding argument" and is translated inside main().
var bench = common.createBenchmark(main, {
  search: searchTerms,
  encoding: ['undefined', 'utf8', 'ucs2', 'binary'],
  type: ['buffer', 'string'],
  iter: [1]
});
||||
|
|
||||
|
// Runs a single benchmark configuration: loads the alice.html fixture,
// prepares haystack and needle according to conf.encoding / conf.type, and
// times conf.iter * 100000 calls to Buffer#indexOf.
function main(conf) {
  var iterations = conf.iter * 100000;
  var haystack = fs.readFileSync(__dirname + '/../fixtures/alice.html');
  var needle = conf.search;

  // The benchmark options are strings; 'undefined' means "pass no encoding".
  var encoding = conf.encoding === 'undefined' ? undefined : conf.encoding;

  // For ucs2 the haystack itself is re-encoded so that needle and haystack
  // use the same encoding.
  if (encoding === 'ucs2') {
    haystack = new Buffer(haystack.toString(), encoding);
  }

  // Buffer needles are built in the same encoding that is passed to indexOf.
  if (conf.type === 'buffer') {
    needle = new Buffer(new Buffer(needle).toString(), encoding);
  }

  bench.start();
  for (var round = 0; round < iterations; round++) {
    haystack.indexOf(needle, 0, encoding);
  }
  bench.end(iterations);
}
File diff suppressed because it is too large
@ -0,0 +1,10 @@ |
|||||
|
#include "string_search.h" |
||||
|
|
||||
|
namespace node { |
||||
|
namespace stringsearch { |
||||
|
|
||||
|
int StringSearchBase::kBadCharShiftTable[kUC16AlphabetSize]; |
||||
|
int StringSearchBase::kGoodSuffixShiftTable[kBMMaxShift + 1]; |
||||
|
int StringSearchBase::kSuffixTable[kBMMaxShift + 1]; |
||||
|
} |
||||
|
} // namespace node::stringsearch
|
@ -0,0 +1,671 @@ |
|||||
|
// Copyright 2011 the V8 project authors. All rights reserved.
|
||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||
|
// found in the LICENSE file.
|
||||
|
|
||||
|
#ifndef SRC_STRING_SEARCH_H_ |
||||
|
#define SRC_STRING_SEARCH_H_ |
||||
|
|
||||
|
#include "node.h" |
||||
|
#include <string.h> |
||||
|
|
||||
|
namespace node { |
||||
|
namespace stringsearch { |
||||
|
|
||||
|
|
||||
|
// Yields the larger of a and b as determined by operator<. When neither
// argument compares less than the other, a is returned.
template <typename T>
T Max(T a, T b) {
  if (a < b) {
    return b;
  }
  return a;
}
||||
|
|
||||
|
|
||||
|
// Largest character code that still fits into a single byte.
static const uint32_t kMaxOneByteCharCodeU = 0xff;


// Scans chars[0..length) and returns the index of the first element whose
// value is greater than kMaxOneByteCharCodeU. Returns length when every
// element fits into one byte (including when length is 0).
static inline size_t NonOneByteStart(const uint16_t* chars, size_t length) {
  for (size_t offset = 0; offset < length; ++offset) {
    if (chars[offset] > kMaxOneByteCharCodeU) {
      return offset;
    }
  }
  return length;
}
||||
|
|
||||
|
|
||||
|
static inline bool IsOneByte(const uint16_t* chars, size_t length) { |
||||
|
return NonOneByteStart(chars, length) >= length; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
template <typename T> |
||||
|
class Vector { |
||||
|
public: |
||||
|
Vector(T* data, size_t length) : start_(data), length_(length) { |
||||
|
ASSERT(length > 0 && data != nullptr); |
||||
|
} |
||||
|
|
||||
|
// Returns the length of the vector.
|
||||
|
size_t length() const { return length_; } |
||||
|
|
||||
|
T* start() const { return start_; } |
||||
|
|
||||
|
// Access individual vector elements - checks bounds in debug mode.
|
||||
|
T& operator[](size_t index) const { |
||||
|
ASSERT(0 <= index && index < length_); |
||||
|
return start_[index]; |
||||
|
} |
||||
|
|
||||
|
const T& at(size_t index) const { return operator[](index); } |
||||
|
|
||||
|
bool operator==(const Vector<T>& other) const { |
||||
|
if (length_ != other.length_) |
||||
|
return false; |
||||
|
if (start_ == other.start_) |
||||
|
return true; |
||||
|
for (size_t i = 0; i < length_; ++i) { |
||||
|
if (start_[i] != other.start_[i]) { |
||||
|
return false; |
||||
|
} |
||||
|
} |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
private: |
||||
|
T* start_; |
||||
|
size_t length_; |
||||
|
}; |
||||
|
|
||||
|
|
||||
|
//---------------------------------------------------------------------
|
||||
|
// String Search object.
|
||||
|
//---------------------------------------------------------------------
|
||||
|
|
||||
|
// Class holding constants and methods that apply to all string search variants,
|
||||
|
// independently of subject and pattern char size.
|
||||
|
class StringSearchBase { |
||||
|
protected: |
||||
|
// Cap on the maximal shift in the Boyer-Moore implementation. By setting a
|
||||
|
// limit, we can fix the size of tables. For a needle longer than this limit,
|
||||
|
// search will not be optimal, since we only build tables for a suffix
|
||||
|
// of the string, but it is a safe approximation.
|
||||
|
static const int kBMMaxShift = 250; |
||||
|
|
||||
|
// Reduce alphabet to this size.
|
||||
|
// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
|
||||
|
// proportional to the input alphabet. We reduce the alphabet size by
|
||||
|
// equating input characters modulo a smaller alphabet size. This gives
|
||||
|
// a potentially less efficient searching, but is a safe approximation.
|
||||
|
// For needles using only characters in the same Unicode 256-code point page,
|
||||
|
// there is no search speed degradation.
|
||||
|
static const int kLatin1AlphabetSize = 256; |
||||
|
static const int kUC16AlphabetSize = 256; |
||||
|
|
||||
|
// Bad-char shift table stored in the state. It's length is the alphabet size.
|
||||
|
// For patterns below this length, the skip length of Boyer-Moore is too short
|
||||
|
// to compensate for the algorithmic overhead compared to simple brute force.
|
||||
|
static const int kBMMinPatternLength = 8; |
||||
|
|
||||
|
// Store for the BoyerMoore(Horspool) bad char shift table.
|
||||
|
static int kBadCharShiftTable[kUC16AlphabetSize]; |
||||
|
// Store for the BoyerMoore good suffix shift table.
|
||||
|
static int kGoodSuffixShiftTable[kBMMaxShift + 1]; |
||||
|
// Table used temporarily while building the BoyerMoore good suffix
|
||||
|
// shift table.
|
||||
|
static int kSuffixTable[kBMMaxShift + 1]; |
||||
|
|
||||
|
static inline bool IsOneByteString(Vector<const uint8_t> string) { |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
static inline bool IsOneByteString(Vector<const uint16_t> string) { |
||||
|
return IsOneByte(string.start(), string.length()); |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
template <typename PatternChar, typename SubjectChar> |
||||
|
class StringSearch : private StringSearchBase { |
||||
|
public: |
||||
|
explicit StringSearch(Vector<const PatternChar> pattern) |
||||
|
: pattern_(pattern), start_(0) { |
||||
|
if (pattern.length() >= kBMMaxShift) { |
||||
|
start_ = pattern.length() - kBMMaxShift; |
||||
|
} |
||||
|
|
||||
|
if (sizeof(PatternChar) > sizeof(SubjectChar)) { |
||||
|
if (!IsOneByteString(pattern_)) { |
||||
|
strategy_ = &FailSearch; |
||||
|
return; |
||||
|
} |
||||
|
} |
||||
|
size_t pattern_length = pattern_.length(); |
||||
|
CHECK_GT(pattern_length, 0); |
||||
|
if (pattern_length < kBMMinPatternLength) { |
||||
|
if (pattern_length == 1) { |
||||
|
strategy_ = &SingleCharSearch; |
||||
|
return; |
||||
|
} |
||||
|
strategy_ = &LinearSearch; |
||||
|
return; |
||||
|
} |
||||
|
strategy_ = &InitialSearch; |
||||
|
} |
||||
|
|
||||
|
size_t Search(Vector<const SubjectChar> subject, size_t index) { |
||||
|
return strategy_(this, subject, index); |
||||
|
} |
||||
|
|
||||
|
static inline int AlphabetSize() { |
||||
|
if (sizeof(PatternChar) == 1) { |
||||
|
// Latin1 needle.
|
||||
|
return kLatin1AlphabetSize; |
||||
|
} else { |
||||
|
// UC16 needle.
|
||||
|
return kUC16AlphabetSize; |
||||
|
} |
||||
|
|
||||
|
static_assert(sizeof(PatternChar) == sizeof(uint8_t) || |
||||
|
sizeof(PatternChar) == sizeof(uint16_t), |
||||
|
"sizeof(PatternChar) == sizeof(uint16_t) || sizeof(uint8_t)"); |
||||
|
} |
||||
|
|
||||
|
private: |
||||
|
typedef size_t (*SearchFunction)( // NOLINT - it's not a cast!
|
||||
|
StringSearch<PatternChar, SubjectChar>*, |
||||
|
Vector<const SubjectChar>, |
||||
|
size_t); |
||||
|
|
||||
|
static size_t FailSearch(StringSearch<PatternChar, SubjectChar>*, |
||||
|
Vector<const SubjectChar> subject, |
||||
|
size_t) { |
||||
|
return subject.length(); |
||||
|
} |
||||
|
|
||||
|
static size_t SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, |
||||
|
Vector<const SubjectChar> subject, |
||||
|
size_t start_index); |
||||
|
|
||||
|
static size_t LinearSearch(StringSearch<PatternChar, SubjectChar>* search, |
||||
|
Vector<const SubjectChar> subject, |
||||
|
size_t start_index); |
||||
|
|
||||
|
static size_t InitialSearch(StringSearch<PatternChar, SubjectChar>* search, |
||||
|
Vector<const SubjectChar> subject, |
||||
|
size_t start_index); |
||||
|
|
||||
|
static size_t BoyerMooreHorspoolSearch( |
||||
|
StringSearch<PatternChar, SubjectChar>* search, |
||||
|
Vector<const SubjectChar> subject, |
||||
|
size_t start_index); |
||||
|
|
||||
|
static size_t BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, |
||||
|
Vector<const SubjectChar> subject, |
||||
|
size_t start_index); |
||||
|
|
||||
|
void PopulateBoyerMooreHorspoolTable(); |
||||
|
|
||||
|
void PopulateBoyerMooreTable(); |
||||
|
|
||||
|
static inline bool exceedsOneByte(uint8_t c) { return false; } |
||||
|
|
||||
|
static inline bool exceedsOneByte(uint16_t c) { |
||||
|
return c > kMaxOneByteCharCodeU; |
||||
|
} |
||||
|
|
||||
|
static inline int CharOccurrence(int* bad_char_occurrence, |
||||
|
SubjectChar char_code) { |
||||
|
if (sizeof(SubjectChar) == 1) { |
||||
|
return bad_char_occurrence[static_cast<int>(char_code)]; |
||||
|
} |
||||
|
if (sizeof(PatternChar) == 1) { |
||||
|
if (exceedsOneByte(char_code)) { |
||||
|
return -1; |
||||
|
} |
||||
|
return bad_char_occurrence[static_cast<unsigned int>(char_code)]; |
||||
|
} |
||||
|
// Both pattern and subject are UC16. Reduce character to equivalence class.
|
||||
|
int equiv_class = char_code % kUC16AlphabetSize; |
||||
|
return bad_char_occurrence[equiv_class]; |
||||
|
} |
||||
|
|
||||
|
// Store for the BoyerMoore(Horspool) bad char shift table.
|
||||
|
// Return a table covering the last kBMMaxShift+1 positions of
|
||||
|
// pattern.
|
||||
|
int* bad_char_table() { return kBadCharShiftTable; } |
||||
|
|
||||
|
// Store for the BoyerMoore good suffix shift table.
|
||||
|
int* good_suffix_shift_table() { |
||||
|
// Return biased pointer that maps the range [start_..pattern_.length()
|
||||
|
// to the kGoodSuffixShiftTable array.
|
||||
|
return kGoodSuffixShiftTable - start_; |
||||
|
} |
||||
|
|
||||
|
// Table used temporarily while building the BoyerMoore good suffix
|
||||
|
// shift table.
|
||||
|
int* suffix_table() { |
||||
|
// Return biased pointer that maps the range [start_..pattern_.length()
|
||||
|
// to the kSuffixTable array.
|
||||
|
return kSuffixTable - start_; |
||||
|
} |
||||
|
|
||||
|
// The pattern to search for.
|
||||
|
Vector<const PatternChar> pattern_; |
||||
|
// Pointer to implementation of the search.
|
||||
|
SearchFunction strategy_; |
||||
|
// Cache value of Max(0, pattern_length() - kBMMaxShift)
|
||||
|
size_t start_; |
||||
|
}; |
||||
|
|
||||
|
|
||||
|
// Clears the bits of value's address that are set in (alignment - 1) and
// returns the result as a T. For a power-of-two alignment this rounds the
// address down to a multiple of alignment.
template <typename T, typename U>
inline T AlignDown(T value, U alignment) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(value);
  return reinterpret_cast<T>(address & ~(alignment - 1));
}
||||
|
|
||||
|
|
||||
|
// Of the two bytes that make up a 16-bit character, returns the one with
// the larger numeric value.
inline uint8_t GetHighestValueByte(uint16_t character) {
  const uint8_t low_byte = static_cast<uint8_t>(character & 0xFF);
  const uint8_t high_byte = static_cast<uint8_t>(character >> 8);
  return low_byte < high_byte ? high_byte : low_byte;
}


// An 8-bit character is its own (only) byte.
inline uint8_t GetHighestValueByte(uint8_t character) {
  return character;
}
||||
|
|
||||
|
|
||||
|
template <typename PatternChar, typename SubjectChar> |
||||
|
inline size_t FindFirstCharacter(Vector<const PatternChar> pattern, |
||||
|
Vector<const SubjectChar> subject, size_t index) { |
||||
|
const PatternChar pattern_first_char = pattern[0]; |
||||
|
const size_t max_n = (subject.length() - pattern.length() + 1); |
||||
|
|
||||
|
const uint8_t search_byte = GetHighestValueByte(pattern_first_char); |
||||
|
const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); |
||||
|
size_t pos = index; |
||||
|
do { |
||||
|
const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>( |
||||
|
memchr(subject.start() + pos, search_byte, |
||||
|
(max_n - pos) * sizeof(SubjectChar))); |
||||
|
if (char_pos == nullptr) |
||||
|
return subject.length(); |
||||
|
char_pos = AlignDown(char_pos, sizeof(SubjectChar)); |
||||
|
pos = static_cast<size_t>(char_pos - subject.start()); |
||||
|
if (subject[pos] == search_char) |
||||
|
return pos; |
||||
|
} while (++pos < max_n); |
||||
|
|
||||
|
return subject.length(); |
||||
|
} |
||||
|
|
||||
|
|
||||
|
template <> |
||||
|
inline size_t FindFirstCharacter(Vector<const uint8_t> pattern, |
||||
|
Vector<const uint8_t> subject, |
||||
|
size_t index) { |
||||
|
const uint8_t pattern_first_char = pattern[0]; |
||||
|
const size_t max_n = (subject.length() - pattern.length() + 1); |
||||
|
|
||||
|
const uint8_t* char_pos = reinterpret_cast<const uint8_t*>( |
||||
|
memchr(subject.start() + index, pattern_first_char, max_n - index)); |
||||
|
if (char_pos == nullptr) |
||||
|
return subject.length(); |
||||
|
return static_cast<size_t>(char_pos - subject.start()); |
||||
|
} |
||||
|
|
||||
|
//---------------------------------------------------------------------
|
||||
|
// Single Character Pattern Search Strategy
|
||||
|
//---------------------------------------------------------------------
|
||||
|
|
||||
|
template <typename PatternChar, typename SubjectChar> |
||||
|
size_t StringSearch<PatternChar, SubjectChar>::SingleCharSearch( |
||||
|
StringSearch<PatternChar, SubjectChar>* search, |
||||
|
Vector<const SubjectChar> subject, |
||||
|
size_t index) { |
||||
|
CHECK_EQ(1, search->pattern_.length()); |
||||
|
PatternChar pattern_first_char = search->pattern_[0]; |
||||
|
|
||||
|
if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { |
||||
|
return FindFirstCharacter(search->pattern_, subject, index); |
||||
|
} else { |
||||
|
if (sizeof(PatternChar) > sizeof(SubjectChar)) { |
||||
|
if (exceedsOneByte(pattern_first_char)) { |
||||
|
return -1; |
||||
|
} |
||||
|
} |
||||
|
return FindFirstCharacter(search->pattern_, subject, index); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
//---------------------------------------------------------------------
|
||||
|
// Linear Search Strategy
|
||||
|
//---------------------------------------------------------------------
|
||||
|
|
||||
|
template <typename PatternChar, typename SubjectChar> |
||||
|
inline bool CharCompare(const PatternChar* pattern, |
||||
|
const SubjectChar* subject, |
||||
|
size_t length) { |
||||
|
ASSERT_GT(length, 0); |
||||
|
size_t pos = 0; |
||||
|
do { |
||||
|
if (pattern[pos] != subject[pos]) { |
||||
|
return false; |
||||
|
} |
||||
|
pos++; |
||||
|
} while (pos < length); |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
// Simple linear search for short patterns. Never bails out.
|
||||
|
template <typename PatternChar, typename SubjectChar> |
||||
|
size_t StringSearch<PatternChar, SubjectChar>::LinearSearch( |
||||
|
StringSearch<PatternChar, SubjectChar>* search, |
||||
|
Vector<const SubjectChar> subject, |
||||
|
size_t index) { |
||||
|
Vector<const PatternChar> pattern = search->pattern_; |
||||
|
CHECK_GT(pattern.length(), 1); |
||||
|
const size_t pattern_length = pattern.length(); |
||||
|
size_t i = index; |
||||
|
const size_t n = subject.length() - pattern_length; |
||||
|
while (i <= n) { |
||||
|
i = FindFirstCharacter(pattern, subject, i); |
||||
|
if (i == subject.length()) |
||||
|
return subject.length(); |
||||
|
ASSERT_LE(i, n); |
||||
|
i++; |
||||
|
|
||||
|
// Loop extracted to separate function to allow using return to do
|
||||
|
// a deeper break.
|
||||
|
if (CharCompare(pattern.start() + 1, subject.start() + i, |
||||
|
pattern_length - 1)) { |
||||
|
return i - 1; |
||||
|
} |
||||
|
} |
||||
|
return subject.length(); |
||||
|
} |
||||
|
|
||||
|
//---------------------------------------------------------------------
|
||||
|
// Boyer-Moore string search
|
||||
|
//---------------------------------------------------------------------
|
||||
|
|
||||
|
template <typename PatternChar, typename SubjectChar> |
||||
|
size_t StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( |
||||
|
StringSearch<PatternChar, SubjectChar>* search, |
||||
|
Vector<const SubjectChar> subject, |
||||
|
size_t start_index) { |
||||
|
Vector<const PatternChar> pattern = search->pattern_; |
||||
|
const size_t subject_length = subject.length(); |
||||
|
const size_t pattern_length = pattern.length(); |
||||
|
// Only preprocess at most kBMMaxShift last characters of pattern.
|
||||
|
size_t start = search->start_; |
||||
|
|
||||
|
int* bad_char_occurence = search->bad_char_table(); |
||||
|
int* good_suffix_shift = search->good_suffix_shift_table(); |
||||
|
|
||||
|
PatternChar last_char = pattern[pattern_length - 1]; |
||||
|
size_t index = start_index; |
||||
|
// Continue search from i.
|
||||
|
while (index <= subject_length - pattern_length) { |
||||
|
size_t j = pattern_length - 1; |
||||
|
int c; |
||||
|
while (last_char != (c = subject[index + j])) { |
||||
|
int shift = j - CharOccurrence(bad_char_occurence, c); |
||||
|
index += shift; |
||||
|
if (index > subject_length - pattern_length) { |
||||
|
return subject.length(); |
||||
|
} |
||||
|
} |
||||
|
while (j >= 0 && pattern[j] == (c = subject[index + j])) { |
||||
|
if (j == 0) { |
||||
|
return index; |
||||
|
} |
||||
|
j--; |
||||
|
} |
||||
|
if (j < start) { |
||||
|
// we have matched more than our tables allow us to be smart about.
|
||||
|
// Fall back on BMH shift.
|
||||
|
index += pattern_length - 1 - |
||||
|
CharOccurrence(bad_char_occurence, |
||||
|
static_cast<SubjectChar>(last_char)); |
||||
|
} else { |
||||
|
int gs_shift = good_suffix_shift[j + 1]; |
||||
|
int bc_occ = CharOccurrence(bad_char_occurence, c); |
||||
|
int shift = j - bc_occ; |
||||
|
if (gs_shift > shift) { |
||||
|
shift = gs_shift; |
||||
|
} |
||||
|
index += shift; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return subject.length(); |
||||
|
} |
||||
|
|
||||
|
template <typename PatternChar, typename SubjectChar> |
||||
|
void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { |
||||
|
const size_t pattern_length = pattern_.length(); |
||||
|
const PatternChar* pattern = pattern_.start(); |
||||
|
// Only look at the last kBMMaxShift characters of pattern (from start_
|
||||
|
// to pattern_length).
|
||||
|
const size_t start = start_; |
||||
|
const size_t length = pattern_length - start; |
||||
|
|
||||
|
// Biased tables so that we can use pattern indices as table indices,
|
||||
|
// even if we only cover the part of the pattern from offset start.
|
||||
|
int* shift_table = good_suffix_shift_table(); |
||||
|
int* suffix_table = this->suffix_table(); |
||||
|
|
||||
|
// Initialize table.
|
||||
|
for (size_t i = start; i < pattern_length; i++) { |
||||
|
shift_table[i] = length; |
||||
|
} |
||||
|
shift_table[pattern_length] = 1; |
||||
|
suffix_table[pattern_length] = pattern_length + 1; |
||||
|
|
||||
|
if (pattern_length <= start) { |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
// Find suffixes.
|
||||
|
PatternChar last_char = pattern[pattern_length - 1]; |
||||
|
size_t suffix = pattern_length + 1; |
||||
|
{ |
||||
|
size_t i = pattern_length; |
||||
|
while (i > start) { |
||||
|
PatternChar c = pattern[i - 1]; |
||||
|
while (suffix <= pattern_length && c != pattern[suffix - 1]) { |
||||
|
if (static_cast<size_t>(shift_table[suffix]) == length) { |
||||
|
shift_table[suffix] = suffix - i; |
||||
|
} |
||||
|
suffix = suffix_table[suffix]; |
||||
|
} |
||||
|
suffix_table[--i] = --suffix; |
||||
|
if (suffix == pattern_length) { |
||||
|
// No suffix to extend, so we check against last_char only.
|
||||
|
while ((i > start) && (pattern[i - 1] != last_char)) { |
||||
|
if (static_cast<size_t>(shift_table[pattern_length]) == length) { |
||||
|
shift_table[pattern_length] = pattern_length - i; |
||||
|
} |
||||
|
suffix_table[--i] = pattern_length; |
||||
|
} |
||||
|
if (i > start) { |
||||
|
suffix_table[--i] = --suffix; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
// Build shift table using suffixes.
|
||||
|
if (suffix < pattern_length) { |
||||
|
for (size_t i = start; i <= pattern_length; i++) { |
||||
|
if (static_cast<size_t>(shift_table[i]) == length) { |
||||
|
shift_table[i] = suffix - start; |
||||
|
} |
||||
|
if (i == suffix) { |
||||
|
suffix = suffix_table[suffix]; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
//---------------------------------------------------------------------
|
||||
|
// Boyer-Moore-Horspool string search.
|
||||
|
//---------------------------------------------------------------------
|
||||
|
|
||||
|
template <typename PatternChar, typename SubjectChar> |
||||
|
size_t StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( |
||||
|
StringSearch<PatternChar, SubjectChar>* search, |
||||
|
Vector<const SubjectChar> subject, |
||||
|
size_t start_index) { |
||||
|
Vector<const PatternChar> pattern = search->pattern_; |
||||
|
const size_t subject_length = subject.length(); |
||||
|
const size_t pattern_length = pattern.length(); |
||||
|
int* char_occurrences = search->bad_char_table(); |
||||
|
int64_t badness = -pattern_length; |
||||
|
|
||||
|
// How bad we are doing without a good-suffix table.
|
||||
|
PatternChar last_char = pattern[pattern_length - 1]; |
||||
|
int last_char_shift = |
||||
|
pattern_length - 1 - |
||||
|
CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); |
||||
|
|
||||
|
// Perform search
|
||||
|
size_t index = start_index; // No matches found prior to this index.
|
||||
|
while (index <= subject_length - pattern_length) { |
||||
|
size_t j = pattern_length - 1; |
||||
|
int subject_char; |
||||
|
while (last_char != (subject_char = subject[index + j])) { |
||||
|
int bc_occ = CharOccurrence(char_occurrences, subject_char); |
||||
|
int shift = j - bc_occ; |
||||
|
index += shift; |
||||
|
badness += 1 - shift; // at most zero, so badness cannot increase.
|
||||
|
if (index > subject_length - pattern_length) { |
||||
|
return subject_length; |
||||
|
} |
||||
|
} |
||||
|
j--; |
||||
|
while (j >= 0 && pattern[j] == (subject[index + j])) { |
||||
|
if (j == 0) { |
||||
|
return index; |
||||
|
} |
||||
|
j--; |
||||
|
} |
||||
|
index += last_char_shift; |
||||
|
// Badness increases by the number of characters we have
|
||||
|
// checked, and decreases by the number of characters we
|
||||
|
// can skip by shifting. It's a measure of how we are doing
|
||||
|
// compared to reading each character exactly once.
|
||||
|
badness += (pattern_length - j) - last_char_shift; |
||||
|
if (badness > 0) { |
||||
|
search->PopulateBoyerMooreTable(); |
||||
|
search->strategy_ = &BoyerMooreSearch; |
||||
|
return BoyerMooreSearch(search, subject, index); |
||||
|
} |
||||
|
} |
||||
|
return subject.length(); |
||||
|
} |
||||
|
|
||||
|
template <typename PatternChar, typename SubjectChar> |
||||
|
void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { |
||||
|
const size_t pattern_length = pattern_.length(); |
||||
|
|
||||
|
int* bad_char_occurrence = bad_char_table(); |
||||
|
|
||||
|
// Only preprocess at most kBMMaxShift last characters of pattern.
|
||||
|
const size_t start = start_; |
||||
|
// Run forwards to populate bad_char_table, so that *last* instance
|
||||
|
// of character equivalence class is the one registered.
|
||||
|
// Notice: Doesn't include the last character.
|
||||
|
const size_t table_size = AlphabetSize(); |
||||
|
if (start == 0) { |
||||
|
// All patterns less than kBMMaxShift in length.
|
||||
|
memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); |
||||
|
} else { |
||||
|
for (size_t i = 0; i < table_size; i++) { |
||||
|
bad_char_occurrence[i] = start - 1; |
||||
|
} |
||||
|
} |
||||
|
for (size_t i = start; i < pattern_length - 1; i++) { |
||||
|
PatternChar c = pattern_[i]; |
||||
|
int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); |
||||
|
bad_char_occurrence[bucket] = i; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
//---------------------------------------------------------------------
|
||||
|
// Linear string search with bailout to BMH.
|
||||
|
//---------------------------------------------------------------------
|
||||
|
|
||||
|
// Simple linear search for short patterns, which bails out if the string
|
||||
|
// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
|
||||
|
template <typename PatternChar, typename SubjectChar> |
||||
|
size_t StringSearch<PatternChar, SubjectChar>::InitialSearch( |
||||
|
StringSearch<PatternChar, SubjectChar>* search, |
||||
|
Vector<const SubjectChar> subject, |
||||
|
size_t index) { |
||||
|
Vector<const PatternChar> pattern = search->pattern_; |
||||
|
const size_t pattern_length = pattern.length(); |
||||
|
// Badness is a count of how much work we have done. When we have
|
||||
|
// done enough work we decide it's probably worth switching to a better
|
||||
|
// algorithm.
|
||||
|
int64_t badness = -10 - (pattern_length << 2); |
||||
|
|
||||
|
// We know our pattern is at least 2 characters, we cache the first so
|
||||
|
// the common case of the first character not matching is faster.
|
||||
|
for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) { |
||||
|
badness++; |
||||
|
if (badness <= 0) { |
||||
|
i = FindFirstCharacter(pattern, subject, i); |
||||
|
if (i == subject.length()) |
||||
|
return subject.length(); |
||||
|
ASSERT_LE(i, n); |
||||
|
size_t j = 1; |
||||
|
do { |
||||
|
if (pattern[j] != subject[i + j]) { |
||||
|
break; |
||||
|
} |
||||
|
j++; |
||||
|
} while (j < pattern_length); |
||||
|
if (j == pattern_length) { |
||||
|
return i; |
||||
|
} |
||||
|
badness += j; |
||||
|
} else { |
||||
|
search->PopulateBoyerMooreHorspoolTable(); |
||||
|
search->strategy_ = &BoyerMooreHorspoolSearch; |
||||
|
return BoyerMooreHorspoolSearch(search, subject, i); |
||||
|
} |
||||
|
} |
||||
|
return subject.length(); |
||||
|
} |
||||
|
|
||||
|
// Perform a a single stand-alone search.
|
||||
|
// If searching multiple times for the same pattern, a search
|
||||
|
// object should be constructed once and the Search function then called
|
||||
|
// for each search.
|
||||
|
template <typename SubjectChar, typename PatternChar> |
||||
|
size_t SearchString(Vector<const SubjectChar> subject, |
||||
|
Vector<const PatternChar> pattern, |
||||
|
size_t start_index) { |
||||
|
StringSearch<PatternChar, SubjectChar> search(pattern); |
||||
|
return search.Search(subject, start_index); |
||||
|
} |
||||
|
} |
||||
|
} // namespace node::stringsearch
|
||||
|
|
||||
|
namespace node { |
||||
|
using node::stringsearch::Vector; |
||||
|
|
||||
|
template <typename SubjectChar, typename PatternChar> |
||||
|
size_t SearchString(const SubjectChar* haystack, |
||||
|
size_t haystack_length, |
||||
|
const PatternChar* needle, |
||||
|
size_t needle_length, |
||||
|
size_t start_index) { |
||||
|
return node::stringsearch::SearchString( |
||||
|
Vector<const SubjectChar>(haystack, haystack_length), |
||||
|
Vector<const PatternChar>(needle, needle_length), |
||||
|
start_index); |
||||
|
} |
||||
|
} // namespace node
|
||||
|
|
||||
|
#endif // SRC_STRING_SEARCH_H_
|
Loading…
Reference in new issue