Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

src: replace naive search with naive + BMH in Buffer::IndexOf #2539

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions benchmark/buffers/buffer-indexof.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
var common = require('../common.js');
var fs = require('fs');

// Needles handed to Buffer#indexOf by the benchmark. They range from a single
// character up to a full sentence so that short and long patterns are both
// exercised.
var searchTerms = [
  '@', 'SQ', '10x', '--l', 'Alice', 'Gryphon', 'Panther',
  'Ou est ma chatte?', 'found it very', 'among mad people',
  'neighbouring pool', 'Soo--oop', 'aaaaaaaaaaaaaaaaa',
  'venture to go near the house till she had brought herself down to',
  '</i> to the Caterpillar'
];

// Benchmark matrix: every needle is combined with every encoding and with
// both needle representations (Buffer or string).
// The string 'undefined' is a placeholder that main() turns into a real
// `undefined`, i.e. "call indexOf without an explicit encoding".
var bench = common.createBenchmark(main, {
  search: searchTerms,
  encoding: ['undefined', 'utf8', 'ucs2', 'binary'],
  type: ['buffer', 'string'],
  iter: [1]
});

/**
 * Benchmark body: repeatedly searches the Alice fixture for one needle.
 *
 * @param {Object} conf One combination from the benchmark matrix. Reads
 *     `conf.search` (needle text), `conf.encoding` (encoding name, or the
 *     string 'undefined' for "none"), `conf.type` ('buffer' or 'string')
 *     and `conf.iter` (multiplier for the iteration count).
 */
function main(conf) {
  var iterations = conf.iter * 100000;
  var haystack = fs.readFileSync(__dirname + '/../fixtures/alice.html');

  // The matrix spells "no encoding" as the string 'undefined'; map it back
  // to a real undefined so indexOf sees the argument as omitted.
  var encoding = conf.encoding === 'undefined' ? undefined : conf.encoding;

  // For UCS-2 runs the haystack itself is re-encoded, so that haystack and
  // needle share the same byte representation.
  if (encoding === 'ucs2') {
    haystack = new Buffer(haystack.toString(), encoding);
  }

  // Either search with the plain string, or with a Buffer holding the needle
  // text encoded in the selected encoding.
  var needle = conf.search;
  if (conf.type === 'buffer') {
    needle = new Buffer(new Buffer(needle).toString(), encoding);
  }

  bench.start();
  for (var round = 0; round < iterations; round++) {
    haystack.indexOf(needle, 0, encoding);
  }
  bench.end(iterations);
}
3,867 changes: 3,867 additions & 0 deletions benchmark/fixtures/alice.html

Large diffs are not rendered by default.

45 changes: 39 additions & 6 deletions lib/buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -395,20 +395,53 @@ Buffer.prototype.compare = function compare(b) {
return binding.compare(this, b);
};

function slowIndexOf(buffer, val, byteOffset, encoding) {
var loweredCase = false;
for (;;) {
switch (encoding) {
case 'utf8':
case 'utf-8':
case 'ucs2':
case 'ucs-2':
case 'utf16le':
case 'utf-16le':
case 'binary':
return binding.indexOfString(buffer, val, byteOffset, encoding);

Buffer.prototype.indexOf = function indexOf(val, byteOffset) {
case 'base64':
case 'ascii':
case 'hex':
return binding.indexOfBuffer(
buffer, Buffer(val, encoding), byteOffset, encoding);

default:
if (loweredCase) {
throw new TypeError('Unknown encoding: ' + encoding);
}

encoding = ('' + encoding).toLowerCase();
loweredCase = true;
}
}
}

Buffer.prototype.indexOf = function indexOf(val, byteOffset, encoding) {
if (byteOffset > 0x7fffffff)
byteOffset = 0x7fffffff;
else if (byteOffset < -0x80000000)
byteOffset = -0x80000000;
byteOffset >>= 0;

if (typeof val === 'string')
return binding.indexOfString(this, val, byteOffset);
if (val instanceof Buffer)
return binding.indexOfBuffer(this, val, byteOffset);
if (typeof val === 'number')
if (typeof val === 'string') {
if (encoding === undefined) {
return binding.indexOfString(this, val, byteOffset, encoding);
}
return slowIndexOf(this, val, byteOffset, encoding);
} else if (val instanceof Buffer) {
return binding.indexOfBuffer(this, val, byteOffset, encoding);
} else if (typeof val === 'number') {
return binding.indexOfNumber(this, val, byteOffset);
}

throw new TypeError('val must be string, number or Buffer');
};
Expand Down
1 change: 1 addition & 0 deletions node.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@
'src/util.h',
'src/util-inl.h',
'src/util.cc',
'src/string_search.cc',
'deps/http_parser/http_parser.h',
'deps/v8/include/v8.h',
'deps/v8/include/v8-debug.h',
Expand Down
178 changes: 124 additions & 54 deletions src/node_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "env.h"
#include "env-inl.h"
#include "string_bytes.h"
#include "string_search.h"
#include "util.h"
#include "util-inl.h"
#include "v8-profiler.h"
Expand Down Expand Up @@ -854,87 +855,156 @@ void Compare(const FunctionCallbackInfo<Value> &args) {
}


// Naive substring search over raw bytes.
//
// Returns the index of the first position in `haystack` (of `h_length`
// bytes) at which the `n_length` bytes of `needle` occur, or -1 when there
// is no such position. The needle must not be longer than the haystack;
// that precondition is enforced with CHECK_GE.
int32_t IndexOf(const char* haystack,
                size_t h_length,
                const char* needle,
                size_t n_length) {
  CHECK_GE(h_length, n_length);
  // TODO(trevnorris): Implement Boyer-Moore string search algorithm.

  // Number of start positions at which the whole needle still fits.
  const size_t start_positions = h_length - n_length + 1;

  size_t pos = 0;
  while (pos < start_positions) {
    // Compare the first byte before paying for the full memcmp.
    if (haystack[pos] == needle[0] &&
        memcmp(haystack + pos, needle, n_length) == 0) {
      return pos;
    }
    ++pos;
  }

  return -1;
}


void IndexOfString(const FunctionCallbackInfo<Value>& args) {
ASSERT(args[1]->IsString());
ASSERT(args[2]->IsNumber());

enum encoding enc = ParseEncoding(args.GetIsolate(),
args[3],
UTF8);

THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
SPREAD_ARG(args[0], ts_obj);

node::Utf8Value str(args.GetIsolate(), args[1]);
int32_t offset_i32 = args[2]->Int32Value();
uint32_t offset;
Local<String> needle = args[1].As<String>();
const char* haystack = ts_obj_data;
const size_t haystack_length = ts_obj_length;
const size_t needle_length = needle->Utf8Length();


if (needle_length == 0 || haystack_length == 0) {
return args.GetReturnValue().Set(-1);
}

int64_t offset_i64 = args[2]->IntegerValue();
size_t offset = 0;

if (offset_i32 < 0) {
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
if (offset_i64 < 0) {
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0) {
offset = 0;
else
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
} else {
offset = static_cast<size_t>(haystack_length + offset_i64);
}
} else {
offset = static_cast<uint32_t>(offset_i32);
offset = static_cast<size_t>(offset_i64);
}

if (str.length() == 0 ||
ts_obj_length == 0 ||
(offset != 0 && str.length() + offset <= str.length()) ||
str.length() + offset > ts_obj_length)
if (haystack_length < offset || needle_length + offset > haystack_length) {
return args.GetReturnValue().Set(-1);
}

int32_t r =
IndexOf(ts_obj_data + offset, ts_obj_length - offset, *str, str.length());
args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
}
size_t result = haystack_length;

if (enc == UCS2) {
String::Value needle_value(needle);
if (*needle_value == nullptr)
return args.GetReturnValue().Set(-1);

if (haystack_length < 2 || needle_value.length() < 1) {
return args.GetReturnValue().Set(-1);
}

result = SearchString(reinterpret_cast<const uint16_t*>(haystack),
haystack_length / 2,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it a concern if a user does this on an odd-length buffer?

reinterpret_cast<const uint16_t*>(*needle_value),
needle_value.length(),
offset / 2);
result *= 2;
} else if (enc == UTF8) {
String::Utf8Value needle_value(needle);
if (*needle_value == nullptr)
return args.GetReturnValue().Set(-1);

result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
haystack_length,
reinterpret_cast<const uint8_t*>(*needle_value),
needle_length,
offset);
} else if (enc == BINARY) {
uint8_t* needle_data = static_cast<uint8_t*>(malloc(needle_length));
if (needle_data == nullptr) {
return args.GetReturnValue().Set(-1);
}
needle->WriteOneByte(
needle_data, 0, needle_length, String::NO_NULL_TERMINATION);

result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
haystack_length,
needle_data,
needle_length,
offset);
free(needle_data);
}

args.GetReturnValue().Set(
result == haystack_length ? -1 : static_cast<int>(result));
}

void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
ASSERT(args[1]->IsObject());
ASSERT(args[2]->IsNumber());

enum encoding enc = ParseEncoding(args.GetIsolate(),
args[3],
UTF8);

THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
SPREAD_ARG(args[0], ts_obj);
SPREAD_ARG(args[1], buf);
const int32_t offset_i32 = args[2]->Int32Value();
uint32_t offset;

if (buf_length > 0)
CHECK_NE(buf_data, nullptr);

if (offset_i32 < 0) {
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
const char* haystack = ts_obj_data;
const size_t haystack_length = ts_obj_length;
const char* needle = buf_data;
const size_t needle_length = buf_length;

if (needle_length == 0 || haystack_length == 0) {
return args.GetReturnValue().Set(-1);
}

int64_t offset_i64 = args[2]->IntegerValue();
size_t offset = 0;

if (offset_i64 < 0) {
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0)
offset = 0;
else
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
offset = static_cast<size_t>(haystack_length + offset_i64);
} else {
offset = static_cast<uint32_t>(offset_i32);
offset = static_cast<size_t>(offset_i64);
}

if (buf_length == 0 ||
ts_obj_length == 0 ||
(offset != 0 && buf_length + offset <= buf_length) ||
buf_length + offset > ts_obj_length)
if (haystack_length < offset || needle_length + offset > haystack_length) {
return args.GetReturnValue().Set(-1);
}

int32_t r =
IndexOf(ts_obj_data + offset, ts_obj_length - offset, buf_data, buf_length);
args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
}
size_t result = haystack_length;

if (enc == UCS2) {
if (haystack_length < 2 || needle_length < 2) {
return args.GetReturnValue().Set(-1);
}
result = SearchString(
reinterpret_cast<const uint16_t*>(haystack),
haystack_length / 2,
reinterpret_cast<const uint16_t*>(needle),
needle_length / 2,
offset / 2);
result *= 2;
} else {
result = SearchString(
reinterpret_cast<const uint8_t*>(haystack),
haystack_length,
reinterpret_cast<const uint8_t*>(needle),
needle_length,
offset);
}

args.GetReturnValue().Set(
result == haystack_length ? -1 : static_cast<int>(result));
}

void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
ASSERT(args[1]->IsNumber());
Expand All @@ -944,25 +1014,25 @@ void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
SPREAD_ARG(args[0], ts_obj);

uint32_t needle = args[1]->Uint32Value();
int32_t offset_i32 = args[2]->Int32Value();
uint32_t offset;
int64_t offset_i64 = args[2]->IntegerValue();
size_t offset;

if (offset_i32 < 0) {
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
if (offset_i64 < 0) {
if (offset_i64 + static_cast<int64_t>(ts_obj_length) < 0)
offset = 0;
else
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
offset = static_cast<size_t>(ts_obj_length + offset_i64);
} else {
offset = static_cast<uint32_t>(offset_i32);
offset = static_cast<size_t>(offset_i64);
}

if (ts_obj_length == 0 || offset + 1 > ts_obj_length)
return args.GetReturnValue().Set(-1);

void* ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
char* ptr_char = static_cast<char*>(ptr);
args.GetReturnValue().Set(
ptr ? static_cast<int32_t>(ptr_char - ts_obj_data) : -1);
args.GetReturnValue().Set(ptr ? static_cast<int>(ptr_char - ts_obj_data)
: -1);
}


Expand Down
10 changes: 10 additions & 0 deletions src/string_search.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#include "string_search.h"

namespace node {
namespace stringsearch {

// Out-of-line definitions of the static array members of StringSearchBase.
// NOTE(review): the class itself, along with kUC16AlphabetSize and
// kBMMaxShift, is presumably declared in string_search.h; going by the
// names these are the bad-character, good-suffix and suffix tables of a
// Boyer-Moore style search. Confirm against the header.
int StringSearchBase::kBadCharShiftTable[kUC16AlphabetSize];
int StringSearchBase::kGoodSuffixShiftTable[kBMMaxShift + 1];
int StringSearchBase::kSuffixTable[kBMMaxShift + 1];
} // namespace stringsearch
} // namespace node
Loading