Skip to content

Commit 6c1e5ad

Browse files
dcposchjasnell
authored andcommitted
buffer: add Buffer.prototype.lastIndexOf()
* Remove unnecessary templating from SearchString SearchString used to have separate PatternChar and SubjectChar template type arguments, apparently to support things like searching for an 8-bit string inside a 16-bit string or vice versa. However, SearchString is only used from node_buffer.cc, where PatternChar and SubjectChar are always the same. Since this is extra complexity that's unused and untested (simplifying to a single Char template argument still compiles and didn't break any unit tests), I removed it. * Use Boyer-Moore[-Horspool] for both indexOf and lastIndexOf Add test cases for lastIndexOf. Test the fallback from BMH to Boyer-Moore, which looks like it was totally untested before. * Extra bounds checks in node_buffer.cc * Extra asserts in string_search.h * Buffer.lastIndexOf: clean up, enforce consistency w/ String.lastIndexOf * Polyfill memrchr(3) for non-GNU systems PR-URL: #4846 Reviewed-By: James M Snell <[email protected]> Reviewed-By: Trevor Norris <[email protected]>
1 parent d5922bd commit 6c1e5ad

File tree

5 files changed

+471
-252
lines changed

5 files changed

+471
-252
lines changed

doc/api/buffer.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -988,6 +988,46 @@ for (var key of buf.keys()) {
988988
// 5
989989
```
990990

991+
### buf.lastIndexOf(value[, byteOffset][, encoding])
992+
993+
* `value` {String|Buffer|Number}
994+
* `byteOffset` {Number} Default: `buf.length - 1`
995+
* `encoding` {String} Default: `'utf8'`
996+
* Return: {Number}
997+
998+
Identical to [`Buffer#indexOf()`][], but searches the Buffer from back to front
999+
instead of front to back. Returns the starting index position of `value` in
1000+
Buffer or `-1` if the Buffer does not contain `value`. The `value` can be a
1001+
String, Buffer or Number. Strings are by default interpreted as UTF8. If
1002+
`byteOffset` is provided, will return the last match that begins at or before
1003+
`byteOffset`.
1004+
1005+
```js
1006+
const buf = new Buffer('this buffer is a buffer');
1007+
1008+
buf.lastIndexOf('this');
1009+
// returns 0
1010+
buf.lastIndexOf('buffer');
1011+
// returns 17
1012+
buf.lastIndexOf(new Buffer('buffer'));
1013+
// returns 17
1014+
buf.lastIndexOf(97); // ascii for 'a'
1015+
// returns 15
1016+
buf.lastIndexOf(new Buffer('yolo'));
1017+
// returns -1
1018+
buf.lastIndexOf('buffer', 5)
1019+
// returns 5
1020+
buf.lastIndexOf('buffer', 4)
1021+
// returns -1
1022+
1023+
const utf16Buffer = new Buffer('\u039a\u0391\u03a3\u03a3\u0395', 'ucs2');
1024+
1025+
utf16Buffer.lastIndexOf('\u03a3', undefined, 'ucs2');
1026+
// returns 6
1027+
utf16Buffer.lastIndexOf('\u03a3', -5, 'ucs2');
1028+
// returns 4
1029+
```
1030+
9911031
### buf.length
9921032

9931033
* {Number}

lib/buffer.js

Lines changed: 49 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -598,7 +598,48 @@ Buffer.prototype.compare = function compare(target,
598598
return binding.compareOffset(this, target, start, thisStart, end, thisEnd);
599599
};
600600

601-
function slowIndexOf(buffer, val, byteOffset, encoding) {
601+
602+
// Finds either the first index of `val` in `buffer` at offset >= `byteOffset`,
603+
// OR the last index of `val` in `buffer` at offset <= `byteOffset`.
604+
//
605+
// Arguments:
606+
// - buffer - a Buffer to search
607+
// - val - a string, Buffer, or number
608+
// - byteOffset - an index into `buffer`; will be clamped to an int32
609+
// - encoding - an optional encoding, relevant if val is a string
610+
// - dir - true for indexOf, false for lastIndexOf
611+
function bidirectionalIndexOf(buffer, val, byteOffset, encoding, dir) {
612+
if (typeof byteOffset === 'string') {
613+
encoding = byteOffset;
614+
byteOffset = undefined;
615+
} else if (byteOffset > 0x7fffffff) {
616+
byteOffset = 0x7fffffff;
617+
} else if (byteOffset < -0x80000000) {
618+
byteOffset = -0x80000000;
619+
}
620+
byteOffset = +byteOffset; // Coerce to Number.
621+
if (isNaN(byteOffset)) {
622+
// If the offset is undefined, NaN, "foo", etc. (anything that coerces to
// NaN), search the whole buffer. Note that null coerces to 0, not NaN.
623+
byteOffset = dir ? 0 : (buffer.length - 1);
624+
}
625+
dir = !!dir; // Cast to bool.
626+
627+
if (typeof val === 'string') {
628+
if (encoding === undefined) {
629+
return binding.indexOfString(buffer, val, byteOffset, encoding, dir);
630+
}
631+
return slowIndexOf(buffer, val, byteOffset, encoding, dir);
632+
} else if (val instanceof Buffer) {
633+
return binding.indexOfBuffer(buffer, val, byteOffset, encoding, dir);
634+
} else if (typeof val === 'number') {
635+
return binding.indexOfNumber(buffer, val, byteOffset, dir);
636+
}
637+
638+
throw new TypeError('"val" argument must be string, number or Buffer');
639+
}
640+
641+
642+
function slowIndexOf(buffer, val, byteOffset, encoding, dir) {
602643
var loweredCase = false;
603644
for (;;) {
604645
switch (encoding) {
@@ -609,13 +650,13 @@ function slowIndexOf(buffer, val, byteOffset, encoding) {
609650
case 'utf16le':
610651
case 'utf-16le':
611652
case 'binary':
612-
return binding.indexOfString(buffer, val, byteOffset, encoding);
653+
return binding.indexOfString(buffer, val, byteOffset, encoding, dir);
613654

614655
case 'base64':
615656
case 'ascii':
616657
case 'hex':
617658
return binding.indexOfBuffer(
618-
buffer, Buffer.from(val, encoding), byteOffset, encoding);
659+
buffer, Buffer.from(val, encoding), byteOffset, encoding, dir);
619660

620661
default:
621662
if (loweredCase) {
@@ -628,29 +669,14 @@ function slowIndexOf(buffer, val, byteOffset, encoding) {
628669
}
629670
}
630671

672+
631673
Buffer.prototype.indexOf = function indexOf(val, byteOffset, encoding) {
632-
if (typeof byteOffset === 'string') {
633-
encoding = byteOffset;
634-
byteOffset = 0;
635-
} else if (byteOffset > 0x7fffffff) {
636-
byteOffset = 0x7fffffff;
637-
} else if (byteOffset < -0x80000000) {
638-
byteOffset = -0x80000000;
639-
}
640-
byteOffset >>= 0;
674+
return bidirectionalIndexOf(this, val, byteOffset, encoding, true);
675+
};
641676

642-
if (typeof val === 'string') {
643-
if (encoding === undefined) {
644-
return binding.indexOfString(this, val, byteOffset, encoding);
645-
}
646-
return slowIndexOf(this, val, byteOffset, encoding);
647-
} else if (val instanceof Buffer) {
648-
return binding.indexOfBuffer(this, val, byteOffset, encoding);
649-
} else if (typeof val === 'number') {
650-
return binding.indexOfNumber(this, val, byteOffset);
651-
}
652677

653-
throw new TypeError('"val" argument must be string, number or Buffer');
678+
Buffer.prototype.lastIndexOf = function lastIndexOf(val, byteOffset, encoding) {
679+
return bidirectionalIndexOf(this, val, byteOffset, encoding, false);
654680
};
655681

656682

src/node_buffer.cc

Lines changed: 80 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -943,9 +943,44 @@ void Compare(const FunctionCallbackInfo<Value> &args) {
943943
}
944944

945945

946+
// Computes the offset for starting an indexOf or lastIndexOf search.
947+
// Returns either a valid offset in [0...<length - 1>], ie inside the Buffer,
948+
// or -1 to signal that there is no possible match.
949+
int64_t IndexOfOffset(size_t length, int64_t offset_i64, bool is_forward) {
950+
int64_t length_i64 = static_cast<int64_t>(length);
951+
if (length_i64 == 0) {
952+
// Empty buffer, no match.
953+
return -1;
954+
}
955+
if (offset_i64 < 0) {
956+
if (offset_i64 + length_i64 >= 0) {
957+
// Negative offsets count backwards from the end of the buffer.
958+
return length_i64 + offset_i64;
959+
} else if (is_forward) {
960+
// indexOf from before the start of the buffer: search the whole buffer.
961+
return 0;
962+
} else {
963+
// lastIndexOf from before the start of the buffer: no match.
964+
return -1;
965+
}
966+
} else {
967+
if (offset_i64 < length_i64) {
968+
// Valid positive offset.
969+
return offset_i64;
970+
} else if (is_forward) {
971+
// indexOf from past the end of the buffer: no match.
972+
return -1;
973+
} else {
974+
// lastIndexOf from past the end of the buffer: search the whole buffer.
975+
return length_i64 - 1;
976+
}
977+
}
978+
}
979+
946980
void IndexOfString(const FunctionCallbackInfo<Value>& args) {
947981
ASSERT(args[1]->IsString());
948982
ASSERT(args[2]->IsNumber());
983+
ASSERT(args[4]->IsBoolean());
949984

950985
enum encoding enc = ParseEncoding(args.GetIsolate(),
951986
args[3],
@@ -955,31 +990,26 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
955990
SPREAD_ARG(args[0], ts_obj);
956991

957992
Local<String> needle = args[1].As<String>();
993+
int64_t offset_i64 = args[2]->IntegerValue();
994+
bool is_forward = args[4]->IsTrue();
995+
958996
const char* haystack = ts_obj_data;
959997
const size_t haystack_length = ts_obj_length;
960998
// Extended latin-1 characters are 2 bytes in Utf8.
961999
const size_t needle_length =
9621000
enc == BINARY ? needle->Length() : needle->Utf8Length();
9631001

964-
9651002
if (needle_length == 0 || haystack_length == 0) {
9661003
return args.GetReturnValue().Set(-1);
9671004
}
9681005

969-
int64_t offset_i64 = args[2]->IntegerValue();
970-
size_t offset = 0;
971-
972-
if (offset_i64 < 0) {
973-
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0) {
974-
offset = 0;
975-
} else {
976-
offset = static_cast<size_t>(haystack_length + offset_i64);
977-
}
978-
} else {
979-
offset = static_cast<size_t>(offset_i64);
1006+
int64_t opt_offset = IndexOfOffset(haystack_length, offset_i64, is_forward);
1007+
if (opt_offset <= -1) {
1008+
return args.GetReturnValue().Set(-1);
9801009
}
981-
982-
if (haystack_length < offset || needle_length + offset > haystack_length) {
1010+
size_t offset = static_cast<size_t>(opt_offset);
1011+
CHECK_LT(offset, haystack_length);
1012+
if (is_forward && needle_length + offset > haystack_length) {
9831013
return args.GetReturnValue().Set(-1);
9841014
}
9851015

@@ -1007,13 +1037,15 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
10071037
haystack_length / 2,
10081038
decoded_string,
10091039
decoder.size() / 2,
1010-
offset / 2);
1040+
offset / 2,
1041+
is_forward);
10111042
} else {
10121043
result = SearchString(reinterpret_cast<const uint16_t*>(haystack),
10131044
haystack_length / 2,
10141045
reinterpret_cast<const uint16_t*>(*needle_value),
10151046
needle_value.length(),
1016-
offset / 2);
1047+
offset / 2,
1048+
is_forward);
10171049
}
10181050
result *= 2;
10191051
} else if (enc == UTF8) {
@@ -1025,7 +1057,8 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
10251057
haystack_length,
10261058
reinterpret_cast<const uint8_t*>(*needle_value),
10271059
needle_length,
1028-
offset);
1060+
offset,
1061+
is_forward);
10291062
} else if (enc == BINARY) {
10301063
uint8_t* needle_data = static_cast<uint8_t*>(malloc(needle_length));
10311064
if (needle_data == nullptr) {
@@ -1038,7 +1071,8 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
10381071
haystack_length,
10391072
needle_data,
10401073
needle_length,
1041-
offset);
1074+
offset,
1075+
is_forward);
10421076
free(needle_data);
10431077
}
10441078

@@ -1049,17 +1083,18 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
10491083
void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
10501084
ASSERT(args[1]->IsObject());
10511085
ASSERT(args[2]->IsNumber());
1086+
ASSERT(args[4]->IsBoolean());
10521087

10531088
enum encoding enc = ParseEncoding(args.GetIsolate(),
10541089
args[3],
10551090
UTF8);
10561091

10571092
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
1093+
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[1]);
10581094
SPREAD_ARG(args[0], ts_obj);
10591095
SPREAD_ARG(args[1], buf);
1060-
1061-
if (buf_length > 0)
1062-
CHECK_NE(buf_data, nullptr);
1096+
int64_t offset_i64 = args[2]->IntegerValue();
1097+
bool is_forward = args[4]->IsTrue();
10631098

10641099
const char* haystack = ts_obj_data;
10651100
const size_t haystack_length = ts_obj_length;
@@ -1070,19 +1105,13 @@ void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
10701105
return args.GetReturnValue().Set(-1);
10711106
}
10721107

1073-
int64_t offset_i64 = args[2]->IntegerValue();
1074-
size_t offset = 0;
1075-
1076-
if (offset_i64 < 0) {
1077-
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0)
1078-
offset = 0;
1079-
else
1080-
offset = static_cast<size_t>(haystack_length + offset_i64);
1081-
} else {
1082-
offset = static_cast<size_t>(offset_i64);
1108+
int64_t opt_offset = IndexOfOffset(haystack_length, offset_i64, is_forward);
1109+
if (opt_offset <= -1) {
1110+
return args.GetReturnValue().Set(-1);
10831111
}
1084-
1085-
if (haystack_length < offset || needle_length + offset > haystack_length) {
1112+
size_t offset = static_cast<size_t>(opt_offset);
1113+
CHECK_LT(offset, haystack_length);
1114+
if (is_forward && needle_length + offset > haystack_length) {
10861115
return args.GetReturnValue().Set(-1);
10871116
}
10881117

@@ -1097,15 +1126,17 @@ void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
10971126
haystack_length / 2,
10981127
reinterpret_cast<const uint16_t*>(needle),
10991128
needle_length / 2,
1100-
offset / 2);
1129+
offset / 2,
1130+
is_forward);
11011131
result *= 2;
11021132
} else {
11031133
result = SearchString(
11041134
reinterpret_cast<const uint8_t*>(haystack),
11051135
haystack_length,
11061136
reinterpret_cast<const uint8_t*>(needle),
11071137
needle_length,
1108-
offset);
1138+
offset,
1139+
is_forward);
11091140
}
11101141

11111142
args.GetReturnValue().Set(
@@ -1115,28 +1146,29 @@ void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
11151146
void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
11161147
ASSERT(args[1]->IsNumber());
11171148
ASSERT(args[2]->IsNumber());
1149+
ASSERT(args[3]->IsBoolean());
11181150

11191151
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
11201152
SPREAD_ARG(args[0], ts_obj);
11211153

11221154
uint32_t needle = args[1]->Uint32Value();
11231155
int64_t offset_i64 = args[2]->IntegerValue();
1124-
size_t offset;
1125-
1126-
if (offset_i64 < 0) {
1127-
if (offset_i64 + static_cast<int64_t>(ts_obj_length) < 0)
1128-
offset = 0;
1129-
else
1130-
offset = static_cast<size_t>(ts_obj_length + offset_i64);
1131-
} else {
1132-
offset = static_cast<size_t>(offset_i64);
1133-
}
1156+
bool is_forward = args[3]->IsTrue();
11341157

1135-
if (ts_obj_length == 0 || offset + 1 > ts_obj_length)
1158+
int64_t opt_offset = IndexOfOffset(ts_obj_length, offset_i64, is_forward);
1159+
if (opt_offset <= -1) {
11361160
return args.GetReturnValue().Set(-1);
1161+
}
1162+
size_t offset = static_cast<size_t>(opt_offset);
1163+
CHECK_LT(offset, ts_obj_length);
11371164

1138-
void* ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
1139-
char* ptr_char = static_cast<char*>(ptr);
1165+
const void* ptr;
1166+
if (is_forward) {
1167+
ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
1168+
} else {
1169+
ptr = node::stringsearch::MemrchrFill(ts_obj_data, needle, offset + 1);
1170+
}
1171+
const char* ptr_char = static_cast<const char*>(ptr);
11401172
args.GetReturnValue().Set(ptr ? static_cast<int>(ptr_char - ts_obj_data)
11411173
: -1);
11421174
}

0 commit comments

Comments
 (0)