Author: [EMAIL PROTECTED]
Date: Tue Oct 7 01:11:44 2008
New Revision: 452
Modified:
branches/bleeding_edge/src/jsregexp.cc
branches/bleeding_edge/src/objects-inl.h
branches/bleeding_edge/src/objects.h
branches/bleeding_edge/src/runtime.cc
branches/bleeding_edge/src/runtime.h
Log:
Fast direct-access version of KPM string match.
Modified: branches/bleeding_edge/src/jsregexp.cc
==============================================================================
--- branches/bleeding_edge/src/jsregexp.cc (original)
+++ branches/bleeding_edge/src/jsregexp.cc Tue Oct 7 01:11:44 2008
@@ -218,7 +218,7 @@
}
LOG(RegExpExecEvent(re, start_index, subject));
- int value = Runtime::StringMatchKmp(*subject, *needle, start_index);
+ int value = Runtime::StringMatchKmp(subject, needle, start_index);
if (value == -1) return Factory::null_value();
Handle<JSArray> result = Factory::NewJSArray(2);
SetElement(result, 0, Handle<Smi>(Smi::FromInt(value)));
@@ -231,13 +231,16 @@
Handle<String> subject) {
Handle<String> needle(String::cast(re->data()));
Handle<JSArray> result = Factory::NewJSArray(1);
- bool keep_going = true;
int index = 0;
int match_count = 0;
+ int subject_length = subject->length();
int needle_length = needle->length();
- while (keep_going) {
+ while (true) {
LOG(RegExpExecEvent(re, index, subject));
- int value = Runtime::StringMatchKmp(*subject, *needle, index);
+ int value = -1;
+ if (index + needle_length <= subject_length) {
+ value = Runtime::StringMatchKmp(subject, needle, index);
+ }
if (value == -1) break;
HandleScope scope;
int end = value + needle_length;
Modified: branches/bleeding_edge/src/objects-inl.h
==============================================================================
--- branches/bleeding_edge/src/objects-inl.h (original)
+++ branches/bleeding_edge/src/objects-inl.h Tue Oct 7 01:11:44 2008
@@ -1354,6 +1354,11 @@
}
+Address TwoByteString::GetCharsAddress() {
+ return FIELD_ADDR(this, kHeaderSize);
+}
+
+
uint16_t TwoByteString::TwoByteStringGet(int index) {
ASSERT(index >= 0 && index < length());
return READ_SHORT_FIELD(this, kHeaderSize + index * kShortSize);
Modified: branches/bleeding_edge/src/objects.h
==============================================================================
--- branches/bleeding_edge/src/objects.h (original)
+++ branches/bleeding_edge/src/objects.h Tue Oct 7 01:11:44 2008
@@ -3138,6 +3138,9 @@
inline uint16_t TwoByteStringGet(int index);
inline void TwoByteStringSet(int index, uint16_t value);
+ // Get the address of the characters in this string.
+ inline Address GetCharsAddress();
+
// For regexp code.
const uint16_t* TwoByteStringGetData(unsigned start);
Modified: branches/bleeding_edge/src/runtime.cc
==============================================================================
--- branches/bleeding_edge/src/runtime.cc (original)
+++ branches/bleeding_edge/src/runtime.cc Tue Oct 7 01:11:44 2008
@@ -42,6 +42,7 @@
#include "runtime.h"
#include "scopeinfo.h"
#include "v8threads.h"
+#include "smart-pointer.h"
namespace v8 { namespace internal {
@@ -927,110 +928,217 @@
}
-static inline void ComputeKMPNextTable(String* pattern, int next_table[]) {
+static Vector<const char> ToAsciiVector(String *string) {
+ ASSERT(string->IsAscii());
+ ASSERT(string->IsFlat());
+
+ int offset = 0;
+ int length = string->length();
+ StringRepresentationTag string_tag = string->representation_tag();
+ if (string_tag == kSlicedStringTag) {
+ SlicedString* sliced = SlicedString::cast(string);
+ offset += sliced->start();
+ string = String::cast(sliced->buffer());
+ string_tag = string->representation_tag();
+ } else if (string_tag == kConsStringTag) {
+ ConsString* cons = ConsString::cast(string);
+ ASSERT(String::cast(cons->second())->length() == 0);
+ string = String::cast(cons->first());
+ string_tag = string->representation_tag();
+ }
+ if (string_tag == kSeqStringTag) {
+ AsciiString* seq = AsciiString::cast(string);
+ char* start = reinterpret_cast<char*>(seq->GetCharsAddress());
+ return Vector<const char>(start + offset, length);
+ }
+ ASSERT(string_tag == kExternalStringTag);
+ ExternalAsciiString* ext = ExternalAsciiString::cast(string);
+ const char* start = ext->resource()->data();
+ return Vector<const char>(start + offset, length);
+}
+
+
+static Vector<const uc16> ToUC16Vector(String *string) {
+ ASSERT(string->IsTwoByteString());
+ ASSERT(string->IsFlat());
+
+ int offset = 0;
+ int length = string->length();
+
+ StringRepresentationTag string_tag = string->representation_tag();
+ if (string_tag == kSlicedStringTag) {
+ SlicedString* sliced = SlicedString::cast(string);
+ offset += sliced->start();
+ string = String::cast(sliced->buffer());
+ string_tag = string->representation_tag();
+ } else if (string_tag == kConsStringTag) {
+ ConsString* cons = ConsString::cast(string);
+ ASSERT(String::cast(cons->second())->length() == 0);
+ string = String::cast(cons->first());
+ string_tag = string->representation_tag();
+ }
+ if (string_tag == kSeqStringTag) {
+ TwoByteString* seq = TwoByteString::cast(string);
+ uc16* start = reinterpret_cast<uc16*>(seq->GetCharsAddress());
+ return Vector<const uc16>(start + offset, length);
+ }
+ ASSERT(string_tag == kExternalStringTag);
+ ExternalTwoByteString* ext = ExternalTwoByteString::cast(string);
+ const uc16* start =
+ reinterpret_cast<const uc16*>(ext->resource()->data());
+ return Vector<const uc16>(start + offset, length);
+}
+
+
+template <typename schar, typename pchar>
+static int SingleCharIndexOf(Vector<const schar> string,
+ pchar pattern_char,
+ int start_index) {
+ for (int i = start_index, n = string.length(); i < n; i++) {
+ if (pattern_char == string[i]) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+// Trivial string search for shorter strings.
+template <typename pchar, typename schar>
+static int SimpleIndexOf(Vector<const schar> subject,
+ Vector<const pchar> pattern,
+ int start_index) {
+ int pattern_length = pattern.length();
+ int subject_length = subject.length();
+ // We know our pattern is at least 2 characters, we cache the first so
+ // the common case of the first character not matching is faster.
+ pchar pattern_first_char = pattern[0];
+ for (int i = start_index, n = subject_length - pattern_length; i <= n;
i++) {
+ if (subject[i] != pattern_first_char) continue;
+
+ bool failure = false;
+ for (int j = 1; j < pattern_length; j++) {
+ if (pattern[j] != subject[j+i]) {
+ failure = true;
+ break;
+ }
+ }
+ if (!failure) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+// Full KMP pattern match.
+template <typename schar, typename pchar> // Pattern & subject char types
+static int KMPIndexOf(Vector<const schar> subject,
+ Vector<const pchar> pattern,
+ int start_index) {
+ int subject_length = subject.length();
+ int pattern_length = pattern.length();
+ SmartPointer<int> next_table(NewArray<int>(pattern_length));
+
+ // Compute KMP "next" table
int i = 0;
int j = -1;
next_table[0] = -1;
- Access<StringInputBuffer> buffer(&string_input_buffer);
- buffer->Reset(pattern);
- int length = pattern->length();
- uint16_t p = buffer->GetNext();
- while (i < length - 1) {
- while (j > -1 && p != pattern->Get(j)) {
+ pchar p = pattern[0];
+ while (i < pattern_length - 1) {
+ while (j > -1 && p != pattern[j]) {
j = next_table[j];
}
i++;
j++;
- p = buffer->GetNext();
- if (p == pattern->Get(j)) {
+ p = pattern[i];
+ if (p == pattern[j]) {
next_table[i] = next_table[j];
} else {
next_table[i] = j;
}
}
-}
-
-
-int Runtime::StringMatchKmp(String* sub, String* pat, int start_index) {
- sub->TryFlatten();
- pat->TryFlatten();
-
- int subject_length = sub->length();
- int pattern_length = pat->length();
-
- if (start_index > subject_length) return -1;
- if (pattern_length == 0) return start_index;
-
- // Searching for one specific character is common. For one
- // character patterns the KMP algorithm is guaranteed to slow down
- // the search, so we just run through the subject string.
- if (pattern_length == 1) {
- uint16_t pattern_char = pat->Get(0);
- for (int i = start_index; i < subject_length; i++) {
- if (sub->Get(i) == pattern_char) {
- return i;
- }
- }
- return -1;
- }
-
- // For small searches, KMP is not worth the setup overhead.
- if (subject_length < 100) {
- // We know our pattern is at least 2 characters, we cache the first so
- // the common case of the first character not matching is faster.
- uint16_t pattern_first_char = pat->Get(0);
- for (int i = start_index; i + pattern_length <= subject_length; i++) {
- if (sub->Get(i) != pattern_first_char) continue;
-
- for (int j = 1; j < pattern_length; j++) {
- if (pat->Get(j) != sub->Get(j + i)) break;
- if (j == pattern_length - 1) return i;
- }
- }
- return -1;
- }
- // For patterns with a larger length we use the KMP algorithm.
- //
- // Compute the 'next' table.
- int* next_table = NewArray<int>(pattern_length);
- ComputeKMPNextTable(pat, next_table);
// Search using the 'next' table.
int pattern_index = 0;
- // We would like to use StringInputBuffer here, but it does not have
- // the ability to start anywhere but the first character of a
- // string. It would be nice to have efficient forward-seeking
- // support on StringInputBuffers.
int subject_index = start_index;
while (subject_index < subject_length) {
- uint16_t subject_char = sub->Get(subject_index);
- while (pattern_index > -1 && pat->Get(pattern_index) != subject_char) {
+ schar subject_char = subject[subject_index];
+ while (pattern_index > -1 && pattern[pattern_index] != subject_char) {
pattern_index = next_table[pattern_index];
}
pattern_index++;
subject_index++;
if (pattern_index >= pattern_length) {
- DeleteArray(next_table);
return subject_index - pattern_index;
}
}
- DeleteArray(next_table);
return -1;
}
+// Dispatch to different algorithms for different length of pattern/subject
+template <typename schar, typename pchar>
+static int StringMatchKMP(Vector<const schar> sub,
+ Vector<const pchar> pat,
+ int start_index) {
+ // Searching for one specific character is common. For one
+ // character patterns the KMP algorithm is guaranteed to slow down
+ // the search, so we just run through the subject string.
+ if (pat.length() == 1) {
+ return SingleCharIndexOf(sub, pat[0], start_index);
+ }
+
+ // For small searches, KMP is not worth the setup overhead.
+ if (sub.length() - start_index < 100) {
+ return SimpleIndexOf(sub, pat, start_index);
+ }
+
+ // For patterns with a larger length we use the KMP algorithm.
+ return KMPIndexOf(sub, pat, start_index);
+}
+
+// Perform string match of pattern on subject, starting at start index.
+// Caller must ensure that 0 <= start_index <= sub->length(),
+// and should check that pat->length() + start_index <= sub->length()
+int Runtime::StringMatchKmp(Handle<String> sub,
+ Handle<String> pat,
+ int start_index) {
+ ASSERT(0 <= start_index);
+ ASSERT(start_index <= sub->length());
+
+ if (pat->length() == 0) return start_index;
+ FlattenString(sub);
+ FlattenString(pat);
+
+ AssertNoAllocation no_heap_allocation; // ensure vectors stay valid
+ // dispatch on type of strings
+ if (pat->is_ascii()) {
+ Vector<const char> pat_vector = ToAsciiVector(*pat);
+ if (sub->is_ascii()) {
+ return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
+ }
+ return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
+ }
+ Vector<const uc16> pat_vector = ToUC16Vector(*pat);
+ if (sub->is_ascii()) {
+ return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
+ }
+ return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
+}
+
static Object* Runtime_StringIndexOf(Arguments args) {
- NoHandleAllocation ha;
+ HandleScope scope; // create a new handle scope
ASSERT(args.length() == 3);
- CONVERT_CHECKED(String, sub, args[0]);
- CONVERT_CHECKED(String, pat, args[1]);
+ CONVERT_ARG_CHECKED(String, sub, 0);
+ CONVERT_ARG_CHECKED(String, pat, 1);
+
Object* index = args[2];
uint32_t start_index;
if (!Array::IndexFromObject(index, &start_index)) return
Smi::FromInt(-1);
- return Smi::FromInt(Runtime::StringMatchKmp(sub, pat, start_index));
+ int position = Runtime::StringMatchKmp(sub, pat, start_index);
+ return Smi::FromInt(position);
}
Modified: branches/bleeding_edge/src/runtime.h
==============================================================================
--- branches/bleeding_edge/src/runtime.h (original)
+++ branches/bleeding_edge/src/runtime.h Tue Oct 7 01:11:44 2008
@@ -332,7 +332,7 @@
// Get the runtime function with the given name.
static Function* FunctionForName(const char* name);
- static int StringMatchKmp(String* sub, String* pat, int index);
+ static int StringMatchKmp(Handle<String> sub, Handle<String> pat, int
index);
// TODO(1240886): The following three methods are *not* handle safe,
// but accept handle arguments. This seems fragile.
--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---