Author: [EMAIL PROTECTED]
Date: Tue Oct  7 01:11:44 2008
New Revision: 452

Modified:
    branches/bleeding_edge/src/jsregexp.cc
    branches/bleeding_edge/src/objects-inl.h
    branches/bleeding_edge/src/objects.h
    branches/bleeding_edge/src/runtime.cc
    branches/bleeding_edge/src/runtime.h

Log:
Fast direct-access version of KPM string match.


Modified: branches/bleeding_edge/src/jsregexp.cc
==============================================================================
--- branches/bleeding_edge/src/jsregexp.cc      (original)
+++ branches/bleeding_edge/src/jsregexp.cc      Tue Oct  7 01:11:44 2008
@@ -218,7 +218,7 @@
    }

    LOG(RegExpExecEvent(re, start_index, subject));
-  int value = Runtime::StringMatchKmp(*subject, *needle, start_index);
+  int value = Runtime::StringMatchKmp(subject, needle, start_index);
    if (value == -1) return Factory::null_value();
    Handle<JSArray> result = Factory::NewJSArray(2);
    SetElement(result, 0, Handle<Smi>(Smi::FromInt(value)));
@@ -231,13 +231,16 @@
                                            Handle<String> subject) {
    Handle<String> needle(String::cast(re->data()));
    Handle<JSArray> result = Factory::NewJSArray(1);
-  bool keep_going = true;
    int index = 0;
    int match_count = 0;
+  int subject_length = subject->length();
    int needle_length = needle->length();
-  while (keep_going) {
+  while (true) {
      LOG(RegExpExecEvent(re, index, subject));
-    int value = Runtime::StringMatchKmp(*subject, *needle, index);
+    int value = -1;
+    if (index + needle_length <= subject_length) {
+      value = Runtime::StringMatchKmp(subject, needle, index);
+    }
      if (value == -1) break;
      HandleScope scope;
      int end = value + needle_length;

Modified: branches/bleeding_edge/src/objects-inl.h
==============================================================================
--- branches/bleeding_edge/src/objects-inl.h    (original)
+++ branches/bleeding_edge/src/objects-inl.h    Tue Oct  7 01:11:44 2008
@@ -1354,6 +1354,11 @@
  }


+Address TwoByteString::GetCharsAddress() {
+  return FIELD_ADDR(this, kHeaderSize);
+}
+
+
  uint16_t TwoByteString::TwoByteStringGet(int index) {
    ASSERT(index >= 0 && index < length());
    return READ_SHORT_FIELD(this, kHeaderSize + index * kShortSize);

Modified: branches/bleeding_edge/src/objects.h
==============================================================================
--- branches/bleeding_edge/src/objects.h        (original)
+++ branches/bleeding_edge/src/objects.h        Tue Oct  7 01:11:44 2008
@@ -3138,6 +3138,9 @@
    inline uint16_t TwoByteStringGet(int index);
    inline void TwoByteStringSet(int index, uint16_t value);

+  // Get the address of the characters in this string.
+  inline Address GetCharsAddress();
+
    // For regexp code.
    const uint16_t* TwoByteStringGetData(unsigned start);


Modified: branches/bleeding_edge/src/runtime.cc
==============================================================================
--- branches/bleeding_edge/src/runtime.cc       (original)
+++ branches/bleeding_edge/src/runtime.cc       Tue Oct  7 01:11:44 2008
@@ -42,6 +42,7 @@
  #include "runtime.h"
  #include "scopeinfo.h"
  #include "v8threads.h"
+#include "smart-pointer.h"

  namespace v8 { namespace internal {

@@ -927,110 +928,217 @@
  }


-static inline void ComputeKMPNextTable(String* pattern, int next_table[]) {
+static Vector<const char> ToAsciiVector(String *string) {
+  ASSERT(string->IsAscii());
+  ASSERT(string->IsFlat());
+
+  int offset = 0;
+  int length = string->length();
+  StringRepresentationTag string_tag = string->representation_tag();
+  if (string_tag == kSlicedStringTag) {
+      SlicedString* sliced = SlicedString::cast(string);
+      offset += sliced->start();
+      string = String::cast(sliced->buffer());
+      string_tag = string->representation_tag();
+  } else if (string_tag == kConsStringTag) {
+      ConsString* cons = ConsString::cast(string);
+      ASSERT(String::cast(cons->second())->length() == 0);
+      string = String::cast(cons->first());
+      string_tag = string->representation_tag();
+  }
+  if (string_tag == kSeqStringTag) {
+    AsciiString* seq = AsciiString::cast(string);
+    char* start = reinterpret_cast<char*>(seq->GetCharsAddress());
+    return Vector<const char>(start + offset, length);
+  }
+  ASSERT(string_tag == kExternalStringTag);
+  ExternalAsciiString* ext = ExternalAsciiString::cast(string);
+  const char* start = ext->resource()->data();
+  return Vector<const char>(start + offset, length);
+}
+
+
+static Vector<const uc16> ToUC16Vector(String *string) {
+  ASSERT(string->IsTwoByteString());
+  ASSERT(string->IsFlat());
+
+  int offset = 0;
+  int length = string->length();
+
+  StringRepresentationTag string_tag = string->representation_tag();
+  if (string_tag == kSlicedStringTag) {
+      SlicedString* sliced = SlicedString::cast(string);
+      offset += sliced->start();
+      string = String::cast(sliced->buffer());
+      string_tag = string->representation_tag();
+  } else if (string_tag == kConsStringTag) {
+      ConsString* cons = ConsString::cast(string);
+      ASSERT(String::cast(cons->second())->length() == 0);
+      string = String::cast(cons->first());
+      string_tag = string->representation_tag();
+  }
+  if (string_tag == kSeqStringTag) {
+    TwoByteString* seq = TwoByteString::cast(string);
+    uc16* start = reinterpret_cast<uc16*>(seq->GetCharsAddress());
+    return Vector<const uc16>(start + offset, length);
+  }
+  ASSERT(string_tag == kExternalStringTag);
+  ExternalTwoByteString* ext = ExternalTwoByteString::cast(string);
+  const uc16* start =
+      reinterpret_cast<const uc16*>(ext->resource()->data());
+  return Vector<const uc16>(start + offset, length);
+}
+
+
+template <typename schar, typename pchar>
+static int SingleCharIndexOf(Vector<const schar> string,
+                             pchar pattern_char,
+                             int start_index) {
+  for (int i = start_index, n = string.length(); i < n; i++) {
+    if (pattern_char == string[i]) {
+      return i;
+    }
+  }
+  return -1;
+}
+
+// Trivial string search for shorter strings.
+template <typename pchar, typename schar>
+static int SimpleIndexOf(Vector<const schar> subject,
+                         Vector<const pchar> pattern,
+                         int start_index) {
+  int pattern_length = pattern.length();
+  int subject_length = subject.length();
+  // We know our pattern is at least 2 characters, we cache the first so
+  // the common case of the first character not matching is faster.
+  pchar pattern_first_char = pattern[0];
+  for (int i = start_index, n = subject_length - pattern_length; i <= n;  
i++) {
+    if (subject[i] != pattern_first_char) continue;
+
+    bool failure = false;
+    for (int j = 1; j < pattern_length; j++) {
+      if (pattern[j] != subject[j+i]) {
+        failure = true;
+        break;
+      }
+    }
+    if (!failure) {
+      return i;
+    }
+  }
+  return -1;
+}
+
+// Full KMP pattern match.
+template <typename schar, typename pchar>  // Pattern & subject char types
+static int KMPIndexOf(Vector<const schar> subject,
+                      Vector<const pchar> pattern,
+                      int start_index) {
+  int subject_length = subject.length();
+  int pattern_length = pattern.length();
+  SmartPointer<int> next_table(NewArray<int>(pattern_length));
+
+  // Compute KMP "next" table
    int i = 0;
    int j = -1;
    next_table[0] = -1;

-  Access<StringInputBuffer> buffer(&string_input_buffer);
-  buffer->Reset(pattern);
-  int length = pattern->length();
-  uint16_t p = buffer->GetNext();
-  while (i < length - 1) {
-    while (j > -1 && p != pattern->Get(j)) {
+  pchar p = pattern[0];
+  while (i < pattern_length - 1) {
+    while (j > -1 && p != pattern[j]) {
        j = next_table[j];
      }
      i++;
      j++;
-    p = buffer->GetNext();
-    if (p == pattern->Get(j)) {
+    p = pattern[i];
+    if (p == pattern[j]) {
        next_table[i] = next_table[j];
      } else {
        next_table[i] = j;
      }
    }
-}
-
-
-int Runtime::StringMatchKmp(String* sub, String* pat, int start_index) {
-  sub->TryFlatten();
-  pat->TryFlatten();
-
-  int subject_length = sub->length();
-  int pattern_length = pat->length();
-
-  if (start_index > subject_length) return -1;
-  if (pattern_length == 0) return start_index;
-
-  // Searching for one specific character is common.  For one
-  // character patterns the KMP algorithm is guaranteed to slow down
-  // the search, so we just run through the subject string.
-  if (pattern_length == 1) {
-    uint16_t pattern_char = pat->Get(0);
-    for (int i = start_index; i < subject_length; i++) {
-      if (sub->Get(i) == pattern_char) {
-        return i;
-      }
-    }
-    return -1;
-  }
-
-  // For small searches, KMP is not worth the setup overhead.
-  if (subject_length < 100) {
-    // We know our pattern is at least 2 characters, we cache the first so
-    // the common case of the first character not matching is faster.
-    uint16_t pattern_first_char = pat->Get(0);
-    for (int i = start_index; i + pattern_length <= subject_length; i++) {
-      if (sub->Get(i) != pattern_first_char) continue;
-
-      for (int j = 1; j < pattern_length; j++) {
-        if (pat->Get(j) != sub->Get(j + i)) break;
-        if (j == pattern_length - 1) return i;
-      }
-    }
-    return -1;
-  }

-  // For patterns with a larger length we use the KMP algorithm.
-  //
-  // Compute the 'next' table.
-  int* next_table = NewArray<int>(pattern_length);
-  ComputeKMPNextTable(pat, next_table);
    // Search using the 'next' table.
    int pattern_index = 0;
-  // We would like to use StringInputBuffer here, but it does not have
-  // the ability to start anywhere but the first character of a
-  // string.  It would be nice to have efficient forward-seeking
-  // support on StringInputBuffers.
    int subject_index = start_index;
    while (subject_index < subject_length) {
-    uint16_t subject_char = sub->Get(subject_index);
-    while (pattern_index > -1 && pat->Get(pattern_index) != subject_char) {
+    schar subject_char = subject[subject_index];
+    while (pattern_index > -1 && pattern[pattern_index] != subject_char) {
        pattern_index = next_table[pattern_index];
      }
      pattern_index++;
      subject_index++;
      if (pattern_index >= pattern_length) {
-      DeleteArray(next_table);
        return subject_index - pattern_index;
      }
    }
-  DeleteArray(next_table);
    return -1;
  }

+// Dispatch to different algorithms for different length of pattern/subject
+template <typename schar, typename pchar>
+static int StringMatchKMP(Vector<const schar> sub,
+                          Vector<const pchar> pat,
+                          int start_index) {
+  // Searching for one specific character is common.  For one
+  // character patterns the KMP algorithm is guaranteed to slow down
+  // the search, so we just run through the subject string.
+  if (pat.length() == 1) {
+    return SingleCharIndexOf(sub, pat[0], start_index);
+  }
+
+  // For small searches, KMP is not worth the setup overhead.
+  if (sub.length() - start_index < 100) {
+    return SimpleIndexOf(sub, pat, start_index);
+  }
+
+  // For patterns with a larger length we use the KMP algorithm.
+  return KMPIndexOf(sub, pat, start_index);
+}
+
+// Perform string match of pattern on subject, starting at start index.
+// Caller must ensure that 0 <= start_index <= sub->length(),
+// and should check that pat->length() + start_index <= sub->length()
+int Runtime::StringMatchKmp(Handle<String> sub,
+                            Handle<String> pat,
+                            int start_index) {
+  ASSERT(0 <= start_index);
+  ASSERT(start_index <= sub->length());
+
+  if (pat->length() == 0) return start_index;
+  FlattenString(sub);
+  FlattenString(pat);
+
+  AssertNoAllocation no_heap_allocation;  // ensure vectors stay valid
+  // dispatch on type of strings
+  if (pat->is_ascii()) {
+    Vector<const char> pat_vector = ToAsciiVector(*pat);
+    if (sub->is_ascii()) {
+      return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
+    }
+    return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
+  }
+  Vector<const uc16> pat_vector = ToUC16Vector(*pat);
+  if (sub->is_ascii()) {
+    return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
+  }
+  return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
+}
+

  static Object* Runtime_StringIndexOf(Arguments args) {
-  NoHandleAllocation ha;
+  HandleScope scope;  // create a new handle scope
    ASSERT(args.length() == 3);

-  CONVERT_CHECKED(String, sub, args[0]);
-  CONVERT_CHECKED(String, pat, args[1]);
+  CONVERT_ARG_CHECKED(String, sub, 0);
+  CONVERT_ARG_CHECKED(String, pat, 1);
+
    Object* index = args[2];
    uint32_t start_index;
    if (!Array::IndexFromObject(index, &start_index)) return  
Smi::FromInt(-1);

-  return Smi::FromInt(Runtime::StringMatchKmp(sub, pat, start_index));
+  int position = Runtime::StringMatchKmp(sub, pat, start_index);
+  return Smi::FromInt(position);
  }



Modified: branches/bleeding_edge/src/runtime.h
==============================================================================
--- branches/bleeding_edge/src/runtime.h        (original)
+++ branches/bleeding_edge/src/runtime.h        Tue Oct  7 01:11:44 2008
@@ -332,7 +332,7 @@
    // Get the runtime function with the given name.
    static Function* FunctionForName(const char* name);

-  static int StringMatchKmp(String* sub, String* pat, int index);
+  static int StringMatchKmp(Handle<String> sub, Handle<String> pat, int  
index);

    // TODO(1240886): The following three methods are *not* handle safe,
    // but accept handle arguments. This seems fragile.

--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

Reply via email to