Author: [EMAIL PROTECTED]
Date: Tue Oct  7 06:25:49 2008
New Revision: 463

Modified:
    branches/bleeding_edge/src/jsregexp.cc
    branches/bleeding_edge/src/runtime.cc
    branches/bleeding_edge/src/runtime.h

Log:
KMP algorithm is still left in the source. If this change checks out to be  
faster, it should be removed.


Modified: branches/bleeding_edge/src/jsregexp.cc
==============================================================================
--- branches/bleeding_edge/src/jsregexp.cc      (original)
+++ branches/bleeding_edge/src/jsregexp.cc      Tue Oct  7 06:25:49 2008
@@ -218,7 +218,7 @@
    }

    LOG(RegExpExecEvent(re, start_index, subject));
-  int value = Runtime::StringMatchKmp(subject, needle, start_index);
+  int value = Runtime::StringMatch(subject, needle, start_index);
    if (value == -1) return Factory::null_value();
    Handle<JSArray> result = Factory::NewJSArray(2);
    SetElement(result, 0, Handle<Smi>(Smi::FromInt(value)));
@@ -239,7 +239,7 @@
      LOG(RegExpExecEvent(re, index, subject));
      int value = -1;
      if (index + needle_length <= subject_length) {
-      value = Runtime::StringMatchKmp(subject, needle, index);
+      value = Runtime::StringMatch(subject, needle, index);
      }
      if (value == -1) break;
      HandleScope scope;

Modified: branches/bleeding_edge/src/runtime.cc
==============================================================================
--- branches/bleeding_edge/src/runtime.cc       (original)
+++ branches/bleeding_edge/src/runtime.cc       Tue Oct  7 06:25:49 2008
@@ -1053,6 +1053,64 @@
    return -1;
  }

+// Maximal length (+1) of suffix that is indexed. Also the size of the
+// maximal bad-character skip.
+static const int kBMHSignificantSuffixLength = 0xff;
+
+// Significant bits taken from characters to use in bad-character
+// skips, to reduce size of the table for Unicode letters.
+static const int kBMHSignificantBitsMask = 0xff;
+
+// Number of elements in bad-char table.
+static const int kBMHBadCharCount = kBMHSignificantBitsMask + 1;
+
+// Simplified Boyer-Moore string matching. Only uses bad-char skipping,
+// and restricts table to a suffix of long strings (also restricting
+// the maximum possible skip-length) in order to reduce space.
+template <typename schar, typename pchar>
+static int BoyerMooreHorspoolIndexOf(Vector<const schar> subject,
+                                     Vector<const pchar> pattern,
+                                     int start_index) {
+  ASSERT(kBMHSignificantSuffixLength < 0x100);  // We can use bytes as  
skips.
+  static byte bad_char_map[kBMHBadCharCount];
+
+  int m = pattern.length();
+  int n = subject.length();
+  // Cap bad char table to last p chars of pattern. Also max skip value.
+  int p = m < kBMHSignificantSuffixLength ?  m :  
kBMHSignificantSuffixLength;
+
+  memset(bad_char_map, p, kBMHBadCharCount);
+
+  // Run forwards to populate bad_char_table, so that *last* instance
+  // of character equivalence class is the one registered.
+  // Notice: Doesn't include last character.
+  for (int i = p < m ? m - p : 0; i < m - 1; i++) {
+    pchar c = pattern[i];
+    if (sizeof(schar) == 1 && c > 255) return -1;
+    bad_char_map[c & kBMHSignificantBitsMask] = m - 1 - i;
+  }
+
+  for (int i = start_index + m - 1, j = m - 1; i < n;) {
+    schar c = subject[i];
+    if (c == pattern[j]) {
+      if (j == 0) {
+        return i;
+      }
+      j--;
+      i--;
+    } else {
+      int skip = bad_char_map[c & kBMHSignificantBitsMask];
+      if (skip < (m - j)) {
+        skip = m - j;
+      }
+      i += skip;
+      j = m - 1;
+    }
+  }
+  return -1;
+}
+
+
  // Full KMP pattern match.
  template <typename schar, typename pchar>  // Pattern & subject char types
  static int KMPIndexOf(Vector<const schar> subject,
@@ -1101,35 +1159,40 @@

  // Dispatch to different algorithms for different length of pattern/subject
  template <typename schar, typename pchar>
-static int StringMatchKMP(Vector<const schar> sub,
-                          Vector<const pchar> pat,
-                          int start_index) {
+static int StringMatchStrategy(Vector<const schar> sub,
+                               Vector<const pchar> pat,
+                               int start_index) {
+  int pattern_length = pat.length();
    // Searching for one specific character is common.  For one
    // character patterns the KMP algorithm is guaranteed to slow down
    // the search, so we just run through the subject string.
-  if (pat.length() == 1) {
+  if (pattern_length == 1) {
      return SingleCharIndexOf(sub, pat[0], start_index);
    }

-  // For small searches, KMP is not worth the setup overhead.
-  if (sub.length() - start_index < 100) {
+  // For small searches, a complex sort is not worth the setup overhead.
+  if (sub.length() - start_index < 25) {
      return SimpleIndexOf(sub, pat, start_index);
    }

-  // For patterns with a larger length we use the KMP algorithm.
-  return KMPIndexOf(sub, pat, start_index);
+  return BoyerMooreHorspoolIndexOf(sub, pat, start_index);
  }

  // Perform string match of pattern on subject, starting at start index.
  // Caller must ensure that 0 <= start_index <= sub->length(),
  // and should check that pat->length() + start_index <= sub->length()
-int Runtime::StringMatchKmp(Handle<String> sub,
-                            Handle<String> pat,
-                            int start_index) {
+int Runtime::StringMatch(Handle<String> sub,
+                         Handle<String> pat,
+                         int start_index) {
    ASSERT(0 <= start_index);
    ASSERT(start_index <= sub->length());

-  if (pat->length() == 0) return start_index;
+  int pattern_length = pat->length();
+  if (pattern_length == 0) return start_index;
+
+  int subject_length = sub->length();
+  if (start_index + pattern_length > subject_length) return -1;
+
    FlattenString(sub);
    FlattenString(pat);

@@ -1138,15 +1201,15 @@
    if (pat->is_ascii()) {
      Vector<const char> pat_vector = ToAsciiVector(*pat);
      if (sub->is_ascii()) {
-      return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
+      return StringMatchStrategy(ToAsciiVector(*sub), pat_vector,  
start_index);
      }
-    return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
+    return StringMatchStrategy(ToUC16Vector(*sub), pat_vector,  
start_index);
    }
    Vector<const uc16> pat_vector = ToUC16Vector(*pat);
    if (sub->is_ascii()) {
-    return StringMatchKMP(ToAsciiVector(*sub), pat_vector, start_index);
+    return StringMatchStrategy(ToAsciiVector(*sub), pat_vector,  
start_index);
    }
-  return StringMatchKMP(ToUC16Vector(*sub), pat_vector, start_index);
+  return StringMatchStrategy(ToUC16Vector(*sub), pat_vector, start_index);
  }


@@ -1161,7 +1224,7 @@
    uint32_t start_index;
    if (!Array::IndexFromObject(index, &start_index)) return  
Smi::FromInt(-1);

-  int position = Runtime::StringMatchKmp(sub, pat, start_index);
+  int position = Runtime::StringMatch(sub, pat, start_index);
    return Smi::FromInt(position);
  }


Modified: branches/bleeding_edge/src/runtime.h
==============================================================================
--- branches/bleeding_edge/src/runtime.h        (original)
+++ branches/bleeding_edge/src/runtime.h        Tue Oct  7 06:25:49 2008
@@ -335,7 +335,7 @@
    // Get the runtime function with the given name.
    static Function* FunctionForName(const char* name);

-  static int StringMatchKmp(Handle<String> sub, Handle<String> pat, int  
index);
+  static int StringMatch(Handle<String> sub, Handle<String> pat, int  
index);

    // TODO(1240886): The following three methods are *not* handle safe,
    // but accept handle arguments. This seems fragile.

--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

Reply via email to