Reviewers: olehougaard,

Message:
Small code review.

Description:
Changed Boyer-Moore's bad-char table code:
- Reduce it to half size if the pattern is ASCII, saving on
initialization
- If pattern is ASCII and subject is not, any non-ASCII char can cause a
   full pattern-length shift, even if we haven't indexed the entire
pattern.
- Use memset to initialize buffer in the common case where the pattern
is
   shorter than the max significant suffix limit.

Please review this at http://codereview.chromium.org/7621

Affected files:
   M src/runtime.cc


Index: src/runtime.cc
diff --git a/src/runtime.cc b/src/runtime.cc
index  
bcdd76decaa0f23c795f50853838371d292278c4..bc1152142afade11849fc0dcf0c0ee96fe811ab3
  
100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -965,7 +965,7 @@ static const int kBMMinPatternLength = 5;
  // Holds the two buffers used by Boyer-Moore string search's Good Suffix
  // shift. Only allows the last kBMMaxShift characters of the needle
  // to be indexed.
-class BMGoodSuffixBuffers: public AllStatic {
+class BMGoodSuffixBuffers {
   public:
    BMGoodSuffixBuffers() {}
    inline void init(int needle_length) {
@@ -1005,12 +1005,18 @@ static void  
BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern,
    // Run forwards to populate bad_char_table, so that *last* instance
    // of character equivalence class is the one registered.
    // Notice: Doesn't include the last character.
-  for (int i = 0; i < kBMAlphabetSize; i++) {
-    bad_char_occurence[i] = start - 1;
+  int table_size = (sizeof(pchar) == 1) ? String::kMaxAsciiCharCode + 1
+                                        : kBMAlphabetSize;
+  if (start == 0) {  // All patterns less than kBMMaxShift in length.
+    memset(bad_char_occurence, -1, table_size *  
sizeof(*bad_char_occurence));
+  } else {
+    for (int i = 0; i < table_size; i++) {
+      bad_char_occurence[i] = start - 1;
+    }
    }
    for (int i = start; i < pattern.length() - 1; i++) {
      pchar c = pattern[i];
-    int bucket = c % kBMAlphabetSize;
+    int bucket = (sizeof(pchar) ==1) ? c : c % kBMAlphabetSize;
      bad_char_occurence[bucket] = i;
    }
  }
@@ -1065,6 +1071,19 @@ static void  
BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern,
    }
  }

+template <typename schar, typename pchar>
+static inline int CharOccurence(int char_code) {
+  if (sizeof(schar) == 1) {
+    return bad_char_occurence[char_code];
+  }
+  if (sizeof(pchar) == 1) {
+    if (char_code > String::kMaxAsciiCharCode) {
+      return -1;
+    }
+    return bad_char_occurence[char_code];
+  }
+  return bad_char_occurence[char_code % kBMAlphabetSize];
+}

  // Restricted simplified Boyer-Moore string matching. Restricts tables to a
  // suffix of long pattern strings and handles only equivalence classes
@@ -1090,7 +1109,7 @@ static int BoyerMooreSimplified(Vector<const schar>  
subject,
      int j = m - 1;
      int c;
      while (last_char != (c = subject[idx + j])) {
-      int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
+      int bc_occ = CharOccurence<schar, pchar>(c);
        int shift = j - bc_occ;
        idx += shift;
        badness += 1 - shift;  // at most zero, so badness cannot increase.
@@ -1105,7 +1124,7 @@ static int BoyerMooreSimplified(Vector<const schar>  
subject,
        complete = true;
        return idx;
      } else {
-      int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
+      int bc_occ = CharOccurence<schar, pchar>(c);
        int shift = bc_occ < j ? j - bc_occ : 1;
        idx += shift;
        // Badness increases by the number of characters we have
@@ -1141,7 +1160,7 @@ static int BoyerMooreIndexOf(Vector<const schar>  
subject,
      int j = m - 1;
      schar c;
      while (last_char != (c = subject[idx + j])) {
-      int shift = j - bad_char_occurence[c % kBMAlphabetSize];
+      int shift = j - CharOccurence<schar, pchar>(c);
        idx += shift;
        if (idx > n - m) {
          return -1;
@@ -1155,7 +1174,7 @@ static int BoyerMooreIndexOf(Vector<const schar>  
subject,
        idx += 1;
      } else {
        int gs_shift = bmgs_buffers.shift(j + 1);       // Good suffix shift.
-      int bc_occ = bad_char_occurence[c % kBMAlphabetSize];
+      int bc_occ = CharOccurence<schar, pchar>(c);
        int shift = j - bc_occ;                         // Bad-char shift.
        shift = (gs_shift > shift) ? gs_shift : shift;
        idx += shift;



--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

Reply via email to