Reviewers: danno, Michael Starzinger,

Description:
Speedup stringsearch for two byte strings

Uses the lower byte with memchr which is
significantly faster than a naive compare

Performance difference with bench (http://hastebin.com/xuxexataso.js):

old                             new

single character                single character
Κ found at 922                  Κ found at 922
3324                            616
㎡ found at 13217               ㎡ found at 13217
42366                           4931
က found at 4096                 က found at 4096
13369                           9836
＀ found at 65280                ＀ found at 65280
207472                          36149
ᆬ found at 65445                ᆬ found at 65445
209344                          36666
  found at 8197                   found at 8197
26731                           11757
倂 found at 20482               倂 found at 20482
66071                           17193

linear search                   linear search
ΚΛ found at 922                 ΚΛ found at 922
4112                            504
㎡㎢ found at 13217             ㎡㎢ found at 13217
55105                           5119
ᆬᆭ found at 65445               ᆬᆭ found at 65445
268016                          35496

linear + bmh search             linear + bmh search
ΚΛΜΝΞΟΠΡ found at 922           ΚΛΜΝΞΟΠΡ found at 922
2897                            522
ᆬᆭᄃᄄᄅᆰᆱᆲ found at 65445         ᆬᆭᄃᄄᄅᆰᆱᆲ found at 65445
167687                          158465

I experimented a bit with it for nodejs.
The original committer ([email protected]) or reviewer ([email protected])
of string-search.h did not seem active anymore on v8.

BUG=

Please review this at https://codereview.chromium.org/1303033012/

Base URL: https://chromium.googlesource.com/v8/v8.git@master

Affected files (+38, -33 lines):
  M src/string-search.h


Index: src/string-search.h
diff --git a/src/string-search.h b/src/string-search.h
index 349d4fd2db72918560fd0e784bf9ea4664c803c2..192261655e5aa7c776d3071ca91adeb6fcefd68a 100644
--- a/src/string-search.h
+++ b/src/string-search.h
@@ -189,11 +189,37 @@ class StringSearch : private StringSearchBase {
   int start_;
 };

+template <typename T, typename U>
+inline T AlignDown(T value, U alignment) {
+  return reinterpret_cast<T>(
+      (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
+}
+
+template <typename PatternChar, typename SubjectChar>
+inline int FindFirstByte(Vector<const PatternChar> pattern,
+                         Vector<const SubjectChar> subject, int index) {
+  PatternChar pattern_first_char = pattern[0];
+
+  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
+    const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(memchr(
+ subject.start() + index, pattern_first_char, subject.length() - index));
+    if (pos == NULL) return -1;
+    return static_cast<int>(pos - subject.start());
+  } else {
+ uint8_t search_low_byte = static_cast<uint8_t>(pattern_first_char & 0xFF);
+    const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
+        memchr(subject.start() + index, search_low_byte,
+               (subject.length() - index) * sizeof(SubjectChar)));
+    if (pos == NULL) return -1;
+    pos = AlignDown(pos, sizeof(SubjectChar));
+    return static_cast<int>(pos - subject.start());
+  }
+  return -1;
+}

 //---------------------------------------------------------------------
 // Single Character Pattern Search Strategy
 //---------------------------------------------------------------------
-
 template <typename PatternChar, typename SubjectChar>
 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
     StringSearch<PatternChar, SubjectChar>* search,
@@ -201,14 +227,8 @@ int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
     int index) {
   DCHECK_EQ(1, search->pattern_.length());
   PatternChar pattern_first_char = search->pattern_[0];
-  int i = index;
   if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
-    const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
-        memchr(subject.start() + i,
-               pattern_first_char,
-               subject.length() - i));
-    if (pos == NULL) return -1;
-    return static_cast<int>(pos - subject.start());
+    return FindFirstByte(search->pattern_, subject, index);
   } else {
     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
       if (exceedsOneByte(pattern_first_char)) {
@@ -216,8 +236,12 @@ int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
       }
     }
     SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
-    int n = subject.length();
+    const int n = subject.length();
+    int i = index;
     while (i < n) {
+      i = FindFirstByte(search->pattern_, subject, i);
+      if (i == -1) return -1;
+
       if (subject[i++] == search_char) return i - 1;
     }
     return -1;
@@ -254,20 +278,12 @@ int StringSearch<PatternChar, SubjectChar>::LinearSearch(
   Vector<const PatternChar> pattern = search->pattern_;
   DCHECK(pattern.length() > 1);
   int pattern_length = pattern.length();
-  PatternChar pattern_first_char = pattern[0];
   int i = index;
   int n = subject.length() - pattern_length;
   while (i <= n) {
-    if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
-      const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
-          memchr(subject.start() + i,
-                 pattern_first_char,
-                 n - i + 1));
-      if (pos == NULL) return -1;
-      i = static_cast<int>(pos - subject.start()) + 1;
-    } else {
-      if (subject[i++] != pattern_first_char) continue;
-    }
+    i = FindFirstByte(pattern, subject, i);
+    if (i == -1) return -1;
+    i++;
     // Loop extracted to separate function to allow using return to do
     // a deeper break.
     if (CharCompare(pattern.start() + 1,
@@ -505,22 +521,11 @@ int StringSearch<PatternChar, SubjectChar>::InitialSearch(

   // We know our pattern is at least 2 characters, we cache the first so
   // the common case of the first character not matching is faster.
-  PatternChar pattern_first_char = pattern[0];
   for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
     badness++;
     if (badness <= 0) {
-      if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
-        const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
-            memchr(subject.start() + i,
-                   pattern_first_char,
-                   n - i + 1));
-        if (pos == NULL) {
-          return -1;
-        }
-        i = static_cast<int>(pos - subject.start());
-      } else {
-        if (subject[i] != pattern_first_char) continue;
-      }
+      i = FindFirstByte(pattern, subject, i);
+      if (i == -1) return -1;
       int j = 1;
       do {
         if (pattern[j] != subject[i + j]) {


--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
--- You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

Reply via email to