Revision: 11218
Author:   [email protected]
Date:     Tue Apr  3 05:24:55 2012
Log:      Switch regexp strategy for regexps that are just plain
strings with a small alphabet.  We already have code
that handles these regexps well, we were just not always
activating it.
Review URL: https://chromiumcodereview.appspot.com/9959096
http://code.google.com/p/v8/source/detail?r=11218

Modified:
 /branches/bleeding_edge/src/jsregexp.cc

=======================================
--- /branches/bleeding_edge/src/jsregexp.cc     Mon Apr  2 02:38:07 2012
+++ /branches/bleeding_edge/src/jsregexp.cc     Tue Apr  3 05:24:55 2012
@@ -106,6 +106,36 @@
   Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
   isolate->Throw(*regexp_err);
 }
+
+
+// More makes code generation slower, less makes V8 benchmark score lower.
+const int kMaxLookaheadForBoyerMoore = 8;
+// In a 3-character pattern you can maximally step forwards 3 characters
+// at a time, which is not always enough to pay for the extra logic.
+const int kPatternTooShortForBoyerMoore = 2;
+
+
+// Identifies the sort of regexps where the regexp engine is faster
+// than the code used for atom matches.
+static bool HasFewDifferentCharacters(Handle<String> pattern) {
+  int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
+  if (length <= kPatternTooShortForBoyerMoore) return false;
+  const int kMod = 128;
+  bool character_found[kMod];
+  int different = 0;
+  memset(&character_found[0], 0, sizeof(character_found));
+  for (int i = 0; i < length; i++) {
+    int ch = (pattern->Get(i) & (kMod - 1));
+    if (!character_found[ch]) {
+      character_found[ch] = true;
+      different++;
+ // We declare a regexp low-alphabet if it has at least 3 times as many
+      // characters as it has different characters.
+      if (different * 3 > length) return false;
+    }
+  }
+  return true;
+}


 // Generic RegExp methods. Dispatches to implementation specific methods.
@@ -141,9 +171,14 @@
     return Handle<Object>::null();
   }

-  if (parse_result.simple && !flags.is_ignore_case()) {
+  bool has_been_compiled = false;
+
+  if (parse_result.simple &&
+      !flags.is_ignore_case() &&
+      !HasFewDifferentCharacters(pattern)) {
     // Parse-tree is a single atom that is equal to the pattern.
     AtomCompile(re, pattern, flags, pattern);
+    has_been_compiled = true;
   } else if (parse_result.tree->IsAtom() &&
       !flags.is_ignore_case() &&
       parse_result.capture_count == 0) {
@@ -151,8 +186,12 @@
     Vector<const uc16> atom_pattern = atom->data();
     Handle<String> atom_string =
         isolate->factory()->NewStringFromTwoByte(atom_pattern);
-    AtomCompile(re, pattern, flags, atom_string);
-  } else {
+    if (!HasFewDifferentCharacters(atom_string)) {
+      AtomCompile(re, pattern, flags, atom_string);
+      has_been_compiled = true;
+    }
+  }
+  if (!has_been_compiled) {
     IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
   }
   ASSERT(re->data()->IsFixedArray());
@@ -3428,6 +3467,7 @@

   return true;
 }
+

 /* Code generation for choice nodes.
  *
@@ -3507,7 +3547,6 @@
  *        \______________/
  */

-
 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
   int choice_count = alternatives_->length();
@@ -3578,9 +3617,6 @@

   bool skip_was_emitted = false;

- // More makes code generation slower, less makes V8 benchmark score lower.
-  const int kMaxLookaheadForBoyerMoore = 8;
-
   if (!greedy_loop && choice_count == 2) {
     GuardedAlternative alt1 = alternatives_->at(1);
     if (alt1.guards() == NULL || alt1.guards()->length() == 0) {

--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

Reply via email to