[v8-dev] [v8] r11204 committed - Regexp: Improve the speed that we scan for an initial point where a no...

codesite-noreply Mon, 02 Apr 2012 02:38:59 -0700

Revision: 11204
Author:   erikcorry
Date:     Mon Apr  2 02:38:07 2012

Log: Regexp: Improve the speed that we scan for an initial point wherea non-anchoredregexp can match by using a Boyer-Moore-like table. This is done byidentifyingnon-greedy non-capturing loops in the nodes that eat any character one at atime.For example in the middle of the regexp /foo[\s\S]*?bar/ we find such aloop.There is also such a loop implicitly inserted at the start of anynon-anchored

regexp.


When we have found such a loop we look ahead in the nodes to find the set of
characters that can come at given distances.  For example for the regexp

/.?foo/ we know that there are at least 3 characters ahead of us, and thesets

of characters that can occur are [any, [f, o], [o]].  We find a range in the

lookahead info where the set of characters is reasonably constrained. Inour

example this is from index 1 to 2 (0 is not constrained).  We can now look 3
characters ahead and if we don't find one of [f, o] (the union of [f, o] and
[o]) then we can skip forwards by the range size (in this case 2).

For Unicode input strings we do the same, but modulo 128.

We also look at the first string fed to the regexp and use that to get ahint

of the character frequencies in the inputs.  This affects the assessment of
whether the set of characters is 'reasonably constrained'.

We still have the old lookahead mechanism, which uses a wide load ofmultiple

characters followed by a mask and compare to determine whether a match is
possible at this point.
Review URL: http://codereview.chromium.org/9965010
http://code.google.com/p/v8/source/detail?r=11204

Modified:
 /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc
 /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc
 /branches/bleeding_edge/src/jsregexp.cc
 /branches/bleeding_edge/src/jsregexp.h
 /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc
 /branches/bleeding_edge/test/cctest/test-regexp.cc

=======================================

--- /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Fri Mar30 00:43:48 2012+++ /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Mon Apr2 02:38:07 2012

@@ -452,8 +452,12 @@
 void RegExpMacroAssemblerARM::CheckCharacterAfterAnd(uint32_t c,
                                                      uint32_t mask,
                                                      Label* on_equal) {
-  __ and_(r0, current_character(), Operand(mask));
-  __ cmp(r0, Operand(c));
+  if (c == 0) {
+    __ tst(current_character(), Operand(mask));
+  } else {
+    __ and_(r0, current_character(), Operand(mask));
+    __ cmp(r0, Operand(c));
+  }
   BranchOrBacktrack(eq, on_equal);
 }

@@ -461,8 +465,12 @@
 void RegExpMacroAssemblerARM::CheckNotCharacterAfterAnd(unsigned c,
                                                         unsigned mask,

Label*on_not_equal) {

-  __ and_(r0, current_character(), Operand(mask));
-  __ cmp(r0, Operand(c));
+  if (c == 0) {
+    __ tst(current_character(), Operand(mask));
+  } else {
+    __ and_(r0, current_character(), Operand(mask));
+    __ cmp(r0, Operand(c));
+  }
   BranchOrBacktrack(ne, on_not_equal);
 }

=======================================

--- /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc Fri Mar30 00:43:48 2012+++ /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc MonApr 2 02:38:07 2012

@@ -504,8 +504,8 @@
   if (c == 0) {
     __ test(current_character(), Immediate(mask));
   } else {
-    __ mov(eax, current_character());
-    __ and_(eax, mask);
+    __ mov(eax, mask);
+    __ and_(eax, current_character());
     __ cmp(eax, c);
   }
   BranchOrBacktrack(equal, on_equal);
@@ -518,8 +518,8 @@
   if (c == 0) {
     __ test(current_character(), Immediate(mask));
   } else {
-    __ mov(eax, current_character());
-    __ and_(eax, mask);
+    __ mov(eax, mask);
+    __ and_(eax, current_character());
     __ cmp(eax, c);
   }
   BranchOrBacktrack(not_equal, on_not_equal);
@@ -569,8 +569,8 @@
   __ mov(eax, Immediate(table));
   Register index = current_character();
   if (mode_ != ASCII || kTableMask != String::kMaxAsciiCharCode) {
-    __ mov(ebx, current_character());
-    __ and_(ebx, kTableSize - 1);
+    __ mov(ebx, kTableSize - 1);
+    __ and_(ebx, current_character());
     index = ebx;
   }
   __ cmpb(FieldOperand(eax, index, times_1, ByteArray::kHeaderSize), 0);
=======================================
--- /branches/bleeding_edge/src/jsregexp.cc     Fri Mar 30 00:43:48 2012
+++ /branches/bleeding_edge/src/jsregexp.cc     Mon Apr  2 02:38:07 2012
@@ -280,7 +280,8 @@
 // from the source pattern.
 // If compilation fails, an exception is thrown and this function
 // returns false.

-bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, boolis_ascii) {

+bool RegExpImpl::EnsureCompiledIrregexp(
+    Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
   Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
 #ifdef V8_INTERPRETED_REGEXP
   if (compiled_code->IsByteArray()) return true;
@@ -296,7 +297,7 @@
     ASSERT(compiled_code->IsSmi());
     return true;
   }
-  return CompileIrregexp(re, is_ascii);
+  return CompileIrregexp(re, sample_subject, is_ascii);
 }


@@ -316,7 +317,9 @@
 }


-bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
+bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
+                                 Handle<String> sample_subject,
+                                 bool is_ascii) {
   // Compile the RegExp.
   Isolate* isolate = re->GetIsolate();
   ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
@@ -365,6 +368,7 @@
                             flags.is_ignore_case(),
                             flags.is_multiline(),
                             pattern,
+                            sample_subject,
                             is_ascii);
   if (result.error_message != NULL) {
     // Unable to compile regexp.
@@ -435,7 +439,7 @@

   // Check the asciiness of the underlying storage.
   bool is_ascii = subject->IsAsciiRepresentationUnderneath();
-  if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1;
+  if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;

 #ifdef V8_INTERPRETED_REGEXP
   // Byte-code regexp needs space allocated for all its registers.
@@ -466,7 +470,7 @@
 #ifndef V8_INTERPRETED_REGEXP
   ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
   do {
-    EnsureCompiledIrregexp(regexp, is_ascii);
+    EnsureCompiledIrregexp(regexp, subject, is_ascii);
     Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
     NativeRegExpMacroAssembler::Result res =
         NativeRegExpMacroAssembler::Match(code,
@@ -784,6 +788,53 @@
 }


+class FrequencyCollator {
+ public:
+  FrequencyCollator() : total_samples_(0) {
+    for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
+      frequencies_[i] = CharacterFrequency(i);
+    }
+  }
+
+  void CountCharacter(int character) {
+    int index = (character & RegExpMacroAssembler::kTableMask);
+    frequencies_[index].Increment();
+    total_samples_++;
+  }
+

+ // Does not measure in percent, but rather per-128 (the table size fromthe

+  // regexp macro assembler).
+  int Frequency(int in_character) {

+ ASSERT((in_character & RegExpMacroAssembler::kTableMask) ==in_character);

+    if (total_samples_ < 1) return 1;  // Division by zero.
+    int freq_in_per128 =
+        (frequencies_[in_character].counter() * 128) / total_samples_;
+    return freq_in_per128;
+  }
+
+ private:
+  class CharacterFrequency {
+   public:
+    CharacterFrequency() : counter_(0), character_(-1) { }
+    explicit CharacterFrequency(int character)
+        : counter_(0), character_(character) { }
+
+    void Increment() { counter_++; }
+    int counter() { return counter_; }
+    int character() { return character_; }
+
+   private:
+    int counter_;
+    int character_;
+  };
+
+
+ private:
+  CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
+  int total_samples_;
+};
+
+
 class RegExpCompiler {
  public:
   RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
@@ -819,6 +870,7 @@

   inline bool ignore_case() { return ignore_case_; }
   inline bool ascii() { return ascii_; }
+  FrequencyCollator* frequency_collator() { return &frequency_collator_; }

   int current_expansion_factor() { return current_expansion_factor_; }
   void set_current_expansion_factor(int value) {
@@ -837,6 +889,7 @@
   bool ascii_;
   bool reg_exp_too_big_;
   int current_expansion_factor_;
+  FrequencyCollator frequency_collator_;
 };


@@ -865,7 +918,8 @@
       ignore_case_(ignore_case),
       ascii_(ascii),
       reg_exp_too_big_(false),
-      current_expansion_factor_(1) {
+      current_expansion_factor_(1),
+      frequency_collator_() {
   accept_ = new EndNode(EndNode::ACCEPT);
   ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
 }
@@ -2054,6 +2108,17 @@
                                    recursion_depth + 1,
                                    not_at_start);
 }
+
+
+void ActionNode::FillInBMInfo(int offset,
+                              BoyerMooreLookahead* bm,
+                              bool not_at_start) {
+  if (type_ == BEGIN_SUBMATCH) {
+    bm->SetRest(offset);
+  } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {
+    on_success()->FillInBMInfo(offset, bm, not_at_start);
+  }
+}


 int AssertionNode::EatsAtLeast(int still_to_find,
@@ -2070,6 +2135,14 @@
                                    recursion_depth + 1,
                                    not_at_start);
 }
+
+
+void AssertionNode::FillInBMInfo(
+    int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+  // Match the behaviour of EatsAtLeast on this node.
+  if (type() == AT_START && not_at_start) return;
+  on_success()->FillInBMInfo(offset, bm, not_at_start);
+}


 int BackReferenceNode::EatsAtLeast(int still_to_find,
@@ -2502,6 +2575,16 @@
                                           characters_filled_in,
                                           not_at_start);
 }
+
+
+void LoopChoiceNode::FillInBMInfo(
+    int offset, BoyerMooreLookahead* bm, bool nas) {
+  if (body_can_be_zero_length_) {
+    bm->SetRest(offset);
+    return;
+  }
+  ChoiceNode::FillInBMInfo(offset, bm, nas);
+}


 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
@@ -2998,6 +3081,30 @@
     return elm.cp_offset + elm.data.u_atom->data().length();
   }
 }
+
+
+RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
+    RegExpCompiler* compiler) {
+  if (elms_->length() != 1) return NULL;
+  TextElement elm = elms_->at(0);
+  if (elm.type != TextElement::CHAR_CLASS) return NULL;
+  RegExpCharacterClass* node = elm.data.u_char_class;
+  ZoneList<CharacterRange>* ranges = node->ranges();
+  if (!CharacterRange::IsCanonical(ranges)) {
+    CharacterRange::Canonicalize(ranges);
+  }
+  if (node->is_negated()) {
+    return ranges->length() == 0 ? on_success() : NULL;
+  }
+  if (ranges->length() != 1) return NULL;
+  uint32_t max_char;
+  if (compiler->ascii()) {
+    max_char = String::kMaxAsciiCharCode;
+  } else {
+    max_char = String::kMaxUtf16CodeUnit;
+  }
+  return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
+}


 // Finds the fixed match length of a sequence of nodes that goes from
@@ -3064,8 +3171,8 @@


 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
-                                           bool not_at_start) {
-  int preload_characters = EatsAtLeast(4, 0, not_at_start);
+                                           int eats_at_least) {
+  int preload_characters = Min(4, eats_at_least);
   if (compiler->macro_assembler()->CanReadUnaligned()) {
     bool ascii = compiler->ascii();
     if (ascii) {
@@ -3130,6 +3237,197 @@
   AlternativeGeneration a_few_alt_gens_[kAFew];
 };

+
+BoyerMooreLookahead::BoyerMooreLookahead(
+    int length, int map_length, RegExpCompiler* compiler)
+    : length_(length),
+      map_length_(map_length),
+      compiler_(compiler) {
+  ASSERT(IsPowerOf2(map_length));
+  if (compiler->ascii()) {
+    max_char_ = String::kMaxAsciiCharCode;
+  } else {
+    max_char_ = String::kMaxUtf16CodeUnit;
+  }
+  bitmaps_ = new ZoneList<ZoneList<bool>*>(length);
+  for (int i = 0; i < length; i++) {
+    bitmaps_->Add(new ZoneList<bool>(map_length));
+    ZoneList<bool>* map = bitmaps_->at(i);
+    for (int i = 0; i < map_length; i++) {
+      map->Add(false);
+    }
+  }
+}
+
+

+// Find the longest range of lookahead that has the fewest number ofdifferent+// characters that can occur at a given position. Since we are optimizingtwo

+// different parameters at once this is a tradeoff.
+bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
+  int biggest_points = 0;
+  for (int max_number_of_chars = 4;
+       max_number_of_chars < kTooManyCharacters;
+       max_number_of_chars *= 2) {
+    biggest_points =
+        FindBestInterval(max_number_of_chars, biggest_points, from, to);
+  }
+  if (biggest_points == 0) return false;
+  return true;
+}
+
+
+// Find the highest-points range between 0 and length_ where the character

+// information is not too vague. 'Too vague' means that there are morethan+// max_number_of_chars that can occur at this position. Calculates thenumber

+// of points as the product of width-of-the-range and
+// probability-of-finding-one-of-the-characters, where the probability is

+// calculated using the frequency distribution of the sample subjectstring.

+int BoyerMooreLookahead::FindBestInterval(
+    int max_number_of_chars, int old_biggest_points, int* from, int* to) {
+  int biggest_points = old_biggest_points;
+  static const int kSize = RegExpMacroAssembler::kTableSize;
+  for (int i = 0; i < length_; ) {
+    while (i < length_ && Count(i) > max_number_of_chars) i++;
+    if (i == length_) break;
+    int remembered_from = i;
+    bool union_map[kSize];
+    for (int j = 0; j < kSize; j++) union_map[j] = false;
+    while (i < length_ && Count(i) <= max_number_of_chars) {
+      ZoneList<bool>* map = bitmaps_->at(i);
+      for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
+      i++;
+    }
+    int frequency = 0;
+    for (int j = 0; j < kSize; j++) {
+      if (union_map[j]) {
+        // Add 1 to the frequency to give a small per-character boost for
+        // the cases where our sampling is not good enough and many
+        // characters have a frequency of zero.  This means the frequency
+        // can theoretically be up to 2*kSize though we treat it mostly as
+        // a fraction of kSize.
+        frequency += compiler_->frequency_collator()->Frequency(j) + 1;
+      }
+    }

+ // We use the probability of skipping times the distance we areskipping to

+    // judge the effectiveness of this.  Actually we have a cut-off:  By

+ // dividing by 2 we switch off the skipping if the probability ofskipping

+    // is less than 50%.  This is because the multibyte mask-and-compare
+    // skipping in quickcheck is more likely to do well on this case.
+    bool in_quickcheck_range = ((i - remembered_from < 4) ||

+ (compiler_->ascii() ? remembered_from <= 4 : remembered_from <=2));+ // Called 'probability' but it is only a rough estimate and canactually

+    // be outside the 0-kSize range.

+ int probability = (in_quickcheck_range ? kSize / 2 : kSize) -frequency;

+    int points = (i - remembered_from) * probability;
+    if (points > biggest_points) {
+      *from = remembered_from;
+      *to = i - 1;
+      biggest_points = points;
+    }
+  }
+  return biggest_points;
+}
+
+
+// Take all the characters that will not prevent a successful match if they
+// occur in the subject string in the range between min_lookahead and
+// max_lookahead (inclusive) measured from the current position.  If the

+// character at max_lookahead offset is not one of these characters, thenwe

+// can safely skip forwards by the number of characters in the range.
+int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
+                                      int max_lookahead,

+ Handle<ByteArray>boolean_skip_table) {

+  const int kSize = RegExpMacroAssembler::kTableSize;
+
+  const int kSkipArrayEntry = 0;
+  const int kDontSkipArrayEntry = 1;
+
+  for (int i = 0; i < kSize; i++) {
+    boolean_skip_table->set(i, kSkipArrayEntry);
+  }
+  int skip = max_lookahead + 1 - min_lookahead;
+
+  for (int i = max_lookahead; i >= min_lookahead; i--) {
+    ZoneList<bool>* map = bitmaps_->at(i);
+    for (int j = 0; j < map_length_; j++) {
+      if (map->at(j)) {
+        boolean_skip_table->set(j, kDontSkipArrayEntry);
+      }
+    }
+  }
+
+  return skip;
+}
+
+
+// See comment above on the implementation of GetSkipTable.

+bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm){

+  int min_lookahead = 0;
+  int max_lookahead = 0;
+

+ if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) returnfalse;

+
+  bool found_single_character = false;
+  bool abandoned_search_for_single_character = false;
+  int single_character = 0;
+  for (int i = max_lookahead; i >= min_lookahead; i--) {
+    ZoneList<bool>* map = bitmaps_->at(i);
+    for (int j = 0; j < map_length_; j++) {
+      if (map->at(j)) {
+        if (found_single_character) {
+          found_single_character = false;  // Found two.
+          abandoned_search_for_single_character = true;
+          break;
+        } else {
+          found_single_character = true;
+          single_character = j;
+        }
+      }
+    }
+    if (abandoned_search_for_single_character) break;
+  }
+
+  int lookahead_width = max_lookahead + 1 - min_lookahead;
+

+ if (found_single_character && lookahead_width == 1 && max_lookahead < 3){

+    // The mask-compare can probably handle this better.
+    return false;
+  }
+
+  if (found_single_character) {
+    Label cont, again;
+    masm->Bind(&again);
+    masm->LoadCurrentCharacter(max_lookahead, &cont, true);
+    if (max_char_ > map_length_) {
+      ASSERT(map_length_ == RegExpMacroAssembler::kTableSize);
+      masm->CheckCharacterAfterAnd(single_character,
+                                   RegExpMacroAssembler::kTableMask,
+                                   &cont);
+    } else {
+      masm->CheckCharacter(single_character, &cont);
+    }
+    masm->AdvanceCurrentPosition(lookahead_width);
+    masm->GoTo(&again);
+    masm->Bind(&cont);
+    return true;
+  }
+
+  Handle<ByteArray> boolean_skip_table =
+      FACTORY->NewByteArray(map_length_, TENURED);
+  int skip_distance = GetSkipTable(
+      min_lookahead, max_lookahead, boolean_skip_table);
+  ASSERT(skip_distance != 0);
+
+  Label cont, again;
+  masm->Bind(&again);
+  masm->LoadCurrentCharacter(max_lookahead, &cont, true);
+  masm->CheckBitInTable(boolean_skip_table, &cont);
+  masm->AdvanceCurrentPosition(skip_distance);
+  masm->GoTo(&again);
+  masm->Bind(&cont);
+
+  return true;
+}

 /* Code generation for choice nodes.
  *
@@ -3274,10 +3572,51 @@

   int first_normal_choice = greedy_loop ? 1 : 0;

-  int preload_characters =
-      CalculatePreloadCharacters(compiler,

- current_trace->at_start() ==Trace::FALSE);

-  bool preload_is_current =
+  bool not_at_start = current_trace->at_start() == Trace::FALSE;
+  const int kEatsAtLeastNotYetInitialized = -1;
+  int eats_at_least = kEatsAtLeastNotYetInitialized;
+
+  bool skip_was_emitted = false;
+

+ // More makes code generation slower, less makes V8 benchmark scorelower.

+  const int kMaxLookaheadForBoyerMoore = 8;
+
+  if (!greedy_loop && choice_count == 2) {
+    GuardedAlternative alt1 = alternatives_->at(1);
+    if (alt1.guards() == NULL || alt1.guards()->length() == 0) {
+      RegExpNode* eats_anything_node = alt1.node();
+      if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
+          this) {

+ // At this point we know that we are at a non-greedy loop thatwill eat

+        // any character one at a time.  Any non-anchored regexp has such a

+ // loop prepended to it in order to find where it starts. We lookfor+ // a pattern of the form ...abc... where we can look 6 charactersahead+ // and step forwards 3 if the character is not one of abc. Abcneed+ // not be atoms, they can be any reasonably limited characterclass or

+        // small alternation.

+ ASSERT(trace->is_trivial()); // This is the case onLoopChoiceNodes.

+        eats_at_least =
+            Min(kMaxLookaheadForBoyerMoore,
+                EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));
+        if (eats_at_least >= 1) {
+          BoyerMooreLookahead bm(eats_at_least,
+                                 RegExpMacroAssembler::kTableSize,
+                                 compiler);
+          GuardedAlternative alt0 = alternatives_->at(0);
+          alt0.node()->FillInBMInfo(0, &bm, not_at_start);
+          skip_was_emitted = bm.EmitSkipInstructions(macro_assembler);
+        }
+      }
+    }
+  }
+
+  if (eats_at_least == kEatsAtLeastNotYetInitialized) {
+    // Save some time by looking at most one machine word ahead.

+ eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0,not_at_start);

+  }

+ int preload_characters = CalculatePreloadCharacters(compiler,eats_at_least);

+
+  bool preload_is_current = !skip_was_emitted &&
       (current_trace->characters_preloaded() == preload_characters);
   bool preload_has_checked_bounds = preload_is_current;

@@ -5430,6 +5769,76 @@
   // not to be.
   return kComputeFirstCharacterSetFail;
 }
+
+
+void ChoiceNode::FillInBMInfo(
+    int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+  ZoneList<GuardedAlternative>* alts = alternatives();
+  for (int i = 0; i < alts->length(); i++) {
+    GuardedAlternative& alt = alts->at(i);
+    if (alt.guards() != NULL && alt.guards()->length() != 0) {
+      bm->SetRest(offset);  // Give up trying to fill in info.
+      return;
+    }
+    alt.node()->FillInBMInfo(offset, bm, not_at_start);
+  }
+}
+
+
+void TextNode::FillInBMInfo(
+    int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+  if (offset >= bm->length()) return;
+  int max_char = bm->max_char();
+  for (int i = 0; i < elements()->length(); i++) {
+    if (offset >= bm->length()) return;
+    TextElement text = elements()->at(i);
+    if (text.type == TextElement::ATOM) {
+      RegExpAtom* atom = text.data.u_atom;
+      for (int j = 0; j < atom->length(); j++, offset++) {
+        if (offset >= bm->length()) return;
+        uc16 character = atom->data()[j];
+        if (bm->compiler()->ignore_case()) {
+          unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+          int length = GetCaseIndependentLetters(
+              ISOLATE,
+              character,
+              bm->max_char() == String::kMaxAsciiCharCode,
+              chars);
+          for (int j = 0; j < length; j++) {
+            bm->Set(offset, chars[j]);
+          }
+        } else {
+          if (character <= max_char) bm->Set(offset, character);
+        }
+      }
+    } else {
+      ASSERT(text.type == TextElement::CHAR_CLASS);
+      RegExpCharacterClass* char_class = text.data.u_char_class;
+      ZoneList<CharacterRange>* ranges = char_class->ranges();
+      if (char_class->is_negated()) {
+        bm->SetAll(offset);
+      } else {
+        for (int k = 0; k < ranges->length(); k++) {
+          CharacterRange& range = ranges->at(k);
+          if (range.from() > max_char) continue;
+          int to = Min(max_char, static_cast<int>(range.to()));

+ if (to - range.from() >=BoyerMooreLookahead::kTooManyCharacters) {

+            bm->SetAll(offset);
+            break;
+          }
+          for (int m = range.from(); m <= to; m++) {
+            bm->Set(offset, m);
+          }
+        }
+      }
+      offset++;
+    }
+  }
+  if (offset >= bm->length()) return;
+  on_success()->FillInBMInfo(offset,
+                             bm,
+                             true);  // Not at start after a text node.
+}


 int TextNode::ComputeFirstCharacterSet(int budget) {
@@ -5589,15 +5998,30 @@
 }

-RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData*data,

-                                                      bool ignore_case,
-                                                      bool is_multiline,

- Handle<String>pattern,

-                                                      bool is_ascii) {
+RegExpEngine::CompilationResult RegExpEngine::Compile(
+    RegExpCompileData* data,
+    bool ignore_case,
+    bool is_multiline,
+    Handle<String> pattern,
+    Handle<String> sample_subject,
+    bool is_ascii) {

if ((data->capture_count + 1) * 2 - 1 >RegExpMacroAssembler::kMaxRegister) {

     return IrregexpRegExpTooBig();
   }
   RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
+
+  // Sample some characters from the middle of the string.
+  static const int kSampleSize = 128;
+
+  FlattenString(sample_subject);
+  int chars_sampled = 0;
+  int half_way = (sample_subject->length() - kSampleSize) / 2;
+  for (int i = Max(0, half_way);
+       i < sample_subject->length() && chars_sampled < kSampleSize;
+       i++, chars_sampled++) {
+    compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
+  }
+
   // Wrap the body of the regexp in capture #0.
   RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
                                                     0,
=======================================
--- /branches/bleeding_edge/src/jsregexp.h      Tue Feb 28 02:32:02 2012
+++ /branches/bleeding_edge/src/jsregexp.h      Mon Apr  2 02:38:07 2012
@@ -190,8 +190,10 @@
   static String* last_ascii_string_;
   static String* two_byte_cached_string_;

-  static bool CompileIrregexp(Handle<JSRegExp> re, bool is_ascii);

- static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re, boolis_ascii);

+  static bool CompileIrregexp(
+      Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii);
+  static inline bool EnsureCompiledIrregexp(
+      Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii);

// Set the subject cache. The previous string buffer is not deleted, sothe

@@ -419,6 +421,90 @@
 };


+// Improve the speed that we scan for an initial point where a non-anchored
+// regexp can match by using a Boyer-Moore-like table. This is done by
+// identifying non-greedy non-capturing loops in the nodes that eat any
+// character one at a time.  For example in the middle of the regexp

+// /foo[\s\S]*?bar/ we find such a loop. There is also such a loopimplicitly

+// inserted at the start of any non-anchored regexp.
+//

+// When we have found such a loop we look ahead in the nodes to find theset of

+// characters that can come at given distances. For example for the regexp

+// /.?foo/ we know that there are at least 3 characters ahead of us, andthe+// sets of characters that can occur are [any, [f, o], [o]]. We find arange in+// the lookahead info where the set of characters is reasonablyconstrained. In

+// our example this is from index 1 to 2 (0 is not constrained). We can now
+// look 3 characters ahead and if we don't find one of [f, o] (the union of

+// [f, o] and [o]) then we can skip forwards by the range size (in thiscase 2).

+//
+// For Unicode input strings we do the same, but modulo 128.
+//

+// We also look at the first string fed to the regexp and use that to geta hint+// of the character frequencies in the inputs. This affects the assessmentof

+// whether the set of characters is 'reasonably constrained'.
+//

+// We also have another lookahead mechanism (called quick check in thecode),+// which uses a wide load of multiple characters followed by a mask andcompare

+// to determine whether a match is possible at this point.
+class BoyerMooreLookahead {
+ public:

+ BoyerMooreLookahead(int length, int map_length, RegExpCompiler*compiler);

+
+  int length() { return length_; }
+  int max_char() { return max_char_; }
+  RegExpCompiler* compiler() { return compiler_; }
+
+  static const int kTooManyCharacters = 32;
+
+  int Count(int map_number) {
+    ZoneList<bool>* map = bitmaps_->at(map_number);
+    if (map == NULL) return map_length_;
+    int count = 0;
+    for (int i = 0; i < map_length_; i++) {
+      if (map->at(i)) count++;
+    }
+    return count;
+  }
+
+  void Set(int map_number, int character) {
+    if (character > max_char_) return;
+    ZoneList<bool>* map = bitmaps_->at(map_number);
+    if (map == NULL) return;
+    map->at(character & (map_length_ - 1)) = true;
+  }
+
+  void SetAll(int map_number) {
+    bitmaps_->at(map_number) = NULL;
+  }
+
+  void SetRest(int from_map) {
+    for (int i = from_map; i < length_; i++) SetAll(i);
+  }
+  bool EmitSkipInstructions(RegExpMacroAssembler* masm);
+
+ private:

+ // This is the value obtained by EatsAtLeast. If we do not have atleast this+ // many characters left in the sample string then the match is bound tofail.+ // Therefore it is OK to read a character this far ahead of the currentmatch

+  // point.
+  int length_;

+ // We conservatively consider all character values modulo this length.For

+  // ASCII there is no loss of precision, since this has a value of 128.
+  int map_length_;
+  RegExpCompiler* compiler_;
+  // 0x7f for ASCII, 0xffff for UTF-16.
+  int max_char_;
+  ZoneList<ZoneList<bool>*>* bitmaps_;
+
+  int GetSkipTable(int min_lookahead,
+                   int max_lookahead,
+                   Handle<ByteArray> boolean_skip_table);
+  bool FindWorthwhileInterval(int* from, int* to);
+  int FindBestInterval(
+    int max_number_of_chars, int old_biggest_points, int* from, int* to);
+};
+
+
 #define FOR_EACH_NODE_TYPE(VISIT)                                    \
   VISIT(End)                                                         \
   VISIT(Action)                                                      \
@@ -635,6 +721,22 @@
                                     bool not_at_start) = 0;
   static const int kNodeIsTooComplexForGreedyLoops = -1;

virtual int GreedyLoopTextLength() { returnkNodeIsTooComplexForGreedyLoops; }+ // Only returns the successor for a text node of length 1 that matchesany

+  // character and that has no guards on it.
+  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
+      RegExpCompiler* compiler) {
+    return NULL;
+  }
+

+ // Collects information on the possible code units (mod 128) that canmatch if

+  // we look forward.  This is used for a Boyer-Moore-like string searching
+  // implementation.  TODO(erikcorry):  This should share more code with
+  // EatsAtLeast, GetQuickCheckDetails and ComputeFirstCharacterSet.
+  virtual void FillInBMInfo(
+      int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+    UNREACHABLE();
+  }
+
   Label* label() { return &label_; }

// If non-generic code is generated for a node (i.e. the node is not atthe// start of the trace) then it cannot be reused. This variable sets alimit

@@ -747,6 +849,10 @@
       : on_success_(on_success) { }
   RegExpNode* on_success() { return on_success_; }
   void set_on_success(RegExpNode* node) { on_success_ = node; }
+  virtual void FillInBMInfo(
+      int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+    on_success_->FillInBMInfo(offset, bm, not_at_start);
+  }
  private:
   RegExpNode* on_success_;
 };
@@ -793,6 +899,8 @@
     return on_success()->GetQuickCheckDetails(
         details, compiler, filled_in, not_at_start);
   }
+  virtual void FillInBMInfo(
+      int offset, BoyerMooreLookahead* bm, bool not_at_start);
   Type type() { return type_; }
   // TODO(erikcorry): We should allow some action nodes in greedy loops.

virtual int GreedyLoopTextLength() { returnkNodeIsTooComplexForGreedyLoops; }

@@ -860,6 +968,10 @@
   ZoneList<TextElement>* elements() { return elms_; }
   void MakeCaseIndependent(bool is_ascii);
   virtual int GreedyLoopTextLength();
+  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
+      RegExpCompiler* compiler);
+  virtual void FillInBMInfo(
+      int offset, BoyerMooreLookahead* bm, bool not_at_start);
   virtual TextNode* Clone() {
     TextNode* result = new TextNode(*this);
     result->CalculateOffsets();
@@ -928,6 +1040,8 @@
                                     RegExpCompiler* compiler,
                                     int filled_in,
                                     bool not_at_start);
+  virtual void FillInBMInfo(
+      int offset, BoyerMooreLookahead* bm, bool not_at_start);
   virtual int ComputeFirstCharacterSet(int budget);
   virtual AssertionNode* Clone() { return new AssertionNode(*this); }
   AssertionNodeType type() { return type_; }
@@ -961,6 +1075,12 @@
                                     bool not_at_start) {
     return;
   }
+  virtual void FillInBMInfo(
+      int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ // Working out the set of characters that a backreference can match istoo

+    // hard, so we just say that any character can match.
+    bm->SetRest(offset);
+  }

virtual BackReferenceNode* Clone() { return newBackReferenceNode(*this); }

   virtual int ComputeFirstCharacterSet(int budget);

@@ -986,6 +1106,11 @@
     // Returning 0 from EatsAtLeast should ensure we never get here.
     UNREACHABLE();
   }
+  virtual void FillInBMInfo(
+      int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+    // Returning 0 from EatsAtLeast should ensure we never get here.
+    UNREACHABLE();
+  }
   virtual EndNode* Clone() { return new EndNode(*this); }
  private:
   Action action_;
@@ -1071,6 +1196,8 @@
                                     RegExpCompiler* compiler,
                                     int characters_filled_in,
                                     bool not_at_start);
+  virtual void FillInBMInfo(
+      int offset, BoyerMooreLookahead* bm, bool not_at_start);
   virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }

   bool being_calculated() { return being_calculated_; }
@@ -1089,7 +1216,7 @@
   void GenerateGuard(RegExpMacroAssembler* macro_assembler,
                      Guard* guard,
                      Trace* trace);

- int CalculatePreloadCharacters(RegExpCompiler* compiler, boolnot_at_start);+ int CalculatePreloadCharacters(RegExpCompiler* compiler, inteats_at_least);

   void EmitOutOfLineContinuation(RegExpCompiler* compiler,
                                  Trace* trace,
                                  GuardedAlternative alternative,
@@ -1119,6 +1246,10 @@
                                     RegExpCompiler* compiler,
                                     int characters_filled_in,
                                     bool not_at_start);
+  virtual void FillInBMInfo(
+      int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+    alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start);
+  }
   // For a negative lookahead we don't emit the quick check for the

// alternative that is expected to fail. This is because quick checkcode// starts by loading enough characters for the alternative that takesfewest

@@ -1146,6 +1277,8 @@
                                     RegExpCompiler* compiler,
                                     int characters_filled_in,
                                     bool not_at_start);
+  virtual void FillInBMInfo(
+      int offset, BoyerMooreLookahead* bm, bool not_at_start);
   virtual int ComputeFirstCharacterSet(int budget);
   virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
   RegExpNode* loop_node() { return loop_node_; }
@@ -1458,6 +1591,7 @@
                                    bool ignore_case,
                                    bool multiline,
                                    Handle<String> pattern,
+                                   Handle<String> sample_subject,
                                    bool is_ascii);

static void DotPrint(const char* label, RegExpNode* node, boolignore_case);

=======================================

--- /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Fri Mar30 00:43:48 2012+++ /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Mon Apr2 02:38:07 2012

@@ -542,9 +542,13 @@
 void RegExpMacroAssemblerX64::CheckCharacterAfterAnd(uint32_t c,
                                                      uint32_t mask,
                                                      Label* on_equal) {
-  __ movl(rax, current_character());
-  __ and_(rax, Immediate(mask));
-  __ cmpl(rax, Immediate(c));
+  if (c == 0) {
+    __ testl(current_character(), Immediate(mask));
+  } else {
+    __ movl(rax, Immediate(mask));
+    __ and_(rax, current_character());
+    __ cmpl(rax, Immediate(c));
+  }
   BranchOrBacktrack(equal, on_equal);
 }

@@ -552,9 +556,13 @@
 void RegExpMacroAssemblerX64::CheckNotCharacterAfterAnd(uint32_t c,
                                                         uint32_t mask,

Label*on_not_equal) {

-  __ movl(rax, current_character());
-  __ and_(rax, Immediate(mask));
-  __ cmpl(rax, Immediate(c));
+  if (c == 0) {
+    __ testl(current_character(), Immediate(mask));
+  } else {
+    __ movl(rax, Immediate(mask));
+    __ and_(rax, current_character());
+    __ cmpl(rax, Immediate(c));
+  }
   BranchOrBacktrack(not_equal, on_not_equal);
 }

=======================================

--- /branches/bleeding_edge/test/cctest/test-regexp.cc Mon Feb 20 10:01:212012+++ /branches/bleeding_edge/test/cctest/test-regexp.cc Mon Apr 2 02:38:072012

@@ -504,7 +504,10 @@
     return NULL;
   Handle<String> pattern = isolate->factory()->
       NewStringFromUtf8(CStrVector(input));

- RegExpEngine::Compile(&compile_data, false, multiline, pattern,is_ascii);

+  Handle<String> sample_subject =
+      isolate->factory()->NewStringFromUtf8(CStrVector(""));
+  RegExpEngine::Compile(
+      &compile_data, false, multiline, pattern, sample_subject, is_ascii);
   return compile_data.node;
 }

--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

[v8-dev] [v8] r11204 committed - Regexp: Improve the speed that we scan for an initial point where a no...

Reply via email to