Revision: 11204
Author: erikcorry
Date: Mon Apr 2 02:38:07 2012
Log: Regexp: Improve the speed that we scan for an initial point where
a non-anchored
regexp can match by using a Boyer-Moore-like table. This is done by
identifying
non-greedy non-capturing loops in the nodes that eat any character one at a
time.
For example in the middle of the regexp /foo[\s\S]*?bar/ we find such a
loop.
There is also such a loop implicitly inserted at the start of any
non-anchored
regexp.
When we have found such a loop we look ahead in the nodes to find the set of
characters that can come at given distances. For example for the regexp
/.?foo/ we know that there are at least 3 characters ahead of us, and the
sets
of characters that can occur are [any, [f, o], [o]]. We find a range in the
lookahead info where the set of characters is reasonably constrained. In
our
example this is from index 1 to 2 (0 is not constrained). We can now look 3
characters ahead and if we don't find one of [f, o] (the union of [f, o] and
[o]) then we can skip forwards by the range size (in this case 2).
For Unicode input strings we do the same, but modulo 128.
We also look at the first string fed to the regexp and use that to get a
hint
of the character frequencies in the inputs. This affects the assessment of
whether the set of characters is 'reasonably constrained'.
We still have the old lookahead mechanism, which uses a wide load of
multiple
characters followed by a mask and compare to determine whether a match is
possible at this point.
Review URL: http://codereview.chromium.org/9965010
http://code.google.com/p/v8/source/detail?r=11204
Modified:
/branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc
/branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc
/branches/bleeding_edge/src/jsregexp.cc
/branches/bleeding_edge/src/jsregexp.h
/branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc
/branches/bleeding_edge/test/cctest/test-regexp.cc
=======================================
--- /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Fri Mar
30 00:43:48 2012
+++ /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Mon Apr
2 02:38:07 2012
@@ -452,8 +452,12 @@
void RegExpMacroAssemblerARM::CheckCharacterAfterAnd(uint32_t c,
uint32_t mask,
Label* on_equal) {
- __ and_(r0, current_character(), Operand(mask));
- __ cmp(r0, Operand(c));
+ if (c == 0) {
+ __ tst(current_character(), Operand(mask));
+ } else {
+ __ and_(r0, current_character(), Operand(mask));
+ __ cmp(r0, Operand(c));
+ }
BranchOrBacktrack(eq, on_equal);
}
@@ -461,8 +465,12 @@
void RegExpMacroAssemblerARM::CheckNotCharacterAfterAnd(unsigned c,
unsigned mask,
Label*
on_not_equal) {
- __ and_(r0, current_character(), Operand(mask));
- __ cmp(r0, Operand(c));
+ if (c == 0) {
+ __ tst(current_character(), Operand(mask));
+ } else {
+ __ and_(r0, current_character(), Operand(mask));
+ __ cmp(r0, Operand(c));
+ }
BranchOrBacktrack(ne, on_not_equal);
}
=======================================
--- /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc Fri Mar
30 00:43:48 2012
+++ /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc Mon
Apr 2 02:38:07 2012
@@ -504,8 +504,8 @@
if (c == 0) {
__ test(current_character(), Immediate(mask));
} else {
- __ mov(eax, current_character());
- __ and_(eax, mask);
+ __ mov(eax, mask);
+ __ and_(eax, current_character());
__ cmp(eax, c);
}
BranchOrBacktrack(equal, on_equal);
@@ -518,8 +518,8 @@
if (c == 0) {
__ test(current_character(), Immediate(mask));
} else {
- __ mov(eax, current_character());
- __ and_(eax, mask);
+ __ mov(eax, mask);
+ __ and_(eax, current_character());
__ cmp(eax, c);
}
BranchOrBacktrack(not_equal, on_not_equal);
@@ -569,8 +569,8 @@
__ mov(eax, Immediate(table));
Register index = current_character();
if (mode_ != ASCII || kTableMask != String::kMaxAsciiCharCode) {
- __ mov(ebx, current_character());
- __ and_(ebx, kTableSize - 1);
+ __ mov(ebx, kTableSize - 1);
+ __ and_(ebx, current_character());
index = ebx;
}
__ cmpb(FieldOperand(eax, index, times_1, ByteArray::kHeaderSize), 0);
=======================================
--- /branches/bleeding_edge/src/jsregexp.cc Fri Mar 30 00:43:48 2012
+++ /branches/bleeding_edge/src/jsregexp.cc Mon Apr 2 02:38:07 2012
@@ -280,7 +280,8 @@
// from the source pattern.
// If compilation fails, an exception is thrown and this function
// returns false.
-bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool
is_ascii) {
+bool RegExpImpl::EnsureCompiledIrregexp(
+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
#ifdef V8_INTERPRETED_REGEXP
if (compiled_code->IsByteArray()) return true;
@@ -296,7 +297,7 @@
ASSERT(compiled_code->IsSmi());
return true;
}
- return CompileIrregexp(re, is_ascii);
+ return CompileIrregexp(re, sample_subject, is_ascii);
}
@@ -316,7 +317,9 @@
}
-bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
+bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
+ Handle<String> sample_subject,
+ bool is_ascii) {
// Compile the RegExp.
Isolate* isolate = re->GetIsolate();
ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
@@ -365,6 +368,7 @@
flags.is_ignore_case(),
flags.is_multiline(),
pattern,
+ sample_subject,
is_ascii);
if (result.error_message != NULL) {
// Unable to compile regexp.
@@ -435,7 +439,7 @@
// Check the asciiness of the underlying storage.
bool is_ascii = subject->IsAsciiRepresentationUnderneath();
- if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1;
+ if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;
#ifdef V8_INTERPRETED_REGEXP
// Byte-code regexp needs space allocated for all its registers.
@@ -466,7 +470,7 @@
#ifndef V8_INTERPRETED_REGEXP
ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
do {
- EnsureCompiledIrregexp(regexp, is_ascii);
+ EnsureCompiledIrregexp(regexp, subject, is_ascii);
Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
NativeRegExpMacroAssembler::Result res =
NativeRegExpMacroAssembler::Match(code,
@@ -784,6 +788,53 @@
}
+class FrequencyCollator {
+ public:
+ FrequencyCollator() : total_samples_(0) {
+ for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
+ frequencies_[i] = CharacterFrequency(i);
+ }
+ }
+
+ void CountCharacter(int character) {
+ int index = (character & RegExpMacroAssembler::kTableMask);
+ frequencies_[index].Increment();
+ total_samples_++;
+ }
+
+ // Does not measure in percent, but rather per-128 (the table size from
the
+ // regexp macro assembler).
+ int Frequency(int in_character) {
+ ASSERT((in_character & RegExpMacroAssembler::kTableMask) ==
in_character);
+ if (total_samples_ < 1) return 1; // Division by zero.
+ int freq_in_per128 =
+ (frequencies_[in_character].counter() * 128) / total_samples_;
+ return freq_in_per128;
+ }
+
+ private:
+ class CharacterFrequency {
+ public:
+ CharacterFrequency() : counter_(0), character_(-1) { }
+ explicit CharacterFrequency(int character)
+ : counter_(0), character_(character) { }
+
+ void Increment() { counter_++; }
+ int counter() { return counter_; }
+ int character() { return character_; }
+
+ private:
+ int counter_;
+ int character_;
+ };
+
+
+ private:
+ CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
+ int total_samples_;
+};
+
+
class RegExpCompiler {
public:
RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
@@ -819,6 +870,7 @@
inline bool ignore_case() { return ignore_case_; }
inline bool ascii() { return ascii_; }
+ FrequencyCollator* frequency_collator() { return &frequency_collator_; }
int current_expansion_factor() { return current_expansion_factor_; }
void set_current_expansion_factor(int value) {
@@ -837,6 +889,7 @@
bool ascii_;
bool reg_exp_too_big_;
int current_expansion_factor_;
+ FrequencyCollator frequency_collator_;
};
@@ -865,7 +918,8 @@
ignore_case_(ignore_case),
ascii_(ascii),
reg_exp_too_big_(false),
- current_expansion_factor_(1) {
+ current_expansion_factor_(1),
+ frequency_collator_() {
accept_ = new EndNode(EndNode::ACCEPT);
ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
}
@@ -2054,6 +2108,17 @@
recursion_depth + 1,
not_at_start);
}
+
+
+void ActionNode::FillInBMInfo(int offset,
+ BoyerMooreLookahead* bm,
+ bool not_at_start) {
+ if (type_ == BEGIN_SUBMATCH) {
+ bm->SetRest(offset);
+ } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {
+ on_success()->FillInBMInfo(offset, bm, not_at_start);
+ }
+}
int AssertionNode::EatsAtLeast(int still_to_find,
@@ -2070,6 +2135,14 @@
recursion_depth + 1,
not_at_start);
}
+
+
+void AssertionNode::FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+ // Match the behaviour of EatsAtLeast on this node.
+ if (type() == AT_START && not_at_start) return;
+ on_success()->FillInBMInfo(offset, bm, not_at_start);
+}
int BackReferenceNode::EatsAtLeast(int still_to_find,
@@ -2502,6 +2575,16 @@
characters_filled_in,
not_at_start);
}
+
+
+void LoopChoiceNode::FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool nas) {
+ if (body_can_be_zero_length_) {
+ bm->SetRest(offset);
+ return;
+ }
+ ChoiceNode::FillInBMInfo(offset, bm, nas);
+}
void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
@@ -2998,6 +3081,30 @@
return elm.cp_offset + elm.data.u_atom->data().length();
}
}
+
+
+RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
+ RegExpCompiler* compiler) {
+ if (elms_->length() != 1) return NULL;
+ TextElement elm = elms_->at(0);
+ if (elm.type != TextElement::CHAR_CLASS) return NULL;
+ RegExpCharacterClass* node = elm.data.u_char_class;
+ ZoneList<CharacterRange>* ranges = node->ranges();
+ if (!CharacterRange::IsCanonical(ranges)) {
+ CharacterRange::Canonicalize(ranges);
+ }
+ if (node->is_negated()) {
+ return ranges->length() == 0 ? on_success() : NULL;
+ }
+ if (ranges->length() != 1) return NULL;
+ uint32_t max_char;
+ if (compiler->ascii()) {
+ max_char = String::kMaxAsciiCharCode;
+ } else {
+ max_char = String::kMaxUtf16CodeUnit;
+ }
+ return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
+}
// Finds the fixed match length of a sequence of nodes that goes from
@@ -3064,8 +3171,8 @@
int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
- bool not_at_start) {
- int preload_characters = EatsAtLeast(4, 0, not_at_start);
+ int eats_at_least) {
+ int preload_characters = Min(4, eats_at_least);
if (compiler->macro_assembler()->CanReadUnaligned()) {
bool ascii = compiler->ascii();
if (ascii) {
@@ -3130,6 +3237,197 @@
AlternativeGeneration a_few_alt_gens_[kAFew];
};
+
+BoyerMooreLookahead::BoyerMooreLookahead(
+ int length, int map_length, RegExpCompiler* compiler)
+ : length_(length),
+ map_length_(map_length),
+ compiler_(compiler) {
+ ASSERT(IsPowerOf2(map_length));
+ if (compiler->ascii()) {
+ max_char_ = String::kMaxAsciiCharCode;
+ } else {
+ max_char_ = String::kMaxUtf16CodeUnit;
+ }
+ bitmaps_ = new ZoneList<ZoneList<bool>*>(length);
+ for (int i = 0; i < length; i++) {
+ bitmaps_->Add(new ZoneList<bool>(map_length));
+ ZoneList<bool>* map = bitmaps_->at(i);
+ for (int i = 0; i < map_length; i++) {
+ map->Add(false);
+ }
+ }
+}
+
+
+// Find the longest range of lookahead that has the fewest number of
different
+// characters that can occur at a given position. Since we are optimizing
two
+// different parameters at once this is a tradeoff.
+bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
+ int biggest_points = 0;
+ for (int max_number_of_chars = 4;
+ max_number_of_chars < kTooManyCharacters;
+ max_number_of_chars *= 2) {
+ biggest_points =
+ FindBestInterval(max_number_of_chars, biggest_points, from, to);
+ }
+ if (biggest_points == 0) return false;
+ return true;
+}
+
+
+// Find the highest-points range between 0 and length_ where the character
+// information is not too vague. 'Too vague' means that there are more
than
+// max_number_of_chars that can occur at this position. Calculates the
number
+// of points as the product of width-of-the-range and
+// probability-of-finding-one-of-the-characters, where the probability is
+// calculated using the frequency distribution of the sample subject
string.
+int BoyerMooreLookahead::FindBestInterval(
+ int max_number_of_chars, int old_biggest_points, int* from, int* to) {
+ int biggest_points = old_biggest_points;
+ static const int kSize = RegExpMacroAssembler::kTableSize;
+ for (int i = 0; i < length_; ) {
+ while (i < length_ && Count(i) > max_number_of_chars) i++;
+ if (i == length_) break;
+ int remembered_from = i;
+ bool union_map[kSize];
+ for (int j = 0; j < kSize; j++) union_map[j] = false;
+ while (i < length_ && Count(i) <= max_number_of_chars) {
+ ZoneList<bool>* map = bitmaps_->at(i);
+ for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
+ i++;
+ }
+ int frequency = 0;
+ for (int j = 0; j < kSize; j++) {
+ if (union_map[j]) {
+ // Add 1 to the frequency to give a small per-character boost for
+ // the cases where our sampling is not good enough and many
+ // characters have a frequency of zero. This means the frequency
+ // can theoretically be up to 2*kSize though we treat it mostly as
+ // a fraction of kSize.
+ frequency += compiler_->frequency_collator()->Frequency(j) + 1;
+ }
+ }
+ // We use the probability of skipping times the distance we are
skipping to
+ // judge the effectiveness of this. Actually we have a cut-off: By
+ // dividing by 2 we switch off the skipping if the probability of
skipping
+ // is less than 50%. This is because the multibyte mask-and-compare
+ // skipping in quickcheck is more likely to do well on this case.
+ bool in_quickcheck_range = ((i - remembered_from < 4) ||
+ (compiler_->ascii() ? remembered_from <= 4 : remembered_from <=
2));
+ // Called 'probability' but it is only a rough estimate and can
actually
+ // be outside the 0-kSize range.
+ int probability = (in_quickcheck_range ? kSize / 2 : kSize) -
frequency;
+ int points = (i - remembered_from) * probability;
+ if (points > biggest_points) {
+ *from = remembered_from;
+ *to = i - 1;
+ biggest_points = points;
+ }
+ }
+ return biggest_points;
+}
+
+
+// Take all the characters that will not prevent a successful match if they
+// occur in the subject string in the range between min_lookahead and
+// max_lookahead (inclusive) measured from the current position. If the
+// character at max_lookahead offset is not one of these characters, then
we
+// can safely skip forwards by the number of characters in the range.
+int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
+ int max_lookahead,
+ Handle<ByteArray>
boolean_skip_table) {
+ const int kSize = RegExpMacroAssembler::kTableSize;
+
+ const int kSkipArrayEntry = 0;
+ const int kDontSkipArrayEntry = 1;
+
+ for (int i = 0; i < kSize; i++) {
+ boolean_skip_table->set(i, kSkipArrayEntry);
+ }
+ int skip = max_lookahead + 1 - min_lookahead;
+
+ for (int i = max_lookahead; i >= min_lookahead; i--) {
+ ZoneList<bool>* map = bitmaps_->at(i);
+ for (int j = 0; j < map_length_; j++) {
+ if (map->at(j)) {
+ boolean_skip_table->set(j, kDontSkipArrayEntry);
+ }
+ }
+ }
+
+ return skip;
+}
+
+
+// See comment above on the implementation of GetSkipTable.
+bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm)
{
+ int min_lookahead = 0;
+ int max_lookahead = 0;
+
+ if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return
false;
+
+ bool found_single_character = false;
+ bool abandoned_search_for_single_character = false;
+ int single_character = 0;
+ for (int i = max_lookahead; i >= min_lookahead; i--) {
+ ZoneList<bool>* map = bitmaps_->at(i);
+ for (int j = 0; j < map_length_; j++) {
+ if (map->at(j)) {
+ if (found_single_character) {
+ found_single_character = false; // Found two.
+ abandoned_search_for_single_character = true;
+ break;
+ } else {
+ found_single_character = true;
+ single_character = j;
+ }
+ }
+ }
+ if (abandoned_search_for_single_character) break;
+ }
+
+ int lookahead_width = max_lookahead + 1 - min_lookahead;
+
+ if (found_single_character && lookahead_width == 1 && max_lookahead < 3)
{
+ // The mask-compare can probably handle this better.
+ return false;
+ }
+
+ if (found_single_character) {
+ Label cont, again;
+ masm->Bind(&again);
+ masm->LoadCurrentCharacter(max_lookahead, &cont, true);
+ if (max_char_ > map_length_) {
+ ASSERT(map_length_ == RegExpMacroAssembler::kTableSize);
+ masm->CheckCharacterAfterAnd(single_character,
+ RegExpMacroAssembler::kTableMask,
+ &cont);
+ } else {
+ masm->CheckCharacter(single_character, &cont);
+ }
+ masm->AdvanceCurrentPosition(lookahead_width);
+ masm->GoTo(&again);
+ masm->Bind(&cont);
+ return true;
+ }
+
+ Handle<ByteArray> boolean_skip_table =
+ FACTORY->NewByteArray(map_length_, TENURED);
+ int skip_distance = GetSkipTable(
+ min_lookahead, max_lookahead, boolean_skip_table);
+ ASSERT(skip_distance != 0);
+
+ Label cont, again;
+ masm->Bind(&again);
+ masm->LoadCurrentCharacter(max_lookahead, &cont, true);
+ masm->CheckBitInTable(boolean_skip_table, &cont);
+ masm->AdvanceCurrentPosition(skip_distance);
+ masm->GoTo(&again);
+ masm->Bind(&cont);
+
+ return true;
+}
/* Code generation for choice nodes.
*
@@ -3274,10 +3572,51 @@
int first_normal_choice = greedy_loop ? 1 : 0;
- int preload_characters =
- CalculatePreloadCharacters(compiler,
- current_trace->at_start() ==
Trace::FALSE);
- bool preload_is_current =
+ bool not_at_start = current_trace->at_start() == Trace::FALSE;
+ const int kEatsAtLeastNotYetInitialized = -1;
+ int eats_at_least = kEatsAtLeastNotYetInitialized;
+
+ bool skip_was_emitted = false;
+
+ // More makes code generation slower, less makes V8 benchmark score
lower.
+ const int kMaxLookaheadForBoyerMoore = 8;
+
+ if (!greedy_loop && choice_count == 2) {
+ GuardedAlternative alt1 = alternatives_->at(1);
+ if (alt1.guards() == NULL || alt1.guards()->length() == 0) {
+ RegExpNode* eats_anything_node = alt1.node();
+ if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
+ this) {
+ // At this point we know that we are at a non-greedy loop that
will eat
+ // any character one at a time. Any non-anchored regexp has such a
+ // loop prepended to it in order to find where it starts. We look
for
+ // a pattern of the form ...abc... where we can look 6 characters
ahead
+ // and step forwards 3 if the character is not one of abc. Abc
need
+ // not be atoms, they can be any reasonably limited character
class or
+ // small alternation.
+ ASSERT(trace->is_trivial()); // This is the case on
LoopChoiceNodes.
+ eats_at_least =
+ Min(kMaxLookaheadForBoyerMoore,
+ EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));
+ if (eats_at_least >= 1) {
+ BoyerMooreLookahead bm(eats_at_least,
+ RegExpMacroAssembler::kTableSize,
+ compiler);
+ GuardedAlternative alt0 = alternatives_->at(0);
+ alt0.node()->FillInBMInfo(0, &bm, not_at_start);
+ skip_was_emitted = bm.EmitSkipInstructions(macro_assembler);
+ }
+ }
+ }
+ }
+
+ if (eats_at_least == kEatsAtLeastNotYetInitialized) {
+ // Save some time by looking at most one machine word ahead.
+ eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0,
not_at_start);
+ }
+ int preload_characters = CalculatePreloadCharacters(compiler,
eats_at_least);
+
+ bool preload_is_current = !skip_was_emitted &&
(current_trace->characters_preloaded() == preload_characters);
bool preload_has_checked_bounds = preload_is_current;
@@ -5430,6 +5769,76 @@
// not to be.
return kComputeFirstCharacterSetFail;
}
+
+
+void ChoiceNode::FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+ ZoneList<GuardedAlternative>* alts = alternatives();
+ for (int i = 0; i < alts->length(); i++) {
+ GuardedAlternative& alt = alts->at(i);
+ if (alt.guards() != NULL && alt.guards()->length() != 0) {
+ bm->SetRest(offset); // Give up trying to fill in info.
+ return;
+ }
+ alt.node()->FillInBMInfo(offset, bm, not_at_start);
+ }
+}
+
+
+void TextNode::FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+ if (offset >= bm->length()) return;
+ int max_char = bm->max_char();
+ for (int i = 0; i < elements()->length(); i++) {
+ if (offset >= bm->length()) return;
+ TextElement text = elements()->at(i);
+ if (text.type == TextElement::ATOM) {
+ RegExpAtom* atom = text.data.u_atom;
+ for (int j = 0; j < atom->length(); j++, offset++) {
+ if (offset >= bm->length()) return;
+ uc16 character = atom->data()[j];
+ if (bm->compiler()->ignore_case()) {
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ int length = GetCaseIndependentLetters(
+ ISOLATE,
+ character,
+ bm->max_char() == String::kMaxAsciiCharCode,
+ chars);
+ for (int j = 0; j < length; j++) {
+ bm->Set(offset, chars[j]);
+ }
+ } else {
+ if (character <= max_char) bm->Set(offset, character);
+ }
+ }
+ } else {
+ ASSERT(text.type == TextElement::CHAR_CLASS);
+ RegExpCharacterClass* char_class = text.data.u_char_class;
+ ZoneList<CharacterRange>* ranges = char_class->ranges();
+ if (char_class->is_negated()) {
+ bm->SetAll(offset);
+ } else {
+ for (int k = 0; k < ranges->length(); k++) {
+ CharacterRange& range = ranges->at(k);
+ if (range.from() > max_char) continue;
+ int to = Min(max_char, static_cast<int>(range.to()));
+ if (to - range.from() >=
BoyerMooreLookahead::kTooManyCharacters) {
+ bm->SetAll(offset);
+ break;
+ }
+ for (int m = range.from(); m <= to; m++) {
+ bm->Set(offset, m);
+ }
+ }
+ }
+ offset++;
+ }
+ }
+ if (offset >= bm->length()) return;
+ on_success()->FillInBMInfo(offset,
+ bm,
+ true); // Not at start after a text node.
+}
int TextNode::ComputeFirstCharacterSet(int budget) {
@@ -5589,15 +5998,30 @@
}
-RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData*
data,
- bool ignore_case,
- bool is_multiline,
- Handle<String>
pattern,
- bool is_ascii) {
+RegExpEngine::CompilationResult RegExpEngine::Compile(
+ RegExpCompileData* data,
+ bool ignore_case,
+ bool is_multiline,
+ Handle<String> pattern,
+ Handle<String> sample_subject,
+ bool is_ascii) {
if ((data->capture_count + 1) * 2 - 1 >
RegExpMacroAssembler::kMaxRegister) {
return IrregexpRegExpTooBig();
}
RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
+
+ // Sample some characters from the middle of the string.
+ static const int kSampleSize = 128;
+
+ FlattenString(sample_subject);
+ int chars_sampled = 0;
+ int half_way = (sample_subject->length() - kSampleSize) / 2;
+ for (int i = Max(0, half_way);
+ i < sample_subject->length() && chars_sampled < kSampleSize;
+ i++, chars_sampled++) {
+ compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
+ }
+
// Wrap the body of the regexp in capture #0.
RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
0,
=======================================
--- /branches/bleeding_edge/src/jsregexp.h Tue Feb 28 02:32:02 2012
+++ /branches/bleeding_edge/src/jsregexp.h Mon Apr 2 02:38:07 2012
@@ -190,8 +190,10 @@
static String* last_ascii_string_;
static String* two_byte_cached_string_;
- static bool CompileIrregexp(Handle<JSRegExp> re, bool is_ascii);
- static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re, bool
is_ascii);
+ static bool CompileIrregexp(
+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii);
+ static inline bool EnsureCompiledIrregexp(
+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii);
// Set the subject cache. The previous string buffer is not deleted, so
the
@@ -419,6 +421,90 @@
};
+// Improve the speed that we scan for an initial point where a non-anchored
+// regexp can match by using a Boyer-Moore-like table. This is done by
+// identifying non-greedy non-capturing loops in the nodes that eat any
+// character one at a time. For example in the middle of the regexp
+// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop
implicitly
+// inserted at the start of any non-anchored regexp.
+//
+// When we have found such a loop we look ahead in the nodes to find the
set of
+// characters that can come at given distances. For example for the regexp
+// /.?foo/ we know that there are at least 3 characters ahead of us, and
the
+// sets of characters that can occur are [any, [f, o], [o]]. We find a
range in
+// the lookahead info where the set of characters is reasonably
constrained. In
+// our example this is from index 1 to 2 (0 is not constrained). We can now
+// look 3 characters ahead and if we don't find one of [f, o] (the union of
+// [f, o] and [o]) then we can skip forwards by the range size (in this
case 2).
+//
+// For Unicode input strings we do the same, but modulo 128.
+//
+// We also look at the first string fed to the regexp and use that to get
a hint
+// of the character frequencies in the inputs. This affects the assessment
of
+// whether the set of characters is 'reasonably constrained'.
+//
+// We also have another lookahead mechanism (called quick check in the
code),
+// which uses a wide load of multiple characters followed by a mask and
compare
+// to determine whether a match is possible at this point.
+class BoyerMooreLookahead {
+ public:
+ BoyerMooreLookahead(int length, int map_length, RegExpCompiler*
compiler);
+
+ int length() { return length_; }
+ int max_char() { return max_char_; }
+ RegExpCompiler* compiler() { return compiler_; }
+
+ static const int kTooManyCharacters = 32;
+
+ int Count(int map_number) {
+ ZoneList<bool>* map = bitmaps_->at(map_number);
+ if (map == NULL) return map_length_;
+ int count = 0;
+ for (int i = 0; i < map_length_; i++) {
+ if (map->at(i)) count++;
+ }
+ return count;
+ }
+
+ void Set(int map_number, int character) {
+ if (character > max_char_) return;
+ ZoneList<bool>* map = bitmaps_->at(map_number);
+ if (map == NULL) return;
+ map->at(character & (map_length_ - 1)) = true;
+ }
+
+ void SetAll(int map_number) {
+ bitmaps_->at(map_number) = NULL;
+ }
+
+ void SetRest(int from_map) {
+ for (int i = from_map; i < length_; i++) SetAll(i);
+ }
+ bool EmitSkipInstructions(RegExpMacroAssembler* masm);
+
+ private:
+ // This is the value obtained by EatsAtLeast. If we do not have at
least this
+ // many characters left in the sample string then the match is bound to
fail.
+ // Therefore it is OK to read a character this far ahead of the current
match
+ // point.
+ int length_;
+ // We conservatively consider all character values modulo this length.
For
+ // ASCII there is no loss of precision, since this has a value of 128.
+ int map_length_;
+ RegExpCompiler* compiler_;
+ // 0x7f for ASCII, 0xffff for UTF-16.
+ int max_char_;
+ ZoneList<ZoneList<bool>*>* bitmaps_;
+
+ int GetSkipTable(int min_lookahead,
+ int max_lookahead,
+ Handle<ByteArray> boolean_skip_table);
+ bool FindWorthwhileInterval(int* from, int* to);
+ int FindBestInterval(
+ int max_number_of_chars, int old_biggest_points, int* from, int* to);
+};
+
+
#define FOR_EACH_NODE_TYPE(VISIT) \
VISIT(End) \
VISIT(Action) \
@@ -635,6 +721,22 @@
bool not_at_start) = 0;
static const int kNodeIsTooComplexForGreedyLoops = -1;
virtual int GreedyLoopTextLength() { return
kNodeIsTooComplexForGreedyLoops; }
+ // Only returns the successor for a text node of length 1 that matches
any
+ // character and that has no guards on it.
+ virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
+ RegExpCompiler* compiler) {
+ return NULL;
+ }
+
+ // Collects information on the possible code units (mod 128) that can
match if
+ // we look forward. This is used for a Boyer-Moore-like string searching
+ // implementation. TODO(erikcorry): This should share more code with
+ // EatsAtLeast, GetQuickCheckDetails and ComputeFirstCharacterSet.
+ virtual void FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+ UNREACHABLE();
+ }
+
Label* label() { return &label_; }
// If non-generic code is generated for a node (i.e. the node is not at
the
// start of the trace) then it cannot be reused. This variable sets a
limit
@@ -747,6 +849,10 @@
: on_success_(on_success) { }
RegExpNode* on_success() { return on_success_; }
void set_on_success(RegExpNode* node) { on_success_ = node; }
+ virtual void FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+ on_success_->FillInBMInfo(offset, bm, not_at_start);
+ }
private:
RegExpNode* on_success_;
};
@@ -793,6 +899,8 @@
return on_success()->GetQuickCheckDetails(
details, compiler, filled_in, not_at_start);
}
+ virtual void FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start);
Type type() { return type_; }
// TODO(erikcorry): We should allow some action nodes in greedy loops.
virtual int GreedyLoopTextLength() { return
kNodeIsTooComplexForGreedyLoops; }
@@ -860,6 +968,10 @@
ZoneList<TextElement>* elements() { return elms_; }
void MakeCaseIndependent(bool is_ascii);
virtual int GreedyLoopTextLength();
+ virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
+ RegExpCompiler* compiler);
+ virtual void FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual TextNode* Clone() {
TextNode* result = new TextNode(*this);
result->CalculateOffsets();
@@ -928,6 +1040,8 @@
RegExpCompiler* compiler,
int filled_in,
bool not_at_start);
+ virtual void FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual int ComputeFirstCharacterSet(int budget);
virtual AssertionNode* Clone() { return new AssertionNode(*this); }
AssertionNodeType type() { return type_; }
@@ -961,6 +1075,12 @@
bool not_at_start) {
return;
}
+ virtual void FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+ // Working out the set of characters that a backreference can match is
too
+ // hard, so we just say that any character can match.
+ bm->SetRest(offset);
+ }
virtual BackReferenceNode* Clone() { return new
BackReferenceNode(*this); }
virtual int ComputeFirstCharacterSet(int budget);
@@ -986,6 +1106,11 @@
// Returning 0 from EatsAtLeast should ensure we never get here.
UNREACHABLE();
}
+ virtual void FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+ // Returning 0 from EatsAtLeast should ensure we never get here.
+ UNREACHABLE();
+ }
virtual EndNode* Clone() { return new EndNode(*this); }
private:
Action action_;
@@ -1071,6 +1196,8 @@
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
+ virtual void FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }
bool being_calculated() { return being_calculated_; }
@@ -1089,7 +1216,7 @@
void GenerateGuard(RegExpMacroAssembler* macro_assembler,
Guard* guard,
Trace* trace);
- int CalculatePreloadCharacters(RegExpCompiler* compiler, bool
not_at_start);
+ int CalculatePreloadCharacters(RegExpCompiler* compiler, int
eats_at_least);
void EmitOutOfLineContinuation(RegExpCompiler* compiler,
Trace* trace,
GuardedAlternative alternative,
@@ -1119,6 +1246,10 @@
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
+ virtual void FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+ alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start);
+ }
// For a negative lookahead we don't emit the quick check for the
// alternative that is expected to fail. This is because quick check
code
// starts by loading enough characters for the alternative that takes
fewest
@@ -1146,6 +1277,8 @@
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
+ virtual void FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start);
virtual int ComputeFirstCharacterSet(int budget);
virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
RegExpNode* loop_node() { return loop_node_; }
@@ -1458,6 +1591,7 @@
bool ignore_case,
bool multiline,
Handle<String> pattern,
+ Handle<String> sample_subject,
bool is_ascii);
static void DotPrint(const char* label, RegExpNode* node, bool
ignore_case);
=======================================
--- /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Fri Mar
30 00:43:48 2012
+++ /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Mon Apr
2 02:38:07 2012
@@ -542,9 +542,13 @@
void RegExpMacroAssemblerX64::CheckCharacterAfterAnd(uint32_t c,
uint32_t mask,
Label* on_equal) {
- __ movl(rax, current_character());
- __ and_(rax, Immediate(mask));
- __ cmpl(rax, Immediate(c));
+ if (c == 0) {
+ __ testl(current_character(), Immediate(mask));
+ } else {
+ __ movl(rax, Immediate(mask));
+ __ and_(rax, current_character());
+ __ cmpl(rax, Immediate(c));
+ }
BranchOrBacktrack(equal, on_equal);
}
@@ -552,9 +556,13 @@
void RegExpMacroAssemblerX64::CheckNotCharacterAfterAnd(uint32_t c,
uint32_t mask,
Label*
on_not_equal) {
- __ movl(rax, current_character());
- __ and_(rax, Immediate(mask));
- __ cmpl(rax, Immediate(c));
+ if (c == 0) {
+ __ testl(current_character(), Immediate(mask));
+ } else {
+ __ movl(rax, Immediate(mask));
+ __ and_(rax, current_character());
+ __ cmpl(rax, Immediate(c));
+ }
BranchOrBacktrack(not_equal, on_not_equal);
}
=======================================
--- /branches/bleeding_edge/test/cctest/test-regexp.cc Mon Feb 20 10:01:21
2012
+++ /branches/bleeding_edge/test/cctest/test-regexp.cc Mon Apr 2 02:38:07
2012
@@ -504,7 +504,10 @@
return NULL;
Handle<String> pattern = isolate->factory()->
NewStringFromUtf8(CStrVector(input));
- RegExpEngine::Compile(&compile_data, false, multiline, pattern,
is_ascii);
+ Handle<String> sample_subject =
+ isolate->factory()->NewStringFromUtf8(CStrVector(""));
+ RegExpEngine::Compile(
+ &compile_data, false, multiline, pattern, sample_subject, is_ascii);
return compile_data.node;
}
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev