Revision: 11445
Author: [email protected]
Date: Thu Apr 26 02:11:19 2012
Log: Regexp: Remove nodes from the regexp that cannot match because
they contain non-ASCII characters and the input string is ASCII.
Remove unused Clone() method.
Review URL: https://chromiumcodereview.appspot.com/10174017
http://code.google.com/p/v8/source/detail?r=11445
Modified:
/branches/bleeding_edge/src/jsregexp.cc
/branches/bleeding_edge/src/jsregexp.h
/branches/bleeding_edge/test/mjsunit/regexp-capture-3.js
=======================================
--- /branches/bleeding_edge/src/jsregexp.cc Tue Apr 24 02:34:13 2012
+++ /branches/bleeding_edge/src/jsregexp.cc Thu Apr 26 02:11:19 2012
@@ -2426,15 +2426,9 @@
QuickCheckDetails::Position* pos =
details->positions(characters_filled_in);
uc16 c = quarks[i];
- if (c > char_mask) {
- // If we expect a non-ASCII character from an ASCII string,
- // there is no way we can match. Not even case independent
- // matching can turn an ASCII character into non-ASCII or
- // vice versa.
- details->set_cannot_match();
- pos->determines_perfectly = false;
- return;
- }
+ // We should already have filtered out nodes that have non-ASCII
+ // characters if we are matching against an ASCII string.
+ ASSERT(c <= char_mask);
if (compiler->ignore_case()) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int length = GetCaseIndependentLetters(isolate, c,
compiler->ascii(),
@@ -2496,11 +2490,9 @@
int first_range = 0;
while (ranges->at(first_range).from() > char_mask) {
first_range++;
- if (first_range == ranges->length()) {
- details->set_cannot_match();
- pos->determines_perfectly = false;
- return;
- }
+ // We should already have filtered out nodes that cannot match
+ // so the first range should be a valid range.
+ ASSERT(first_range != ranges->length());
}
CharacterRange range = ranges->at(first_range);
uc16 from = range.from();
@@ -2629,6 +2621,144 @@
};
+RegExpNode* SeqRegExpNode::FilterASCII(int depth) {
+ if (info()->replacement_calculated) return replacement();
+ if (depth < 0) return this;
+ ASSERT(!info()->visited);
+ VisitMarker marker(info());
+ return FilterSuccessor(depth - 1);
+}
+
+
+RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
+ RegExpNode* next = on_success_->FilterASCII(depth - 1);
+ if (next == NULL) return set_replacement(NULL);
+ on_success_ = next;
+ return set_replacement(this);
+}
+
+
+RegExpNode* TextNode::FilterASCII(int depth) {
+ if (info()->replacement_calculated) return replacement();
+ if (depth < 0) return this;
+ ASSERT(!info()->visited);
+ VisitMarker marker(info());
+ int element_count = elms_->length();
+ for (int i = 0; i < element_count; i++) {
+ TextElement elm = elms_->at(i);
+ if (elm.type == TextElement::ATOM) {
+ Vector<const uc16> quarks = elm.data.u_atom->data();
+ for (int j = 0; j < quarks.length(); j++) {
+ // We don't need special handling for case independence
+ // because of the rule that case independence cannot make
+ // a non-ASCII character match an ASCII character.
+ if (quarks[j] > String::kMaxAsciiCharCode) {
+ return set_replacement(NULL);
+ }
+ }
+ } else {
+ ASSERT(elm.type == TextElement::CHAR_CLASS);
+ RegExpCharacterClass* cc = elm.data.u_char_class;
+ ZoneList<CharacterRange>* ranges = cc->ranges();
+ if (!CharacterRange::IsCanonical(ranges)) {
+ CharacterRange::Canonicalize(ranges);
+ }
+ // Now they are in order so we only need to look at the first.
+ int range_count = ranges->length();
+ if (cc->is_negated()) {
+ if (range_count != 0 &&
+ ranges->at(0).from() == 0 &&
+ ranges->at(0).to() >= String::kMaxAsciiCharCode) {
+ return set_replacement(NULL);
+ }
+ } else {
+ if (range_count == 0 ||
+ ranges->at(0).from() > String::kMaxAsciiCharCode) {
+ return set_replacement(NULL);
+ }
+ }
+ }
+ }
+ return FilterSuccessor(depth - 1);
+}
+
+
+RegExpNode* LoopChoiceNode::FilterASCII(int depth) {
+ if (info()->replacement_calculated) return replacement();
+ if (depth < 0) return this;
+ if (info()->visited) return this;
+ VisitMarker marker(info());
+
+ RegExpNode* continue_replacement = continue_node_->FilterASCII(depth -
1);
+ // If we can't continue after the loop then there is no sense in doing
the
+ // loop.
+ if (continue_replacement == NULL) return set_replacement(NULL);
+
+ return ChoiceNode::FilterASCII(depth - 1);
+}
+
+
+RegExpNode* ChoiceNode::FilterASCII(int depth) {
+ if (info()->replacement_calculated) return replacement();
+ if (depth < 0) return this;
+ if (info()->visited) return this;
+ VisitMarker marker(info());
+ int choice_count = alternatives_->length();
+ int surviving = 0;
+ RegExpNode* survivor = NULL;
+ for (int i = 0; i < choice_count; i++) {
+ GuardedAlternative alternative = alternatives_->at(i);
+ RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);
+ ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
+ alternatives_->at(i).set_node(replacement);
+ if (replacement != NULL) {
+ surviving++;
+ survivor = replacement;
+ }
+ }
+ if (surviving < 2) return set_replacement(survivor);
+
+ set_replacement(this);
+ if (surviving == choice_count) {
+ return this;
+ }
+ // Only some of the nodes survived the filtering. We need to rebuild the
+ // alternatives list.
+ ZoneList<GuardedAlternative>* new_alternatives =
+ new ZoneList<GuardedAlternative>(surviving);
+ for (int i = 0; i < choice_count; i++) {
+ GuardedAlternative alternative = alternatives_->at(i);
+ if (alternative.node() != NULL) {
+ new_alternatives->Add(alternative);
+ }
+ }
+ alternatives_ = new_alternatives;
+ return this;
+}
+
+
+RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
+ if (info()->replacement_calculated) return replacement();
+ if (depth < 0) return this;
+ if (info()->visited) return this;
+ VisitMarker marker(info());
+ // Alternative 0 is the negative lookahead, alternative 1 is what comes
+ // afterwards.
+ RegExpNode* node = alternatives_->at(1).node();
+ RegExpNode* replacement = node->FilterASCII(depth - 1);
+ if (replacement == NULL) return set_replacement(NULL);
+ alternatives_->at(1).set_node(replacement);
+
+ RegExpNode* neg_node = alternatives_->at(0).node();
+ RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);
+ // If the negative lookahead is always going to fail then
+ // we don't need to check it.
+ if (neg_replacement == NULL) return set_replacement(replacement);
+ alternatives_->at(0).set_node(neg_replacement);
+ return set_replacement(this);
+}
+
+
void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
@@ -5690,6 +5820,9 @@
node = loop_node;
}
}
+ if (is_ascii) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
+
+ if (node == NULL) node = new EndNode(EndNode::BACKTRACK);
data->node = node;
Analysis analysis(ignore_case, is_ascii);
analysis.EnsureAnalyzed(node);
=======================================
--- /branches/bleeding_edge/src/jsregexp.h Tue Apr 24 02:34:13 2012
+++ /branches/bleeding_edge/src/jsregexp.h Thu Apr 26 02:11:19 2012
@@ -225,6 +225,8 @@
};
+// Represents code units in the range from from_ to to_, both ends are
+// inclusive.
class CharacterRange {
public:
CharacterRange() : from_(0), to_(0) { }
@@ -414,7 +416,8 @@
follows_newline_interest(false),
follows_start_interest(false),
at_end(false),
- visited(false) { }
+ visited(false),
+ replacement_calculated(false) { }
// Returns true if the interests and assumptions of this node
// matches the given one.
@@ -464,6 +467,7 @@
bool at_end: 1;
bool visited: 1;
+ bool replacement_calculated: 1;
};
@@ -519,9 +523,12 @@
};
+extern int kUninitializedRegExpNodePlaceHolder;
+
+
class RegExpNode: public ZoneObject {
public:
- RegExpNode() : trace_count_(0) {
+ RegExpNode() : replacement_(NULL), trace_count_(0) {
bm_info_[0] = bm_info_[1] = NULL;
}
virtual ~RegExpNode();
@@ -572,6 +579,22 @@
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
UNREACHABLE();
}
+
+ // If we know that the input is ASCII then there are some nodes that can
+ // never match. This method returns a node that can be substituted for
+ // itself, or NULL if the node can never match.
+ virtual RegExpNode* FilterASCII(int depth) { return this; }
+ // Helper for FilterASCII.
+ RegExpNode* replacement() {
+ ASSERT(info()->replacement_calculated);
+ return replacement_;
+ }
+ RegExpNode* set_replacement(RegExpNode* replacement) {
+ info()->replacement_calculated = true;
+ replacement_ = replacement;
+ return replacement; // For convenience.
+ }
+
// We want to avoid recalculating the lookahead info, so we store it on
the
// node. Only info that is for this node is stored. We can tell that
the
// info is for this node when offset == 0, so the information is
calculated
@@ -596,14 +619,10 @@
protected:
enum LimitResult { DONE, CONTINUE };
+ RegExpNode* replacement_;
LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
- // Returns a clone of this node initialized using the copy constructor
- // of its concrete class. Note that the node may have to be pre-
- // processed before it is on a usable state.
- virtual RegExpNode* Clone() = 0;
-
void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
bm_info_[not_at_start ? 1 : 0] = bm;
}
@@ -655,11 +674,16 @@
: on_success_(on_success) { }
RegExpNode* on_success() { return on_success_; }
void set_on_success(RegExpNode* node) { on_success_ = node; }
+ virtual RegExpNode* FilterASCII(int depth);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start) {
on_success_->FillInBMInfo(offset, bm, not_at_start);
if (offset == 0) set_bm_info(not_at_start, bm);
}
+
+ protected:
+ RegExpNode* FilterSuccessor(int depth);
+
private:
RegExpNode* on_success_;
};
@@ -711,7 +735,6 @@
Type type() { return type_; }
// TODO(erikcorry): We should allow some action nodes in greedy loops.
virtual int GreedyLoopTextLength() { return
kNodeIsTooComplexForGreedyLoops; }
- virtual ActionNode* Clone() { return new ActionNode(*this); }
private:
union {
@@ -778,12 +801,8 @@
RegExpCompiler* compiler);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
- virtual TextNode* Clone() {
- TextNode* result = new TextNode(*this);
- result->CalculateOffsets();
- return result;
- }
void CalculateOffsets();
+ virtual RegExpNode* FilterASCII(int depth);
private:
enum TextEmitPassType {
@@ -842,7 +861,6 @@
bool not_at_start);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
- virtual AssertionNode* Clone() { return new AssertionNode(*this); }
AssertionNodeType type() { return type_; }
void set_type(AssertionNodeType type) { type_ = type; }
@@ -881,7 +899,6 @@
}
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
- virtual BackReferenceNode* Clone() { return new
BackReferenceNode(*this); }
private:
int start_reg_;
@@ -910,7 +927,7 @@
// Returning 0 from EatsAtLeast should ensure we never get here.
UNREACHABLE();
}
- virtual EndNode* Clone() { return new EndNode(*this); }
+
private:
Action action_;
};
@@ -997,13 +1014,13 @@
bool not_at_start);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
- virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }
bool being_calculated() { return being_calculated_; }
bool not_at_start() { return not_at_start_; }
void set_not_at_start() { not_at_start_ = true; }
void set_being_calculated(bool b) { being_calculated_ = b; }
virtual bool try_to_emit_quick_check_for_alternative(int i) { return
true; }
+ virtual RegExpNode* FilterASCII(int depth);
protected:
int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
@@ -1056,6 +1073,7 @@
// characters, but on a negative lookahead the negative branch did not
take
// part in that calculation (EatsAtLeast) so the assumptions don't hold.
virtual bool try_to_emit_quick_check_for_alternative(int i) { return
i != 0; }
+ virtual RegExpNode* FilterASCII(int depth);
};
@@ -1078,11 +1096,11 @@
bool not_at_start);
virtual void FillInBMInfo(
int offset, BoyerMooreLookahead* bm, bool not_at_start);
- virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
RegExpNode* loop_node() { return loop_node_; }
RegExpNode* continue_node() { return continue_node_; }
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
virtual void Accept(NodeVisitor* visitor);
+ virtual RegExpNode* FilterASCII(int depth);
private:
// AddAlternative is made private for loop nodes because alternatives
=======================================
--- /branches/bleeding_edge/test/mjsunit/regexp-capture-3.js Wed Apr 25
04:35:32 2012
+++ /branches/bleeding_edge/test/mjsunit/regexp-capture-3.js Thu Apr 26
02:11:19 2012
@@ -156,3 +156,34 @@
a = "foo bar baz".replace(/^|bar/g, "*");
assertEquals("*foo * baz", a);
+
+// We test FilterASCII using regexps that will backtrack forever. Since
+// a regexp with a non-ASCII character in it can never match an ASCII
+// string we can test that the relevant node is removed by verifying that
+// there is no hang.
+function NoHang(re) {
+ print(re);
+ "This is an ASCII string that could take forever".match(re);
+}
+
+
+NoHang(/(((.*)*)*x)å/); // Continuation after loop is filtered, so is
loop.
+NoHang(/(((.*)*)*å)foo/); // Body of loop filtered.
+NoHang(/å(((.*)*)*x)/); // Everything after a filtered character is
filtered.
+NoHang(/(((.*)*)*x)å/); // Everything before a filtered character is
filtered.
+NoHang(/[æøå](((.*)*)*x)/); // Everything after a filtered class is
filtered.
+NoHang(/(((.*)*)*x)[æøå]/); // Everything before a filtered class is
filtered.
+NoHang(/[^\x00-\x7f](((.*)*)*x)/); // After negated class.
+NoHang(/(((.*)*)*x)[^\x00-\x7f]/); // Before negated class.
+NoHang(/(?!(((.*)*)*x)å)foo/); // Negative lookahead is filtered.
+NoHang(/(?!(((.*)*)*x))å/); // Continuation branch of negative lookahead.
+NoHang(/(?=(((.*)*)*x)å)foo/); // Positive lookahead is filtered.
+NoHang(/(?=(((.*)*)*x))å/); // Continuation branch of positive lookahead.
+NoHang(/(?=å)(((.*)*)*x)/); // Positive lookahead also prunes
continuation.
+NoHang(/(æ|ø|å)(((.*)*)*x)/); // All branches of alternation are filtered.
+NoHang(/(a|b|(((.*)*)*x))å/); // 1 out of 3 branches pruned.
+NoHang(/(a|(((.*)*)*x)ø|(((.*)*)*x)å)/); // 2 out of 3 branches pruned.
+
+var s = "Don't prune based on a repetition of length 0";
+assertEquals(null, s.match(/å{1,1}prune/));
+assertEquals("prune", (s.match(/å{0,0}prune/)[0]));
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev