Revision: 11445
Author:   [email protected]
Date:     Thu Apr 26 02:11:19 2012
Log:      Regexp: Remove nodes from the regexp that cannot match because
they contain non-ASCII characters and the input string is ASCII.
Remove unused Clone() method.
Review URL: https://chromiumcodereview.appspot.com/10174017
http://code.google.com/p/v8/source/detail?r=11445

Modified:
 /branches/bleeding_edge/src/jsregexp.cc
 /branches/bleeding_edge/src/jsregexp.h
 /branches/bleeding_edge/test/mjsunit/regexp-capture-3.js

=======================================
--- /branches/bleeding_edge/src/jsregexp.cc     Tue Apr 24 02:34:13 2012
+++ /branches/bleeding_edge/src/jsregexp.cc     Thu Apr 26 02:11:19 2012
@@ -2426,15 +2426,9 @@
         QuickCheckDetails::Position* pos =
             details->positions(characters_filled_in);
         uc16 c = quarks[i];
-        if (c > char_mask) {
-          // If we expect a non-ASCII character from an ASCII string,
-          // there is no way we can match. Not even case independent
-          // matching can turn an ASCII character into non-ASCII or
-          // vice versa.
-          details->set_cannot_match();
-          pos->determines_perfectly = false;
-          return;
-        }
+        // We should already have filtered out nodes that have non-ASCII
+        // characters if we are matching against an ASCII string.
+        ASSERT(c <= char_mask);
         if (compiler->ignore_case()) {
           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
@@ -2496,11 +2490,9 @@
         int first_range = 0;
         while (ranges->at(first_range).from() > char_mask) {
           first_range++;
-          if (first_range == ranges->length()) {
-            details->set_cannot_match();
-            pos->determines_perfectly = false;
-            return;
-          }
+          // We should already have filtered out nodes that cannot match
+          // so the first range should be a valid range.
+          ASSERT(first_range != ranges->length());
         }
         CharacterRange range = ranges->at(first_range);
         uc16 from = range.from();
@@ -2629,6 +2621,144 @@
 };


+RegExpNode* SeqRegExpNode::FilterASCII(int depth) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  ASSERT(!info()->visited);
+  VisitMarker marker(info());
+  return FilterSuccessor(depth - 1);
+}
+
+
+RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
+  RegExpNode* next = on_success_->FilterASCII(depth - 1);
+  if (next == NULL) return set_replacement(NULL);
+  on_success_ = next;
+  return set_replacement(this);
+}
+
+
+RegExpNode* TextNode::FilterASCII(int depth) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  ASSERT(!info()->visited);
+  VisitMarker marker(info());
+  int element_count = elms_->length();
+  for (int i = 0; i < element_count; i++) {
+    TextElement elm = elms_->at(i);
+    if (elm.type == TextElement::ATOM) {
+      Vector<const uc16> quarks = elm.data.u_atom->data();
+      for (int j = 0; j < quarks.length(); j++) {
+        // We don't need special handling for case independence
+        // because of the rule that case independence cannot make
+        // a non-ASCII character match an ASCII character.
+        if (quarks[j] > String::kMaxAsciiCharCode) {
+          return set_replacement(NULL);
+        }
+      }
+    } else {
+      ASSERT(elm.type == TextElement::CHAR_CLASS);
+      RegExpCharacterClass* cc = elm.data.u_char_class;
+      ZoneList<CharacterRange>* ranges = cc->ranges();
+      if (!CharacterRange::IsCanonical(ranges)) {
+        CharacterRange::Canonicalize(ranges);
+      }
+      // Now they are in order so we only need to look at the first.
+      int range_count = ranges->length();
+      if (cc->is_negated()) {
+        if (range_count != 0 &&
+            ranges->at(0).from() == 0 &&
+            ranges->at(0).to() >= String::kMaxAsciiCharCode) {
+          return set_replacement(NULL);
+        }
+      } else {
+        if (range_count == 0 ||
+            ranges->at(0).from() > String::kMaxAsciiCharCode) {
+          return set_replacement(NULL);
+        }
+      }
+    }
+  }
+  return FilterSuccessor(depth - 1);
+}
+
+
+RegExpNode* LoopChoiceNode::FilterASCII(int depth) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  if (info()->visited) return this;
+  VisitMarker marker(info());
+
+ RegExpNode* continue_replacement = continue_node_->FilterASCII(depth - 1); + // If we can't continue after the loop then there is no sense in doing the
+  // loop.
+  if (continue_replacement == NULL) return set_replacement(NULL);
+
+  return ChoiceNode::FilterASCII(depth - 1);
+}
+
+
+RegExpNode* ChoiceNode::FilterASCII(int depth) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  if (info()->visited) return this;
+  VisitMarker marker(info());
+  int choice_count = alternatives_->length();
+  int surviving = 0;
+  RegExpNode* survivor = NULL;
+  for (int i = 0; i < choice_count; i++) {
+    GuardedAlternative alternative = alternatives_->at(i);
+    RegExpNode* replacement = alternative.node()->FilterASCII(depth - 1);
+    ASSERT(replacement != this);  // No missing EMPTY_MATCH_CHECK.
+    alternatives_->at(i).set_node(replacement);
+    if (replacement != NULL) {
+      surviving++;
+      survivor = replacement;
+    }
+  }
+  if (surviving < 2) return set_replacement(survivor);
+
+  set_replacement(this);
+  if (surviving == choice_count) {
+    return this;
+  }
+  // Only some of the nodes survived the filtering.  We need to rebuild the
+  // alternatives list.
+  ZoneList<GuardedAlternative>* new_alternatives =
+      new ZoneList<GuardedAlternative>(surviving);
+  for (int i = 0; i < choice_count; i++) {
+    GuardedAlternative alternative = alternatives_->at(i);
+    if (alternative.node() != NULL) {
+      new_alternatives->Add(alternative);
+    }
+  }
+  alternatives_ = new_alternatives;
+  return this;
+}
+
+
+RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth) {
+  if (info()->replacement_calculated) return replacement();
+  if (depth < 0) return this;
+  if (info()->visited) return this;
+  VisitMarker marker(info());
+  // Alternative 0 is the negative lookahead, alternative 1 is what comes
+  // afterwards.
+  RegExpNode* node = alternatives_->at(1).node();
+  RegExpNode* replacement = node->FilterASCII(depth - 1);
+  if (replacement == NULL) return set_replacement(NULL);
+  alternatives_->at(1).set_node(replacement);
+
+  RegExpNode* neg_node = alternatives_->at(0).node();
+  RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1);
+  // If the negative lookahead is always going to fail then
+  // we don't need to check it.
+  if (neg_replacement == NULL) return set_replacement(replacement);
+  alternatives_->at(0).set_node(neg_replacement);
+  return set_replacement(this);
+}
+
+
 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
                                           RegExpCompiler* compiler,
                                           int characters_filled_in,
@@ -5690,6 +5820,9 @@
       node = loop_node;
     }
   }
+  if (is_ascii) node = node->FilterASCII(RegExpCompiler::kMaxRecursion);
+
+  if (node == NULL) node = new EndNode(EndNode::BACKTRACK);
   data->node = node;
   Analysis analysis(ignore_case, is_ascii);
   analysis.EnsureAnalyzed(node);
=======================================
--- /branches/bleeding_edge/src/jsregexp.h      Tue Apr 24 02:34:13 2012
+++ /branches/bleeding_edge/src/jsregexp.h      Thu Apr 26 02:11:19 2012
@@ -225,6 +225,8 @@
 };


+// Represents code units in the range from from_ to to_, both ends are
+// inclusive.
 class CharacterRange {
  public:
   CharacterRange() : from_(0), to_(0) { }
@@ -414,7 +416,8 @@
         follows_newline_interest(false),
         follows_start_interest(false),
         at_end(false),
-        visited(false) { }
+        visited(false),
+        replacement_calculated(false) { }

   // Returns true if the interests and assumptions of this node
   // matches the given one.
@@ -464,6 +467,7 @@

   bool at_end: 1;
   bool visited: 1;
+  bool replacement_calculated: 1;
 };


@@ -519,9 +523,12 @@
 };


+extern int kUninitializedRegExpNodePlaceHolder;
+
+
 class RegExpNode: public ZoneObject {
  public:
-  RegExpNode() : trace_count_(0) {
+  RegExpNode() : replacement_(NULL), trace_count_(0) {
     bm_info_[0] = bm_info_[1] = NULL;
   }
   virtual ~RegExpNode();
@@ -572,6 +579,22 @@
       int offset, BoyerMooreLookahead* bm, bool not_at_start) {
     UNREACHABLE();
   }
+
+  // If we know that the input is ASCII then there are some nodes that can
+  // never match.  This method returns a node that can be substituted for
+  // itself, or NULL if the node can never match.
+  virtual RegExpNode* FilterASCII(int depth) { return this; }
+  // Helper for FilterASCII.
+  RegExpNode* replacement() {
+    ASSERT(info()->replacement_calculated);
+    return replacement_;
+  }
+  RegExpNode* set_replacement(RegExpNode* replacement) {
+    info()->replacement_calculated = true;
+    replacement_ =  replacement;
+    return replacement;  // For convenience.
+  }
+
// We want to avoid recalculating the lookahead info, so we store it on the // node. Only info that is for this node is stored. We can tell that the // info is for this node when offset == 0, so the information is calculated
@@ -596,14 +619,10 @@

  protected:
   enum LimitResult { DONE, CONTINUE };
+  RegExpNode* replacement_;

   LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);

-  // Returns a clone of this node initialized using the copy constructor
-  // of its concrete class.  Note that the node may have to be pre-
-  // processed before it is on a usable state.
-  virtual RegExpNode* Clone() = 0;
-
   void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
     bm_info_[not_at_start ? 1 : 0] = bm;
   }
@@ -655,11 +674,16 @@
       : on_success_(on_success) { }
   RegExpNode* on_success() { return on_success_; }
   void set_on_success(RegExpNode* node) { on_success_ = node; }
+  virtual RegExpNode* FilterASCII(int depth);
   virtual void FillInBMInfo(
       int offset, BoyerMooreLookahead* bm, bool not_at_start) {
     on_success_->FillInBMInfo(offset, bm, not_at_start);
     if (offset == 0) set_bm_info(not_at_start, bm);
   }
+
+ protected:
+  RegExpNode* FilterSuccessor(int depth);
+
  private:
   RegExpNode* on_success_;
 };
@@ -711,7 +735,6 @@
   Type type() { return type_; }
   // TODO(erikcorry): We should allow some action nodes in greedy loops.
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
-  virtual ActionNode* Clone() { return new ActionNode(*this); }

  private:
   union {
@@ -778,12 +801,8 @@
       RegExpCompiler* compiler);
   virtual void FillInBMInfo(
       int offset, BoyerMooreLookahead* bm, bool not_at_start);
-  virtual TextNode* Clone() {
-    TextNode* result = new TextNode(*this);
-    result->CalculateOffsets();
-    return result;
-  }
   void CalculateOffsets();
+  virtual RegExpNode* FilterASCII(int depth);

  private:
   enum TextEmitPassType {
@@ -842,7 +861,6 @@
                                     bool not_at_start);
   virtual void FillInBMInfo(
       int offset, BoyerMooreLookahead* bm, bool not_at_start);
-  virtual AssertionNode* Clone() { return new AssertionNode(*this); }
   AssertionNodeType type() { return type_; }
   void set_type(AssertionNodeType type) { type_ = type; }

@@ -881,7 +899,6 @@
   }
   virtual void FillInBMInfo(
       int offset, BoyerMooreLookahead* bm, bool not_at_start);
- virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }

  private:
   int start_reg_;
@@ -910,7 +927,7 @@
     // Returning 0 from EatsAtLeast should ensure we never get here.
     UNREACHABLE();
   }
-  virtual EndNode* Clone() { return new EndNode(*this); }
+
  private:
   Action action_;
 };
@@ -997,13 +1014,13 @@
                                     bool not_at_start);
   virtual void FillInBMInfo(
       int offset, BoyerMooreLookahead* bm, bool not_at_start);
-  virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }

   bool being_calculated() { return being_calculated_; }
   bool not_at_start() { return not_at_start_; }
   void set_not_at_start() { not_at_start_ = true; }
   void set_being_calculated(bool b) { being_calculated_ = b; }
virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; }
+  virtual RegExpNode* FilterASCII(int depth);

  protected:
   int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
@@ -1056,6 +1073,7 @@
// characters, but on a negative lookahead the negative branch did not take
   // part in that calculation (EatsAtLeast) so the assumptions don't hold.
virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
+  virtual RegExpNode* FilterASCII(int depth);
 };


@@ -1078,11 +1096,11 @@
                                     bool not_at_start);
   virtual void FillInBMInfo(
       int offset, BoyerMooreLookahead* bm, bool not_at_start);
-  virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
   RegExpNode* loop_node() { return loop_node_; }
   RegExpNode* continue_node() { return continue_node_; }
   bool body_can_be_zero_length() { return body_can_be_zero_length_; }
   virtual void Accept(NodeVisitor* visitor);
+  virtual RegExpNode* FilterASCII(int depth);

  private:
   // AddAlternative is made private for loop nodes because alternatives
=======================================
--- /branches/bleeding_edge/test/mjsunit/regexp-capture-3.js Wed Apr 25 04:35:32 2012 +++ /branches/bleeding_edge/test/mjsunit/regexp-capture-3.js Thu Apr 26 02:11:19 2012
@@ -156,3 +156,34 @@

 a = "foo bar baz".replace(/^|bar/g, "*");
 assertEquals("*foo * baz", a);
+
+// We test FilterASCII using regexps that will backtrack forever.  Since
+// a regexp with a non-ASCII character in it can never match an ASCII
+// string we can test that the relevant node is removed by verifying that
+// there is no hang.
+function NoHang(re) {
+  print(re);
+  "This is an ASCII string that could take forever".match(re);
+}
+
+
+NoHang(/(((.*)*)*x)å/); // Continuation after loop is filtered, so is loop.
+NoHang(/(((.*)*)*å)foo/);  // Body of loop filtered.
+NoHang(/å(((.*)*)*x)/); // Everything after a filtered character is filtered. +NoHang(/(((.*)*)*x)å/); // Everything before a filtered character is filtered. +NoHang(/[æøå](((.*)*)*x)/); // Everything after a filtered class is filtered. +NoHang(/(((.*)*)*x)[æøå]/); // Everything before a filtered class is filtered.
+NoHang(/[^\x00-\x7f](((.*)*)*x)/);   // After negated class.
+NoHang(/(((.*)*)*x)[^\x00-\x7f]/);   // Before negated class.
+NoHang(/(?!(((.*)*)*x)å)foo/);  // Negative lookahead is filtered.
+NoHang(/(?!(((.*)*)*x))å/);  // Continuation branch of negative lookahead.
+NoHang(/(?=(((.*)*)*x)å)foo/);  // Positive lookahead is filtered.
+NoHang(/(?=(((.*)*)*x))å/);  // Continuation branch of positive lookahead.
+NoHang(/(?=å)(((.*)*)*x)/); // Positive lookahead also prunes continuation.
+NoHang(/(æ|ø|å)(((.*)*)*x)/);  // All branches of alternation are filtered.
+NoHang(/(a|b|(((.*)*)*x))å/);  // 1 out of 3 branches pruned.
+NoHang(/(a|(((.*)*)*x)ø|(((.*)*)*x)å)/);  // 2 out of 3 branches pruned.
+
+var s = "Don't prune based on a repetition of length 0";
+assertEquals(null, s.match(/å{1,1}prune/));
+assertEquals("prune", (s.match(/å{0,0}prune/)[0]));

--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

Reply via email to