[v8-dev] [v8 commit] r2345 - Changed RegExp parser to use a recursive data structure instead of stack-based recursion.

codesite-noreply Fri, 03 Jul 2009 01:31:39 -0700

Author: [email protected]
Date: Fri Jul  3 01:18:35 2009
New Revision: 2345


Modified:
    branches/bleeding_edge/src/ast.h
    branches/bleeding_edge/src/parser.cc
    branches/bleeding_edge/test/cctest/test-regexp.cc

Log:
Changed RegExp parser to use a recursive data structure instead of  
stack-based recursion.
Shouldn't run out of stack space while parsing deeply nested regexps.
Might be a little faster.

Review URL: http://codereview.chromium.org/149069


Modified: branches/bleeding_edge/src/ast.h
==============================================================================
--- branches/bleeding_edge/src/ast.h    (original)
+++ branches/bleeding_edge/src/ast.h    Fri Jul  3 01:18:35 2009
@@ -1575,16 +1575,10 @@
  };


-enum CaptureAvailability {
-  CAPTURE_AVAILABLE,
-  CAPTURE_UNREACHABLE,
-  CAPTURE_PERMANENTLY_UNREACHABLE
-};
-
  class RegExpCapture: public RegExpTree {
   public:
    explicit RegExpCapture(RegExpTree* body, int index)
-      : body_(body), index_(index), available_(CAPTURE_AVAILABLE) { }
+      : body_(body), index_(index) { }
    virtual void* Accept(RegExpVisitor* visitor, void* data);
    virtual RegExpNode* ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success);
@@ -1600,16 +1594,11 @@
    virtual int max_match() { return body_->max_match(); }
    RegExpTree* body() { return body_; }
    int index() { return index_; }
-  inline CaptureAvailability available() { return available_; }
-  inline void set_available(CaptureAvailability availability) {
-    available_ = availability;
-  }
    static int StartRegister(int index) { return index * 2; }
    static int EndRegister(int index) { return index * 2 + 1; }
   private:
    RegExpTree* body_;
    int index_;
-  CaptureAvailability available_;
  };



Modified: branches/bleeding_edge/src/parser.cc
==============================================================================
--- branches/bleeding_edge/src/parser.cc        (original)
+++ branches/bleeding_edge/src/parser.cc        Fri Jul  3 01:18:35 2009
@@ -361,7 +361,7 @@
  };

  // Accumulates RegExp atoms and assertions into lists of terms and  
alternatives.
-class RegExpBuilder {
+class RegExpBuilder: public ZoneObject {
   public:
    RegExpBuilder();
    void AddCharacter(uc16 character);
@@ -392,7 +392,10 @@


  RegExpBuilder::RegExpBuilder()
-  : pending_empty_(false), characters_(NULL), terms_(), alternatives_()
+  : pending_empty_(false),
+    characters_(NULL),
+    terms_(),
+    alternatives_()
  #ifdef DEBUG
    , last_added_(ADD_NONE)
  #endif
@@ -594,6 +597,44 @@
    static const int kMaxCaptures = 1 << 16;
    static const uc32 kEndMarker = (1 << 21);
   private:
+  enum SubexpressionType {
+    INITIAL,
+    CAPTURE,  // All positive values represent captures.
+    POSITIVE_LOOKAHEAD,
+    NEGATIVE_LOOKAHEAD,
+    GROUPING
+  };
+
+  class RegExpParserState : public ZoneObject {
+   public:
+    RegExpParserState(RegExpParserState* previous_state,
+                      SubexpressionType group_type,
+                      int disjunction_capture_index)
+        : previous_state_(previous_state),
+          builder_(new RegExpBuilder()),
+          group_type_(group_type),
+          disjunction_capture_index_(disjunction_capture_index) {}
+    // Parser state of containing expression, if any.
+    RegExpParserState* previous_state() { return previous_state_; }
+    bool IsSubexpression() { return previous_state_ != NULL; }
+    // RegExpBuilder building this regexp's AST.
+    RegExpBuilder* builder() { return builder_; }
+    // Type of regexp being parsed (parenthesized group or entire regexp).
+    SubexpressionType group_type() { return group_type_; }
+    // Index in captures array of first capture in this sub-expression, if  
any.
+    // Also the capture index of this sub-expression itself, if group_type
+    // is CAPTURE.
+    int capture_index() { return disjunction_capture_index_; }
+   private:
+    // Linked list implementation of stack of states.
+    RegExpParserState* previous_state_;
+    // Builder for the stored disjunction.
+    RegExpBuilder* builder_;
+    // Stored disjunction type (capture, look-ahead or grouping), if any.
+    SubexpressionType group_type_;
+    // Stored disjunction's capture index (if any).
+    int disjunction_capture_index_;
+  };

    uc32 current() { return current_; }
    bool has_more() { return has_more_; }
@@ -601,7 +642,6 @@
    uc32 Next();
    FlatStringReader* in() { return in_; }
    void ScanForCaptures();
-  bool CaptureAvailable(int index);
    uc32 current_;
    bool has_more_;
    bool multiline_;
@@ -3820,14 +3860,6 @@
  }


-bool RegExpParser::CaptureAvailable(int index) {
-  if (captures_ == NULL) return false;
-  if (index >= captures_->length()) return false;
-  RegExpCapture* capture = captures_->at(index);
-  return capture != NULL && capture->available() == CAPTURE_AVAILABLE;
-}
-
-
  // Disjunction ::
  //   Alternative
  //   Alternative | Disjunction
@@ -3839,24 +3871,60 @@
  //   Atom
  //   Atom Quantifier
  RegExpTree* RegExpParser::ParseDisjunction() {
-  RegExpBuilder builder;
-  int capture_start_index = captures_started();
+  // Used to store current state while parsing subexpressions.
+  RegExpParserState initial_state(NULL, INITIAL, 0);
+  RegExpParserState* stored_state = &initial_state;
+  // Cache the builder in a local variable for quick access.
+  RegExpBuilder* builder = initial_state.builder();
    while (true) {
      switch (current()) {
      case kEndMarker:
-    case ')':
-      return builder.ToRegExp();
-    case '|': {
+      if (stored_state->IsSubexpression()) {
+        // Inside a parenthesized group when hitting end of input.
+        ReportError(CStrVector("Unterminated group") CHECK_FAILED);
+      }
+      ASSERT_EQ(INITIAL, stored_state->group_type());
+      // Parsing completed successfully.
+      return builder->ToRegExp();
+    case ')': {
+      if (!stored_state->IsSubexpression()) {
+        ReportError(CStrVector("Unexpected ')'") CHECK_FAILED);
+      }
+      ASSERT_NE(INITIAL, stored_state->group_type());
+
        Advance();
-      builder.NewAlternative();
-      int capture_new_alt_start_index = captures_started();
-      for (int i = capture_start_index; i < capture_new_alt_start_index;  
i++) {
-        RegExpCapture* capture = captures_->at(i);
-        if (capture->available() == CAPTURE_AVAILABLE) {
-          capture->set_available(CAPTURE_UNREACHABLE);
-        }
+      // End disjunction parsing and convert builder content to new single
+      // regexp atom.
+      RegExpTree* body = builder->ToRegExp();
+
+      int end_capture_index = captures_started();
+
+      int capture_index = stored_state->capture_index();
+      SubexpressionType type = stored_state->group_type();
+
+      // Restore previous state.
+      stored_state = stored_state->previous_state();
+      builder = stored_state->builder();
+
+      // Build result of subexpression.
+      if (type == CAPTURE) {
+        RegExpCapture* capture = new RegExpCapture(body, capture_index);
+        captures_->at(capture_index - 1) = capture;
+        body = capture;
+      } else if (type != GROUPING) {
+        ASSERT(type == POSITIVE_LOOKAHEAD || type == NEGATIVE_LOOKAHEAD);
+        bool is_positive = (type == POSITIVE_LOOKAHEAD);
+        body = new RegExpLookahead(body,
+                                   is_positive,
+                                   end_capture_index - capture_index,
+                                   capture_index);
        }
-      capture_start_index = capture_new_alt_start_index;
+      builder->AddAtom(body);
+      break;
+    }
+    case '|': {
+      Advance();
+      builder->NewAlternative();
        continue;
      }
      case '*':
@@ -3866,10 +3934,10 @@
      case '^': {
        Advance();
        if (multiline_) {
-        builder.AddAssertion(
+        builder->AddAssertion(
              new RegExpAssertion(RegExpAssertion::START_OF_LINE));
        } else {
-        builder.AddAssertion(
+        builder->AddAssertion(
              new RegExpAssertion(RegExpAssertion::START_OF_INPUT));
          set_contains_anchor();
        }
@@ -3880,7 +3948,7 @@
        RegExpAssertion::Type type =
            multiline_ ? RegExpAssertion::END_OF_LINE :
                         RegExpAssertion::END_OF_INPUT;
-      builder.AddAssertion(new RegExpAssertion(type));
+      builder->AddAssertion(new RegExpAssertion(type));
        continue;
      }
      case '.': {
@@ -3889,17 +3957,47 @@
        ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
        CharacterRange::AddClassEscape('.', ranges);
        RegExpTree* atom = new RegExpCharacterClass(ranges, false);
-      builder.AddAtom(atom);
+      builder->AddAtom(atom);
        break;
      }
      case '(': {
-      RegExpTree* atom = ParseGroup(CHECK_FAILED);
-      builder.AddAtom(atom);
+      SubexpressionType type = CAPTURE;
+      Advance();
+      if (current() == '?') {
+        switch (Next()) {
+          case ':':
+            type = GROUPING;
+            break;
+          case '=':
+            type = POSITIVE_LOOKAHEAD;
+            break;
+          case '!':
+            type = NEGATIVE_LOOKAHEAD;
+            break;
+          default:
+            ReportError(CStrVector("Invalid group") CHECK_FAILED);
+            break;
+        }
+        Advance(2);
+      } else {
+        if (captures_ == NULL) {
+          captures_ = new ZoneList<RegExpCapture*>(2);
+        }
+        if (captures_started() >= kMaxCaptures) {
+          ReportError(CStrVector("Too many captures") CHECK_FAILED);
+        }
+        captures_->Add(NULL);
+      }
+      // Store current state and begin new disjunction parsing.
+      stored_state = new RegExpParserState(stored_state,
+                                           type,
+                                           captures_started());
+      builder = stored_state->builder();
        break;
      }
      case '[': {
        RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
-      builder.AddAtom(atom);
+      builder->AddAtom(atom);
        break;
      }
      // Atom ::
@@ -3910,12 +4008,12 @@
          ReportError(CStrVector("\\ at end of pattern") CHECK_FAILED);
        case 'b':
          Advance(2);
-        builder.AddAssertion(
+        builder->AddAssertion(
              new RegExpAssertion(RegExpAssertion::BOUNDARY));
          continue;
        case 'B':
          Advance(2);
-        builder.AddAssertion(
+        builder->AddAssertion(
              new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
          continue;
          // AtomEscape ::
@@ -3929,27 +4027,29 @@
          ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
          CharacterRange::AddClassEscape(c, ranges);
          RegExpTree* atom = new RegExpCharacterClass(ranges, false);
-        builder.AddAtom(atom);
+        builder->AddAtom(atom);
          goto has_read_atom;  // Avoid setting has_character_escapes_.
        }
        case '1': case '2': case '3': case '4': case '5': case '6':
        case '7': case '8': case '9': {
          int index = 0;
          if (ParseBackReferenceIndex(&index)) {
-          if (!CaptureAvailable(index - 1)) {
-            // Prepare to ignore a following quantifier
-            builder.AddEmpty();
+          RegExpCapture* capture = NULL;
+          if (captures_ != NULL && index <= captures_->length()) {
+            capture = captures_->at(index - 1);
+          }
+          if (capture == NULL) {
+            builder->AddEmpty();
              goto has_read_atom;
            }
-          RegExpCapture* capture = captures_->at(index - 1);
            RegExpTree* atom = new RegExpBackReference(capture);
-          builder.AddAtom(atom);
+          builder->AddAtom(atom);
            goto has_read_atom;  // Avoid setting has_character_escapes_.
          }
          uc32 first_digit = Next();
          if (first_digit == '8' || first_digit == '9') {
            // Treat as identity escape
-          builder.AddCharacter(first_digit);
+          builder->AddCharacter(first_digit);
            Advance(2);
            break;
          }
@@ -3958,44 +4058,44 @@
        case '0': {
          Advance();
          uc32 octal = ParseOctalLiteral();
-        builder.AddCharacter(octal);
+        builder->AddCharacter(octal);
          break;
        }
        // ControlEscape :: one of
        //   f n r t v
        case 'f':
          Advance(2);
-        builder.AddCharacter('\f');
+        builder->AddCharacter('\f');
          break;
        case 'n':
          Advance(2);
-        builder.AddCharacter('\n');
+        builder->AddCharacter('\n');
          break;
        case 'r':
          Advance(2);
-        builder.AddCharacter('\r');
+        builder->AddCharacter('\r');
          break;
        case 't':
          Advance(2);
-        builder.AddCharacter('\t');
+        builder->AddCharacter('\t');
          break;
        case 'v':
          Advance(2);
-        builder.AddCharacter('\v');
+        builder->AddCharacter('\v');
          break;
        case 'c': {
          Advance(2);
          uc32 control = ParseControlLetterEscape();
-        builder.AddCharacter(control);
+        builder->AddCharacter(control);
          break;
        }
        case 'x': {
          Advance(2);
          uc32 value;
          if (ParseHexEscape(2, &value)) {
-          builder.AddCharacter(value);
+          builder->AddCharacter(value);
          } else {
-          builder.AddCharacter('x');
+          builder->AddCharacter('x');
          }
          break;
        }
@@ -4003,15 +4103,15 @@
          Advance(2);
          uc32 value;
          if (ParseHexEscape(4, &value)) {
-          builder.AddCharacter(value);
+          builder->AddCharacter(value);
          } else {
-          builder.AddCharacter('u');
+          builder->AddCharacter('u');
          }
          break;
        }
        default:
          // Identity escape.
-        builder.AddCharacter(Next());
+        builder->AddCharacter(Next());
          Advance(2);
          break;
        }
@@ -4024,7 +4124,7 @@
        // fallthrough
      }
      default:
-      builder.AddCharacter(current());
+      builder->AddCharacter(current());
        Advance();
        break;
      }  // end switch(current())
@@ -4071,7 +4171,7 @@
        is_greedy = false;
        Advance();
      }
-    builder.AddQuantifierToAtom(min, max, is_greedy);
+    builder->AddQuantifierToAtom(min, max, is_greedy);
    }
  }

@@ -4379,73 +4479,6 @@
      }
    }
    return 0;
-}
-
-
-RegExpTree* RegExpParser::ParseGroup() {
-  ASSERT_EQ(current(), '(');
-  char type = '(';
-  Advance();
-  if (current() == '?') {
-    switch (Next()) {
-      case ':': case '=': case '!':
-        type = Next();
-        Advance(2);
-        break;
-      default:
-        ReportError(CStrVector("Invalid group") CHECK_FAILED);
-        break;
-    }
-  } else {
-    if (captures_ == NULL) {
-      captures_ = new ZoneList<RegExpCapture*>(2);
-    }
-    if (captures_started() >= kMaxCaptures) {
-      ReportError(CStrVector("Too many captures") CHECK_FAILED);
-    }
-    captures_->Add(NULL);
-  }
-  int capture_index = captures_started();
-  RegExpTree* body = ParseDisjunction(CHECK_FAILED);
-  if (current() != ')') {
-    ReportError(CStrVector("Unterminated group") CHECK_FAILED);
-  }
-  Advance();
-
-  int end_capture_index = captures_started();
-  if (type == '!') {
-    // Captures inside a negative lookahead are never available outside it.
-    for (int i = capture_index; i < end_capture_index; i++) {
-      RegExpCapture* capture = captures_->at(i);
-      ASSERT(capture != NULL);
-      capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);
-    }
-  } else {
-    // Captures temporarily unavailable because they are in different
-    // alternatives are all available after the disjunction.
-    for (int i = capture_index; i < end_capture_index; i++) {
-      RegExpCapture* capture = captures_->at(i);
-      ASSERT(capture != NULL);
-      if (capture->available() == CAPTURE_UNREACHABLE) {
-        capture->set_available(CAPTURE_AVAILABLE);
-      }
-    }
-  }
-
-  if (type == '(') {
-    RegExpCapture* capture = new RegExpCapture(body, capture_index);
-    captures_->at(capture_index - 1) = capture;
-    return capture;
-  } else if (type == ':') {
-    return body;
-  } else {
-    ASSERT(type == '=' || type == '!');
-    bool is_positive = (type == '=');
-    return new RegExpLookahead(body,
-                               is_positive,
-                               end_capture_index - capture_index,
-                               capture_index);
-  }
  }



Modified: branches/bleeding_edge/test/cctest/test-regexp.cc
==============================================================================
--- branches/bleeding_edge/test/cctest/test-regexp.cc   (original)
+++ branches/bleeding_edge/test/cctest/test-regexp.cc   Fri Jul  3 01:18:35  
2009
@@ -204,8 +204,8 @@
    CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')");
    CHECK_PARSE_EQ("(?!a)?a", "'a'");
    CHECK_PARSE_EQ("\\1(a)", "(^ 'a')");
-  CHECK_PARSE_EQ("(?!(a))\\1", "(-> - (^ 'a'))");
-  CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: (^ 'a') (<- 1)))");
+  CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))");
+  CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<-  
1))");
    CHECK_PARSE_EQ("[\\0]", "[\\x00]");
    CHECK_PARSE_EQ("[\\11]", "[\\x09]");
    CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]");

--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

[v8-dev] [v8 commit] r2345 - Changed RegExp parser to use a recursive data structure instead of stack-based recursion.

Reply via email to