Author: [EMAIL PROTECTED]
Date: Wed Nov 19 22:29:34 2008
New Revision: 796

Modified:
    branches/experimental/regexp2000/src/jsregexp.cc
    branches/experimental/regexp2000/src/parser.cc
    branches/experimental/regexp2000/test/cctest/test-regexp.cc

Log:
Fixed issues 156, 157, 158 and disabled error reporting on malformed
quantifier prefixes ({...}).


Modified: branches/experimental/regexp2000/src/jsregexp.cc
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.cc    (original)
+++ branches/experimental/regexp2000/src/jsregexp.cc    Wed Nov 19 22:29:34 2008
@@ -1746,6 +1746,12 @@
  };


+static const int kLineTerminatorRangeCount = 6;
+static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = {
+  0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029
+};
+
+
  static void AddClass(const uc16* elmv,
                       int elmc,
                       ZoneList<CharacterRange>* ranges) {
@@ -1794,6 +1800,14 @@
        AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
        break;
      case '.':
+      AddClassNegated(kLineTerminatorRanges,
+                      kLineTerminatorRangeCount,
+                      ranges);
+      break;
+    // This is not a character range as defined by the spec but a
+    // convenient shorthand for a character class that matches any
+    // character.
+    case '*':
        ranges->Add(CharacterRange::Everything());
        break;
      default:
@@ -2211,7 +2225,7 @@
    RegExpNode* node = RegExpQuantifier::ToNode(0,
                                                RegExpQuantifier::kInfinity,
                                                false,
-                                              new RegExpCharacterClass('.'),
+                                              new RegExpCharacterClass('*'),
                                                &compiler,
                                                captured_body,
                                                compiler.backtrack());

Modified: branches/experimental/regexp2000/src/parser.cc
==============================================================================
--- branches/experimental/regexp2000/src/parser.cc      (original)
+++ branches/experimental/regexp2000/src/parser.cc      Wed Nov 19 22:29:34 2008
@@ -320,7 +320,7 @@
   private:
    void FlushCharacters();
    void FlushText();
-  bool FlushTerms();
+  void FlushTerms();
    bool pending_empty_;
    ZoneList<uc16>* characters_;
    BufferedZoneList<RegExpTree, 2> terms_;
@@ -410,20 +410,17 @@


  void RegExpBuilder::NewAlternative() {
-  if (!FlushTerms()) {
-    alternatives_.Add(RegExpEmpty::GetInstance());
-  }
+  FlushTerms();
  }


-bool RegExpBuilder::FlushTerms() {
+void RegExpBuilder::FlushTerms() {
    FlushText();
    int num_terms = terms_.length();
-  if (num_terms == 0) {
-    return false;
-  }
    RegExpTree* alternative;
-  if (num_terms == 1) {
+  if (num_terms == 0) {
+    alternative = RegExpEmpty::GetInstance();
+  } else if (num_terms == 1) {
      alternative = terms_.last();
    } else {
      alternative = new RegExpAlternative(terms_.GetList());
@@ -431,7 +428,6 @@
    alternatives_.Add(alternative);
    terms_.Clear();
    LAST(ADD_NONE);
-  return true;
  }


@@ -506,7 +502,7 @@

    // Parses a {...,...} quantifier and stores the range in the given
    // out parameters.
-  void* ParseIntervalQuantifier(int* min_out, int* max_out, bool* ok);
+  bool ParseIntervalQuantifier(int* min_out, int* max_out);

    // Parses and returns a single escaped character.  The character
    // must not be 'b' or 'B' since they are usually handled specially.
@@ -538,7 +534,7 @@
    int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }
    int position() { return next_pos_ - 1; }

-  static const uc32 kEndMarker = unibrow::Utf8::kBadChar;
+  static const uc32 kEndMarker = (1 << 21);
   private:
    uc32 current() { return current_; }
    bool has_more() { return has_more_; }
@@ -3602,8 +3598,7 @@
      case '*':
      case '+':
      case '?':
-    case '{':
-      ReportError(CStrVector("Nothing to repeat."), CHECK_OK);
+      ReportError(CStrVector("Nothing to repeat"), CHECK_OK);
      case '^': {
        Advance();
        RegExpAssertion::Type type =
@@ -3754,13 +3749,20 @@
        }
        has_character_escapes_ = true;
        break;
+    case '{': {
+      int dummy;
+      if (ParseIntervalQuantifier(&dummy, &dummy)) {
+        ReportError(CStrVector("Nothing to repeat"), CHECK_OK);
+      }
+      // fallthrough
+    }
      default:
        builder.AddCharacter(current());
        Advance();
        break;
      }  // end switch(current())

-    has_read_atom:
+   has_read_atom:
      int min;
      int max;
      switch (current()) {
@@ -3785,8 +3787,11 @@
        Advance();
        break;
      case '{':
-      ParseIntervalQuantifier(&min, &max, CHECK_OK);
-      break;
+      if (ParseIntervalQuantifier(&min, &max)) {
+        break;
+      } else {
+        continue;
+      }
      default:
        continue;
      }
@@ -3876,19 +3881,14 @@
  //   { DecimalDigits }
  //   { DecimalDigits , }
  //   { DecimalDigits , DecimalDigits }
-void* RegExpParser::ParseIntervalQuantifier(int* min_out,
-                                            int* max_out,
-                                            bool* ok) {
+bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
    ASSERT_EQ(current(), '{');
-  static const char* kInvalidQuantifier = "Invalid quantifier";
+  int start = position();
    Advance();
    int min = 0;
    if (!IsDecimalDigit(current())) {
-    // JSC allows {} and {,} as quantifiers (and { and } and all
-    // sorts of crazy stuff) but my puny human brain has been unable
-    // to figure out what they mean exactly, if anything.  For now
-    // we follow the spec and report a syntax error.
-    ReportError(CStrVector(kInvalidQuantifier), CHECK_OK);
+    Reset(start);
+    return false;
    }
    while (IsDecimalDigit(current())) {
      min = 10 * min + (current() - '0');
@@ -3909,16 +3909,18 @@
          Advance();
        }
        if (current() != '}') {
-        ReportError(CStrVector(kInvalidQuantifier), CHECK_OK);
+        Reset(start);
+        return false;
        }
        Advance();
      }
    } else {
-    ReportError(CStrVector(kInvalidQuantifier), CHECK_OK);
+    Reset(start);
+    return false;
    }
    *min_out = min;
    *max_out = max;
-  return NULL;
+  return true;
  }


@@ -4152,7 +4154,11 @@
      if (!is_char_class) {
        if (current() == '-') {
          Advance();
-        if (current() == ']') {
+        if (current() == kEndMarker) {
+          // If we reach the end we break out of the loop and let the
+          // following code report an error.
+          break;
+        } else if (current() == ']') {
            ranges->Add(first);
            ranges->Add(CharacterRange::Singleton('-'));
            break;

Modified: branches/experimental/regexp2000/test/cctest/test-regexp.cc
==============================================================================
--- branches/experimental/regexp2000/test/cctest/test-regexp.cc (original)
+++ branches/experimental/regexp2000/test/cctest/test-regexp.cc Wed Nov 19 22:29:34 2008
@@ -219,10 +219,30 @@
    CHECK_ESCAPES("\\1112", true);
    CHECK_ESCAPES("\\0", true);
    CHECK_ESCAPES("(a)\\1", false);
+
+  CHECK_PARSE_EQ("a{}", "'a{}'");
+  CHECK_PARSE_EQ("a{,}", "'a{,}'");
+  CHECK_PARSE_EQ("a{", "'a{'");
+  CHECK_PARSE_EQ("a{z}", "'a{z}'");
+  CHECK_PARSE_EQ("a{1z}", "'a{1z}'");
+  CHECK_PARSE_EQ("a{12z}", "'a{12z}'");
+  CHECK_PARSE_EQ("a{12,", "'a{12,'");
+  CHECK_PARSE_EQ("a{12,3b", "'a{12,3b'");
+  CHECK_PARSE_EQ("{}", "'{}'");
+  CHECK_PARSE_EQ("{,}", "'{,}'");
+  CHECK_PARSE_EQ("{", "'{'");
+  CHECK_PARSE_EQ("{z}", "'{z}'");
+  CHECK_PARSE_EQ("{1z}", "'{1z}'");
+  CHECK_PARSE_EQ("{12z}", "'{12z}'");
+  CHECK_PARSE_EQ("{12,", "'{12,'");
+  CHECK_PARSE_EQ("{12,3b", "'{12,3b'");
  }

  TEST(ParserRegression) {
    CHECK_PARSE_EQ("[A-Z$-][x]", "(! [A-Z $ -] [x])");
+  CHECK_PARSE_EQ("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')");
+  CHECK_PARSE_EQ("{", "'{'");
+  CHECK_PARSE_EQ("a|", "(| 'a' %)");
  }

  static void ExpectError(const char* input,
@@ -243,15 +263,6 @@
    V8::Initialize(NULL);
    const char* kEndBackslash = "\\ at end of pattern";
    ExpectError("\\", kEndBackslash);
-  const char* kInvalidQuantifier = "Invalid quantifier";
-  ExpectError("a{}", kInvalidQuantifier);
-  ExpectError("a{,}", kInvalidQuantifier);
-  ExpectError("a{", kInvalidQuantifier);
-  ExpectError("a{z}", kInvalidQuantifier);
-  ExpectError("a{1z}", kInvalidQuantifier);
-  ExpectError("a{12z}", kInvalidQuantifier);
-  ExpectError("a{12,", kInvalidQuantifier);
-  ExpectError("a{12,3b", kInvalidQuantifier);
    const char* kUnterminatedGroup = "Unterminated group";
    ExpectError("(foo", kUnterminatedGroup);
    const char* kInvalidGroup = "Invalid group";
@@ -263,6 +274,13 @@
    ExpectError("[a-\\w]", kIllegalCharacterClass);
    const char* kEndControl = "\\c at end of pattern";
    ExpectError("\\c", kEndControl);
+  static char* kNothingToRepeat = "Nothing to repeat";
+  ExpectError("*", kNothingToRepeat);
+  ExpectError("?", kNothingToRepeat);
+  ExpectError("+", kNothingToRepeat);
+  ExpectError("{1}", kNothingToRepeat);
+  ExpectError("{1,2}", kNothingToRepeat);
+  ExpectError("{1,}", kNothingToRepeat);
  }


@@ -313,7 +331,13 @@


  static bool Dot(uc16 c) {
-  return true;
+  switch (c) {
+    //   LF           CR           LS           PS
+    case 0x000A: case 0x000D: case 0x2028: case 0x2029:
+      return false;
+    default:
+      return true;
+  }
  }



--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

Reply via email to