Revision: 5911
Author: [email protected]
Date: Thu Dec  2 00:02:37 2010
Log: Change RegExp syntax to fail on invalid ranges like [\d-x], [x-\d] and [\d-\d].

The previous behavior was to treat the "-" as verbatim if the range was invalid. This change matches the JSC changeset http://trac.webkit.org/changeset/72813/

Review URL: http://codereview.chromium.org/5464001
http://code.google.com/p/v8/source/detail?r=5911

Modified:
 /branches/bleeding_edge/src/parser.cc
 /branches/bleeding_edge/test/cctest/test-regexp.cc
 /branches/bleeding_edge/test/mjsunit/regexp.js
 /branches/bleeding_edge/test/mjsunit/third_party/regexp-pcre.js
 /branches/bleeding_edge/test/mozilla/mozilla.status

=======================================
--- /branches/bleeding_edge/src/parser.cc       Mon Nov 29 05:24:37 2010
+++ /branches/bleeding_edge/src/parser.cc       Thu Dec  2 00:02:37 2010
@@ -4404,6 +4404,7 @@
 RegExpTree* RegExpParser::ParseCharacterClass() {
   static const char* kUnterminated = "Unterminated character class";
static const char* kRangeOutOfOrder = "Range out of order in character class";
+  static const char* kInvalidRange = "Invalid character range";

   ASSERT_EQ(current(), '[');
   Advance();
@@ -4412,12 +4413,28 @@
     is_negated = true;
     Advance();
   }
+  // A CharacterClass is a sequence of single characters, character class
+  // escapes or ranges. Ranges are on the form "x-y" where x and y are
+  // single characters (and not character class escapes like \s).
+  // A "-" may occur at the start or end of the character class (just after
+  // "[" or "[^", or just before "]") without being considered part of a
+  // range. A "-" may also appear as the beginning or end of a range.
+  // I.e., [--+] is valid, so is [!--].
+
   ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
   while (has_more() && current() != ']') {
     uc16 char_class = 0;
     CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
     if (char_class) {
       CharacterRange::AddClassEscape(char_class, ranges);
+      if (current() == '-') {
+        Advance();
+        ranges->Add(CharacterRange::Singleton('-'));
+        if (current() != ']') {
+          ReportError(CStrVector(kInvalidRange) CHECK_FAILED);
+        }
+        break;
+      }
       continue;
     }
     if (current() == '-') {
@@ -4433,10 +4450,7 @@
       }
       CharacterRange next = ParseClassAtom(&char_class CHECK_FAILED);
       if (char_class) {
-        ranges->Add(first);
-        ranges->Add(CharacterRange::Singleton('-'));
-        CharacterRange::AddClassEscape(char_class, ranges);
-        continue;
+        ReportError(CStrVector(kInvalidRange) CHECK_FAILED);
       }
       if (first.from() > next.to()) {
         return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
=======================================
--- /branches/bleeding_edge/test/cctest/test-regexp.cc Wed Oct 27 05:33:48 2010 +++ /branches/bleeding_edge/test/cctest/test-regexp.cc Thu Dec 2 00:02:37 2010
@@ -173,9 +173,6 @@
   CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]");
   CHECK_PARSE_EQ("[\\d]", "[0-9]");
   CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]");
-  CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]");
-  CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]");
-  CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]");
   CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK",
                  "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
   CHECK_PARSE_EQ("\\c!", "'c!'");
=======================================
--- /branches/bleeding_edge/test/mjsunit/regexp.js      Tue Oct 19 07:00:01 2010
+++ /branches/bleeding_edge/test/mjsunit/regexp.js      Thu Dec  2 00:02:37 2010
@@ -110,44 +110,6 @@
 assertFalse(re.test("\x03]"));  // I.e., read as \cc


-// Test that we handle \s and \S correctly inside some bizarre
-// character classes.
-re = /[\s-:]/;
-assertTrue(re.test('-'));
-assertTrue(re.test(':'));
-assertTrue(re.test(' '));
-assertTrue(re.test('\t'));
-assertTrue(re.test('\n'));
-assertFalse(re.test('a'));
-assertFalse(re.test('Z'));
-
-re = /[\S-:]/;
-assertTrue(re.test('-'));
-assertTrue(re.test(':'));
-assertFalse(re.test(' '));
-assertFalse(re.test('\t'));
-assertFalse(re.test('\n'));
-assertTrue(re.test('a'));
-assertTrue(re.test('Z'));
-
-re = /[^\s-:]/;
-assertFalse(re.test('-'));
-assertFalse(re.test(':'));
-assertFalse(re.test(' '));
-assertFalse(re.test('\t'));
-assertFalse(re.test('\n'));
-assertTrue(re.test('a'));
-assertTrue(re.test('Z'));
-
-re = /[^\S-:]/;
-assertFalse(re.test('-'));
-assertFalse(re.test(':'));
-assertTrue(re.test(' '));
-assertTrue(re.test('\t'));
-assertTrue(re.test('\n'));
-assertFalse(re.test('a'));
-assertFalse(re.test('Z'));
-
 re = /[\s]/;
 assertFalse(re.test('-'));
 assertFalse(re.test(':'));
@@ -647,3 +609,47 @@
 assertEquals(["bc"], re.exec("zimzomzumbc"));
 assertFalse(re.test("c"));
 assertFalse(re.test(""));
+
+
+function testInvalidRange(str) {
+  try {
+    RegExp(str).test("x");
+  } catch (e) {
+    return;
+  }
+  assetUnreachable("Allowed invalid range in " + str);
+}
+
+function testValidRange(str) {
+  try {
+    RegExp(str).test("x");
+  } catch (e) {
+    assertUnreachable("Shouldn't fail parsing: " + str + ", was: " + e);
+  }
+}
+
+testInvalidRange("[\\d-z]");
+testInvalidRange("[z-\\d]");
+testInvalidRange("[\\d-\\d]");
+testInvalidRange("[z-x]");  // Larger value first.
+testInvalidRange("[x-\\d-\\d]");
+
+testValidRange("[x-z]");
+testValidRange("[!--\d]");  // Second "-" is end of range.
+testValidRange("[\d-]");
+testValidRange("[-\d]");
+testValidRange("[-\d-]");
+testValidRange("[^-\d-]");
+testValidRange("[^-\d-]");
+testValidRange("[0-9-\w]");
+
+// Escaped dashes do not count as range operators.
+testValidRange("[\\d\\-z]");
+testValidRange("[z\\-\\d]");
+testValidRange("[\\d\\-\\d]");
+testValidRange("[z\\-x]");
+testValidRange("[x\\-\\d\\-\\d]");
+
+
+
+
=======================================
--- /branches/bleeding_edge/test/mjsunit/third_party/regexp-pcre.js Tue Sep 15 04:51:40 2009 +++ /branches/bleeding_edge/test/mjsunit/third_party/regexp-pcre.js Thu Dec 2 00:02:37 2010
@@ -962,7 +962,7 @@
 res[883] = /[a\-z]+/;
 res[884] = /[a-z]+/;
 res[885] = /[\d-]+/;
-res[886] = /[\d-z]+/;
+// res[886] - Disabled after making [\d-z] invalid to be compatible with JSC.
 res[887] = /\x5c/;
 res[888] = /\x20Z/;
 res[889] = /ab{3cd/;
@@ -1346,7 +1346,7 @@
 res[1267] = /(Z()|A)*/;
 res[1268] = /(Z(())|A)*/;
 res[1269] = /a*/g;
-res[1270] = /^[\d-a]/;
+// res[1270] disabled after making /^[\d-a]/ invalid to be compatible with JSC.
 res[1271] = /[[:space:]]+/;
 res[1272] = /[[:blank:]]+/;
 res[1273] = /[\s]+/;
@@ -2530,7 +2530,7 @@
 assertEquals(null, res[431].exec("a\x85b", 883));
 assertThrows("var re = /(?-+a)/;", 884);
 assertEquals(null, res[443].exec("aaaa", 885));
-assertEquals(null, res[443].exec("bacxxx", 886));
+// assertEquals(null, res[443].exec("bacxxx", 886));
 assertEquals(null, res[443].exec("bbaccxxx ", 887));
 assertEquals(null, res[443].exec("bbbacccxx", 888));
 assertEquals(null, res[443].exec("aaaa", 889));
@@ -4391,9 +4391,10 @@
 assertEquals("12-34", res[885].exec("12-34"), 2744);
 assertEquals(null, res[885].exec("*** Failers", 2745));
 assertEquals(null, res[885].exec("aaa", 2746));
-assertEquals("12-34z", res[886].exec("12-34z"), 2747);
-assertEquals(null, res[886].exec("*** Failers", 2748));
-assertEquals(null, res[886].exec("aaa", 2749));
+// Disabled. To be compatible with JSC, the regexp is no longer valid.
+// assertEquals("12-34z", res[886].exec("12-34z"), 2747);
+// assertEquals(null, res[886].exec("*** Failers", 2748));
+// assertEquals(null, res[886].exec("aaa", 2749));
 assertEquals("\\", res[887].exec("\\\\"), 2750);
 assertEquals(" Z", res[888].exec("the Zoo"), 2751);
 assertEquals(null, res[888].exec("*** Failers", 2752));
@@ -5355,11 +5356,12 @@
 assertEquals("", res[1269].exec("0digit"), 3708);
 assertEquals("", res[1269].exec("*** Failers"), 3709);
 assertEquals("", res[1269].exec("bcdef    "), 3710);
-assertEquals("a", res[1270].exec("abcde"), 3711);
-assertEquals("-", res[1270].exec("-things"), 3712);
-assertEquals("0", res[1270].exec("0digit"), 3713);
-assertEquals(null, res[1270].exec("*** Failers", 3714));
-assertEquals(null, res[1270].exec("bcdef    ", 3715));
+// Disabled. To be compatible with JSC, the RegExp is no longer valid.
+// assertEquals("a", res[1270].exec("abcde"), 3711);
+// assertEquals("-", res[1270].exec("-things"), 3712);
+// assertEquals("0", res[1270].exec("0digit"), 3713);
+// assertEquals(null, res[1270].exec("*** Failers", 3714));
+// assertEquals(null, res[1270].exec("bcdef    ", 3715));
 assertEquals(null, res[1271].exec("> \x09\n\x0c\x0d\x0b<", 3716));
 assertEquals(null, res[1271].exec(" ", 3717));
 assertEquals(null, res[1272].exec("> \x09\n\x0c\x0d\x0b<", 3718));
=======================================
--- /branches/bleeding_edge/test/mozilla/mozilla.status Tue Aug 31 00:31:25 2010 +++ /branches/bleeding_edge/test/mozilla/mozilla.status Thu Dec 2 00:02:37 2010
@@ -288,6 +288,10 @@
 js1_2/regexp/beginLine: FAIL_OK
 js1_2/regexp/endLine: FAIL_OK

+# To be compatible with JSC, we no longer accept [\d-x], [x-\d] or
+# [\d-\d] as valid ranges.
+ecma_3/RegExp/regress-375715-02: FAIL
+js1_5/extensions/regress-351463-01: FAIL

 # To be compatible with safari typeof a regexp yields 'function';
 # in firefox it yields 'object'.

--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

Reply via email to